aws · tengomucho · Sep 19, 2025 · Sep 23, 2025 · Sep 24, 2025 · Sep 26, 2025
@@ -1,6 +1,6 @@
 [dev]
 # Set to "huggingface", for example, if you are a huggingface developer. Default is ""
-partner_developer = ""
+partner_developer = "huggingface"
 # Please only set it to true if you are preparing an EI related PR
 # Do remember to revert it back to false before merging any PR (including EI dedicated PR)
 ei_mode = false
@@ -9,7 +9,7 @@ ei_mode = false
 neuron_mode = false
 # Please only set it to true if you are preparing a NEURONX related PR
 # Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR)
-neuronx_mode = false
+neuronx_mode = true
 # Please only set it to true if you are preparing a GRAVITON related PR
 # Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR)
 graviton_mode = false
@@ -37,12 +37,12 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["huggingface_pytorch"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"

@@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION 2.1.2
-short_version: &SHORT_VERSION "2.1"
+version: &VERSION 2.7.0
+short_version: &SHORT_VERSION "2.7"
 contributor: huggingface
 arch_type: x86
 
@@ -30,13 +30,13 @@ images:
   BuildNeuronHFPytorchPy310TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_INF_TRAINING_PY3 false
-    image_size_baseline: 20000
+    image_size_baseline: 28000
     device_type: &DEVICE_TYPE neuronx
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py310
-    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.20.0
-    os_version: &OS_VERSION ubuntu20.04
-    transformers_version: &TRANSFORMERS_VERSION 4.48.1
+    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.24.1
+    os_version: &OS_VERSION ubuntu22.04
+    transformers_version: &TRANSFORMERS_VERSION 4.51.0
     datasets_version: &DATASETS_VERSION 2.18.0
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION,"-", *NEURON_SDK_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., *DEVICE_TYPE ]

@@ -0,0 +1,77 @@
+# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
+# refer to the above page to pull latest PyTorch Neuronx image
+
+# docker image region us-west-2
+FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.24.1-ubuntu22.04
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="2"
+
+# Version args
+ARG OPTIMUM_NEURON_VERSION=0.3.0
+ARG TRANSFORMERS_VERSION=4.51.0
+ARG DATASETS_VERSION=4.1.0
+ARG GEVENT_VERSION=24.10.3
+ARG PYTHON=python3
+
+RUN apt-get remove -y --purge emacs && \
+apt-get autoremove -y
+
+RUN pip install --upgrade pip
+
+# We need to set this environment variable to avoid the following error when building KenLM:
+# https://github.com/kpu/kenlm/issues/462
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Install Hugging Face libraries and their dependencies
+# Install optimum-neuron with this extra starting from next release. \
+# "optimum-neuron[training]"==${OPTIMUM_NEURON_VERSION} \
+RUN pip install --no-cache-dir \
+	"sagemaker==2.232.2" \
+	evaluate \
+	transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+	datasets==${DATASETS_VERSION} \
+    optimum-neuron[training]==${OPTIMUM_NEURON_VERSION} \
+	gevent==${GEVENT_VERSION}
+
+# Pin numpy to version required by neuronx-cc
+# Update Pillow, urllib3, wandb versions to fix high and critical vulnerabilities
+# neuronx-cc has requirement networkx~=2.6
+RUN pip install -U \
+	"tensorboard>=2.11.0" \
+	"numpy>=1.24.3,<=1.25.2" \
+	"numba" \
+	"Pillow==10.3.0" \
+	"requests<2.32.0" \
+        wandb \
+        pytorch-lightning \
+	Jinja2 \
+	mlflow \
+	tornado \
+	"awscli<2" \
+	boto3 \
+	botocore \
+	google-auth \
+	"urllib3>=1.26.17,<1.27" \
+	"networkx==2.6.3" \
+	bokeh \
+	torchvision==0.22.0 \
+    "opencv-python<4.12.0"
+
+# Install the system packages listed below, upgrade apparmor, and clean the apt cache.
+# `apt-get` is used for every step: `apt` does not guarantee a stable CLI for scripts.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+	git-lfs \
+	libgssapi-krb5-2 \
+	libexpat1 \
+	expat \
+	libarchive13 \
+	libgstreamer1.0-0 \
+	libgstreamer-plugins-base1.0-0 \
+ && apt-get upgrade -y apparmor \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/* \
+ # The pytorch-training-neuronx base image comes with unneeded files for setting up apex
+ # In order to pass the sanity test in the deep-learning-containers workflow, we will remove it here
+ && rm -rf /root/apex_setup.py
+
+ENV WANDB_MODE=disabled
@@ -0,0 +1,17 @@
+{
+    "76839": "[pkg: gevent] [installed: 24.10.3]",
+    "79077": "[pkg: h2] [installed: 4.2.0]",
+    "71691": "[pkg: mlflow] [installed: 3.4.0]",
+    "77740": "[pkg: protobuf] [installed: 3.20.3]",
+    "78558": "[pkg: regex] [installed: 2024.11.6]",
+    "77680": "[pkg: requests] [installed: 2.31.0]",
+    "71064": "[pkg: requests] [installed: 2.31.0]",
+    "77986": "[pkg: transformers] [installed: 4.51.0]",
+    "78153": "[pkg: transformers] [installed: 4.51.0]",
+    "79596": "[pkg: transformers] [installed: 4.51.0]",
+    "79595": "[pkg: transformers] [installed: 4.51.0]",
+    "79855": "[pkg: transformers] [installed: 4.51.0]",
+    "78688": "[pkg: transformers] [installed: 4.51.0]",
+    "77744": "[pkg: urllib3] [installed: 1.26.20]",
+    "78828": "[pkg: torch] [installed: 2.7.0]"
+}
@@ -0,0 +1,12 @@
+# telemetry.sh
+#!/bin/bash
+if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then
+    (
+        python /usr/local/bin/deep_learning_container.py \
+            --framework "huggingface_pytorch" \
+            --framework-version "2.7.0" \
+            --container-type "training" \
+            &>/dev/null &
+    )
+fi
+
@@ -29,16 +29,18 @@
 
 # hyperparameters, which are passed into the training job
 hyperparameters = {
-    "model_name_or_path": "hf-internal-testing/tiny-random-BertModel",
-    "dataset_name": "imdb",
+    "model_id": "Qwen/Qwen3-8B",
+    "tensor_parallel_size": 8,
+    "gradient_accumulation_steps": 8,
+    "zero_1": True,
     "do_train": True,
     "bf16": True,
-    "max_seq_length": 128,
-    "max_train_samples": 10,
-    "per_device_train_batch_size": 4,
+    "max_steps": 1,
+    "per_device_train_batch_size": 1,
     "num_train_epochs": 1,
     "logging_steps": 1,
     "output_dir": "/opt/ml/model",
+    "report_to": "none",
 }
 
 
@@ -110,6 +112,7 @@ def test_neuronx_text_classification(ecr_image, sagemaker_regions, py_version, i
         "instance_count": 1,
         "num_neuron_cores": 2,
     }
+
     invoke_neuron_helper_function(
         ecr_image, sagemaker_regions, _test_neuronx_text_classification_function, function_args
     )
@@ -119,13 +122,13 @@ def _test_neuronx_text_classification_function(
     ecr_image,
     sagemaker_session,
     py_version,
-    instance_type="ml.trn1.2xlarge",
+    instance_type="ml.trn1.32xlarge",
     instance_count=1,
     num_neuron_cores=2,
 ):
     pytorch_version = get_pytorch_version(ecr_image)
-    if pytorch_version in SpecifierSet("==2.1.*"):
-        optimum_neuron_version = "0.0.24"
+    if pytorch_version in SpecifierSet("==2.7.*"):
+        optimum_neuron_version = "0.3.0"
     else:
         raise ValueError(
             f"`optimum_neuron_version` to be set for PyTorch version {pytorch_version}."
@@ -136,12 +139,12 @@ def _test_neuronx_text_classification_function(
         "branch": "v" + optimum_neuron_version,
     }
 
-    source_dir = "./examples/text-classification"
+    source_dir = "./examples/training/qwen3"
 
     role = get_execution_role()
     with timeout(minutes=DEFAULT_TIMEOUT):
         estimator = HuggingFace(
-            entry_point="run_glue.py",
+            entry_point="finetune_qwen3.py",
             source_dir=source_dir,
             git_config=git_config,
             role="SageMakerRole",
@@ -150,7 +153,7 @@ def _test_neuronx_text_classification_function(
             instance_type=instance_type,
             sagemaker_session=sagemaker_session,
             py_version=py_version,
-            # distribution=distribution,  # Uncomment when it is enabled by HuggingFace Estimator
+            distribution=distribution,
             hyperparameters=hyperparameters,
         )
-        estimator.fit(job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-glue-neuronx"))
+        estimator.fit(job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-qwen3-neuronx"))
@@ -57,7 +57,9 @@ def is_test_job_efa_dedicated():
 def assign_sagemaker_remote_job_instance_type(image):
     if "graviton" in image or "arm64" in image:
         return "ml.c6g.2xlarge"
-    elif "neuronx" in image or "training-neuron" in image:
+    elif "training-neuronx" in image:
+        return "ml.trn1.32xlarge"
+    elif "neuronx" in image:
         return "ml.trn1.2xlarge"
     elif "inference-neuron" in image:
         return "ml.inf1.xlarge"