Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
abf24d8
chore: update HuggingFace Neuron training container
tengomucho Sep 19, 2025
e75107e
test(neuronx): update training test
tengomucho Sep 23, 2025
9fdecdd
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Sep 24, 2025
51ae3fd
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Sep 26, 2025
2d2f77b
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Sep 29, 2025
be67daa
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Sep 29, 2025
887dc16
update sdk version, dev toml, tests
ahsan-z-khan Oct 1, 2025
f25c8ec
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Oct 2, 2025
64854ad
chore: update HuggingFace Neuron training container based on SDK one
tengomucho Oct 3, 2025
7cc49e1
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Oct 3, 2025
e72d0cd
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Oct 8, 2025
4737acd
Merge branch 'master' into update-hf-pt2.7-sdk2.24-trn
ahsan-z-khan Oct 8, 2025
a77e0e5
merge two pip install
ahsan-z-khan Oct 8, 2025
19830ff
Create Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 8, 2025
de3464d
Update Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 8, 2025
818817e
add bokeh
ahsan-z-khan Oct 8, 2025
25e18ff
change base image to training
ahsan-z-khan Oct 8, 2025
63bfd04
Update Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 8, 2025
7b6765a
Update Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 9, 2025
6b710eb
Update test_pre_release.py
arjraman Oct 9, 2025
3f0ee0c
Update test_pre_release.py
arjraman Oct 9, 2025
37c9e08
Update Dockerfile.neuronx
arjraman Oct 9, 2025
e3cd8e2
Update Dockerfile.neuronx
arjraman Oct 9, 2025
08f9a51
Update Dockerfile.neuronx
arjraman Oct 9, 2025
671683a
Update Dockerfile.neuronx
arjraman Oct 9, 2025
fca4a28
Update Dockerfile.neuronx
arjraman Oct 9, 2025
b194622
Update Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 9, 2025
22056df
Update Dockerfile.neuronx
arjraman Oct 9, 2025
b7834de
Update Dockerfile.neuronx
arjraman Oct 9, 2025
b5badd6
Update Dockerfile.neuronx.py_scan_allowlist.json
arjraman Oct 9, 2025
010a0b7
test(training): update parameters in HF neuronx test
tengomucho Oct 9, 2025
297b8ff
fix(training): update parameters in HF neuronx test
tengomucho Oct 9, 2025
ffabd56
Update sagemaker.py
arjraman Oct 9, 2025
cabea97
Update Dockerfile.neuronx
arjraman Oct 9, 2025
6776c67
Update Dockerfile.neuronx
arjraman Oct 9, 2025
61597b2
Update __init__.py
arjraman Oct 9, 2025
fd53d95
Revert "Update __init__.py"
arjraman Oct 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
Expand All @@ -9,7 +9,7 @@ ei_mode = false
neuron_mode = false
# Please only set it to true if you are preparing a NEURONX related PR
# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR)
neuronx_mode = false
neuronx_mode = true
# Please only set it to true if you are preparing a GRAVITON related PR
# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR)
graviton_mode = false
Expand Down Expand Up @@ -37,12 +37,12 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["huggingface_pytorch"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
Expand Down
12 changes: 6 additions & 6 deletions huggingface/pytorch/training/buildspec-neuronx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION 2.1.2
short_version: &SHORT_VERSION "2.1"
version: &VERSION 2.7.0
short_version: &SHORT_VERSION "2.7"
contributor: huggingface
arch_type: x86

Expand All @@ -30,13 +30,13 @@ images:
BuildNeuronHFPytorchPy310TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &HUGGINGFACE_PYTORCH_INF_TRAINING_PY3 false
image_size_baseline: 20000
image_size_baseline: 28000
device_type: &DEVICE_TYPE neuronx
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py310
neuron_sdk_version: &NEURON_SDK_VERSION sdk2.20.0
os_version: &OS_VERSION ubuntu20.04
transformers_version: &TRANSFORMERS_VERSION 4.48.1
neuron_sdk_version: &NEURON_SDK_VERSION sdk2.24.1
os_version: &OS_VERSION ubuntu22.04
transformers_version: &TRANSFORMERS_VERSION 4.51.0
datasets_version: &DATASETS_VERSION 2.18.0
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION,"-", *NEURON_SDK_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., *DEVICE_TYPE ]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# refer to the above page to pull latest PyTorch Neuronx image

# docker image region us-west-2
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.24.1-ubuntu22.04

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="2"

# Version args
ARG OPTIMUM_NEURON_VERSION=0.3.0
ARG TRANSFORMERS_VERSION=4.51.0
ARG DATASETS_VERSION=4.1.0
ARG GEVENT_VERSION=24.10.3
ARG PYTHON=python3

RUN apt-get remove -y --purge emacs && \
apt-get autoremove -y

RUN pip install --upgrade pip

# We need to set this environment variable to avoid the following error when building KenLM:
# https://github.com/kpu/kenlm/issues/462
ENV CMAKE_POLICY_VERSION_MINIMUM=3.5

# Install Hugging Face libraries and its dependencies
# Install optimum-neuron with this extra starting from the next release. \
# "optimum-neuron[training]"==${OPTIMUM_NEURON_VERSION} \
RUN pip install --no-cache-dir \
"sagemaker==2.232.2" \
evaluate \
transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
datasets==${DATASETS_VERSION} \
optimum-neuron[training]==${OPTIMUM_NEURON_VERSION} \
gevent==${GEVENT_VERSION}

# Pin numpy to version required by neuronx-cc
# Update Pillow, urllib3, wandb versions to fix high and critical vulnerabilities
# neuronx-cc has requirement networkx~=2.6
RUN pip install -U \
"tensorboard>=2.11.0" \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tensorboard is also duplicate. please combine all of them

"numpy>=1.24.3,<=1.25.2" \
"numba" \
"Pillow==10.3.0" \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pillow is getting installed before too

"requests<2.32.0" \
wandb \
pytorch-lightning \
Jinja2 \
mlflow \
tornado \
"awscli<2" \
boto3 \
botocore \
google-auth \
"urllib3>=1.26.17,<1.27" \
"networkx==2.6.3" \
bokeh \
torchvision==0.22.0 \
"opencv-python<4.12.0"

RUN apt-get update \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do all APT installs in the beginning

&& apt install -y --no-install-recommends \
git-lfs \
libgssapi-krb5-2 \
libexpat1 \
expat \
libarchive13 \
libgstreamer1.0-0 \
libgstreamer-plugins-base1.0-0 \
&& apt-get upgrade -y apparmor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
# The pytorch-training-neuronx base image comes with unneeded files for setting up apex
# In order to pass the sanity test in the deep-learning-containers workflow, we will remove it here
&& rm -rf /root/apex_setup.py

ENV WANDB_MODE=disabled
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"76839": "[pkg: gevent] [installed: 24.10.3]",
"79077": "[pkg: h2] [installed: 4.2.0]",
"71691": "[pkg: mlflow] [installed: 3.4.0]",
"77740": "[pkg: protobuf] [installed: 3.20.3]",
"78558": "[pkg: regex] [installed: 2024.11.6]",
"77680": "[pkg: requests] [installed: 2.31.0]",
"71064": "[pkg: requests] [installed: 2.31.0]",
"77986": "[pkg: transformers] [installed: 4.51.0]",
"78153": "[pkg: transformers] [installed: 4.51.0]",
"79596": "[pkg: transformers] [installed: 4.51.0]",
"79595": "[pkg: transformers] [installed: 4.51.0]",
"79855": "[pkg: transformers] [installed: 4.51.0]",
"78688": "[pkg: transformers] [installed: 4.51.0]",
"77744": "[pkg: urllib3] [installed: 1.26.20]",
"78828": "[pkg: torch] [installed: 2.7.0]"
}
12 changes: 12 additions & 0 deletions huggingface/pytorch/training/telemetry.sh
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain what is this doing?

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# telemetry.sh
#
# Launches /usr/local/bin/deep_learning_container.py in the background,
# reporting the framework, framework version and container type given below.
#
# The launch is skipped when:
#   * /usr/local/bin/deep_learning_container.py does not exist, or
#   * OPT_OUT_TRACKING is set to "true" (compared case-insensitively).
#
# NOTE: the shebang must stay on the very first line; this script relies on
# bash-only syntax ([[ ... ]], ${VAR,,} and &>).
if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then
    # Run in a backgrounded subshell with all output discarded so the caller
    # is neither blocked nor shown any output from the script.
    (
        python /usr/local/bin/deep_learning_container.py \
            --framework "huggingface_pytorch" \
            --framework-version "2.7.0" \
            --container-type "training" \
            &>/dev/null &
    )
fi

Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,18 @@

# hyperparameters, which are passed into the training job
hyperparameters = {
"model_name_or_path": "hf-internal-testing/tiny-random-BertModel",
"dataset_name": "imdb",
"model_id": "Qwen/Qwen3-8B",
"tensor_parallel_size": 8,
"gradient_accumulation_steps": 8,
"zero_1": True,
"do_train": True,
"bf16": True,
"max_seq_length": 128,
"max_train_samples": 10,
"per_device_train_batch_size": 4,
"max_steps": 1,
"per_device_train_batch_size": 1,
"num_train_epochs": 1,
"logging_steps": 1,
"output_dir": "/opt/ml/model",
"report_to": "none",
}


Expand Down Expand Up @@ -110,6 +112,7 @@ def test_neuronx_text_classification(ecr_image, sagemaker_regions, py_version, i
"instance_count": 1,
"num_neuron_cores": 2,
}

invoke_neuron_helper_function(
ecr_image, sagemaker_regions, _test_neuronx_text_classification_function, function_args
)
Expand All @@ -119,13 +122,13 @@ def _test_neuronx_text_classification_function(
ecr_image,
sagemaker_session,
py_version,
instance_type="ml.trn1.2xlarge",
instance_type="ml.trn1.32xlarge",
instance_count=1,
num_neuron_cores=2,
):
pytorch_version = get_pytorch_version(ecr_image)
if pytorch_version in SpecifierSet("==2.1.*"):
optimum_neuron_version = "0.0.24"
if pytorch_version in SpecifierSet("==2.7.*"):
optimum_neuron_version = "0.3.0"
else:
raise ValueError(
f"`optimum_neuron_version` to be set for PyTorch version {pytorch_version}."
Expand All @@ -136,12 +139,12 @@ def _test_neuronx_text_classification_function(
"branch": "v" + optimum_neuron_version,
}

source_dir = "./examples/text-classification"
source_dir = "./examples/training/qwen3"

role = get_execution_role()
with timeout(minutes=DEFAULT_TIMEOUT):
estimator = HuggingFace(
entry_point="run_glue.py",
entry_point="finetune_qwen3.py",
source_dir=source_dir,
git_config=git_config,
role="SageMakerRole",
Expand All @@ -150,7 +153,7 @@ def _test_neuronx_text_classification_function(
instance_type=instance_type,
sagemaker_session=sagemaker_session,
py_version=py_version,
# distribution=distribution, # Uncomment when it is enabled by HuggingFace Estimator
distribution=distribution,
hyperparameters=hyperparameters,
)
estimator.fit(job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-glue-neuronx"))
estimator.fit(job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-qwen3-neuronx"))
4 changes: 3 additions & 1 deletion test/test_utils/sagemaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ def is_test_job_efa_dedicated():
def assign_sagemaker_remote_job_instance_type(image):
if "graviton" in image or "arm64" in image:
return "ml.c6g.2xlarge"
elif "neuronx" in image or "training-neuron" in image:
elif "training-neuronx" in image:
return "ml.trn1.32xlarge"
elif "neuronx" in image:
return "ml.trn1.2xlarge"
elif "inference-neuron" in image:
return "ml.inf1.xlarge"
Expand Down