
Commit 3b27bed

Merge pull request #2298 from AI-Hypercomputer:ckpt_conversion
PiperOrigin-RevId: 805869447
2 parents 3fd9e49 + 3f91bde commit 3b27bed

6 files changed: +74 −56 lines changed

end_to_end/tpu/gemma2/2b/test_gemma2_to_hf.sh

Lines changed: 10 additions & 7 deletions

@@ -1,17 +1,20 @@
 #!/bin/bash
 
-# This file is both an integration test that runs once a day on a v4-8 and documentation for how to get started with Qwen3-4B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Gemma2-2B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model
+# The flow of this script is as follows:
+# 1. Convert a MaxText checkpoint to a Hugging Face model checkpoint.
+# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='gemma2-2b'
 export MODEL_VARIATION='2b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_REPO_ROOT:-$PWD}/assets}"'/tokenizer.gemma'
 
 # Installing torch for deps in forward_pass_logit_checker.py
@@ -33,7 +36,7 @@ python3 -m MaxText.utils.ckpt_conversion.to_huggingface "${MAXTEXT_PKG_DIR:-${MA
 hf_access_token=${HF_TOKEN} \
 load_parameters_path=${CKPT_PATH} \
 base_output_directory=${LOCAL_PATH} \
-scan_layers=false
+scan_layers=false
 
 # Alternatively, if uploaded the converted ckpt, HF requires local storage of model
 # mkdir -p "${LOCAL_PATH}"
@@ -48,4 +51,4 @@ python3 -m tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_R
 scan_layers=false \
 --hf_model_path=${LOCAL_PATH} \
 --max_kl_div=0.015 \
---run_hf_model=true
+--run_hf_model=true
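All six scripts in this commit drop the in-file HF_TOKEN='' assignment in favor of an exported environment variable, which keeps credentials out of the checked-in scripts. A minimal setup sketch, assuming a read-scoped token created at https://huggingface.co/settings/tokens (the token value below is a placeholder, not a real token):

# Export a read-scoped Hugging Face token once per shell session (placeholder value).
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
# Then run any of the updated test scripts, e.g.:
bash end_to_end/tpu/gemma2/2b/test_gemma2_to_hf.sh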

end_to_end/tpu/gemma2/2b/test_gemma2_to_mt.sh

Lines changed: 10 additions & 7 deletions

@@ -1,19 +1,22 @@
 #!/bin/bash
 
-# This file contains an end-to-end Airflow nightly test, designed to run once a day on a v4-8, along with documentation to guide users in getting started with Gemma2-2B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Gemma2-2B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass logits check to compare with the original HF golden model
-# 2. Run decoding, finetuning of Gemma2-2B. with the converted checkpoint.
-# 3. Run decoding from the finetuned checkpoint from step 2
+# The flow of this script is as follows:
+# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText.
+# 2. Run a forward pass logits check to compare with the original HF golden model.
+# 3. Run decoding, finetuning of Gemma2-2B. with the converted checkpoint.
+# 4. Run decoding from the finetuned checkpoint from step 3.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='gemma2-2b'
 export MODEL_VARIATION='2b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 HF_GOLDEN_MODEL='google/gemma-2-2b'
 TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_REPO_ROOT:-$PWD}/assets}"'/tokenizer.gemma'
 
end_to_end/tpu/gemma3/4b/test_gemma3_to_hf.sh

Lines changed: 11 additions & 8 deletions

@@ -1,19 +1,22 @@
 #!/bin/bash
 
-# This file is both an integration test that runs once a day on a v4-8 and documentation for how to get started with Qwen3-4B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Gemma3-4B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model
+# The flow of this script is as follows:
+# 1. Convert a MaxText checkpoint to a Hugging Face model checkpoint.
+# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='gemma3-4b'
 export MODEL_VARIATION='4b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_REPO_ROOT:-$PWD}/assets}"'/tokenizer.gemma3'
 # To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=true
+USE_MULTIMODAL=false
 
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
@@ -35,7 +38,7 @@ python3 -m MaxText.utils.ckpt_conversion.to_huggingface "${MAXTEXT_PKG_DIR:-${MA
 load_parameters_path=${CKPT_PATH} \
 base_output_directory=${LOCAL_PATH} \
 use_multimodal=${USE_MULTIMODAL} \
-scan_layers=false
+scan_layers=false
 
 # Alternatively, if uploaded the converted ckpt, HF requires local storage of model
 # mkdir -p "${LOCAL_PATH}"
@@ -51,4 +54,4 @@ python3 -m tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_R
 scan_layers=false \
 --hf_model_path=${LOCAL_PATH} \
 --max_kl_div=0.015 \
---run_hf_model=true
+--run_hf_model=true
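The to_hf scripts note that the logit checker loads the converted model from local storage. A hypothetical sketch of pulling a previously uploaded checkpoint back down from GCS, assuming gcloud is installed and GCS_CKPT_DIR is a placeholder for wherever to_huggingface wrote its output:

# Copy a converted checkpoint from GCS to the local path the checker reads.
mkdir -p "${LOCAL_PATH}"
gcloud storage cp -r "${GCS_CKPT_DIR}/*" "${LOCAL_PATH}/"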

end_to_end/tpu/gemma3/4b/test_gemma3_to_mt.sh

Lines changed: 17 additions & 14 deletions

@@ -1,23 +1,26 @@
 #!/bin/bash
 
-# This file contains an end-to-end Airflow nightly test, designed to run once a day on a v4-8, along with documentation to guide users in getting started with Gemma3-4B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Gemma3-4B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass logits check to compare with the original HF golden model
-# 2. Run decoding, finetuning of Gemma3-4B. with the converted checkpoint.
-# 3. Run decoding from the finetuned checkpoint from step 2
+# The flow of this script is as follows:
+# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText.
+# 2. Run a forward pass logits check to compare with the original HF golden model.
+# 3. Run decoding, finetuning of Gemma3-4B. with the converted checkpoint.
+# 4. Run decoding from the finetuned checkpoint from step 3.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='gemma3-4b'
 export MODEL_VARIATION='4b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 HF_GOLDEN_MODEL='google/gemma-3-4b-it'
 TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_REPO_ROOT:-$PWD}/assets}"'/tokenizer.gemma3'
 # To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=true
+USE_MULTIMODAL=false
 
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
@@ -33,18 +36,18 @@ python3 -m MaxText.utils.ckpt_conversion.to_maxtext "${MAXTEXT_PKG_DIR:-${MAXTEX
 hf_access_token=${HF_TOKEN} \
 base_output_directory=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx} \
 use_multimodal=${USE_MULTIMODAL} \
-scan_layers=false
+scan_layers=false
 
 export UNSCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx}/0/items
 
-# # To get scanned ckpt, flip the scan_layers.
+# # To get scanned ckpt, flip the scan_layers.
 # ToDo: gemma3 multimodal scanned ckpt
 # python3 -m MaxText.utils.ckpt_conversion.to_maxtext src/MaxText/configs/base.yml \
 # model_name=${MODEL_NAME} \
 # hf_access_token=${HF_TOKEN} \
 # base_output_directory=${MODEL_BUCKET}/${MODEL_VARIATION}/scanned/${idx} \
 # use_multimodal=${USE_MULTIMODAL} \
-# scan_layers=true
+# scan_layers=true
 
 # export SCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/scanned/${idx}/0/items
 
@@ -53,14 +56,14 @@ export UNSCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx}/0
 
 # ToDo: improve forward_pass_logit_checker to test multi-modal prompt
 python3 -m tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml \
-tokenizer_path==${TOKENIZER_PATH} \
+tokenizer_path=${TOKENIZER_PATH} \
 load_parameters_path=${UNSCANNED_CKPT_PATH} \
 model_name=${MODEL_NAME} \
 use_multimodal=${USE_MULTIMODAL} \
 scan_layers=false \
 --hf_model_path=${HF_GOLDEN_MODEL} \
 --max_kl_div=0.015 \
---run_hf_model=true
+--run_hf_model=true
 
 # We can run decoding for unscanned checkpoints.
 if [ ${USE_MULTIMODAL} == true ]; then
@@ -84,4 +87,4 @@ if [ ${USE_MULTIMODAL} == true ]; then
 python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml model_name=${MODEL_NAME} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${FINETUNE_RUN_NAME}/checkpoints/0/items per_device_batch_size=1 run_name=ht_test max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false scan_layers=false use_multimodal=${USE_MULTIMODAL} prompt=\'Describe\ image\ \<start_of_image\>\' image_path=\'src/MaxText/test_assets/test_image.jpg\' attention=\'dot_product\'
 else
 python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml model_name=${MODEL_NAME} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${FINETUNE_RUN_NAME}/checkpoints/0/items per_device_batch_size=1 run_name=ht_test max_prefill_predict_length=8 max_target_length=16 steps=1 async_checkpointing=false scan_layers=false prompt='I love to' attention=\'dot_product\'
-fi
+fi
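The tokenizer_path== fix above is easy to miss but real: in key=value style overrides, a doubled equals sign leaves a stray '=' at the front of the value. A minimal sketch of the failure mode, assuming the config parser splits each override on the first '=':

# Demonstrates why 'key==value' corrupts the value in key=value style overrides.
arg='tokenizer_path==assets/tokenizer.gemma3'
key="${arg%%=*}"    # -> tokenizer_path
value="${arg#*=}"   # -> =assets/tokenizer.gemma3 (leading '=' makes the path invalid)
echo "key=${key} value=${value}"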

end_to_end/tpu/qwen3/4b/test_qwen3_to_hf.sh

Lines changed: 10 additions & 7 deletions

@@ -1,17 +1,20 @@
 #!/bin/bash
 
-# This file is both an integration test that runs once a day on a v4-8 and documentation for how to get started with Qwen3-4B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Qwen3-4B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model
+# The flow of this script is as follows:
+# 1. Convert a MaxText checkpoint to a Hugging Face model checkpoint.
+# 2. Run a forward pass check to compare the logits and KL divergence between the converted ckpt and orginal golden HF model.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='qwen3-4b'
 export MODEL_VARIATION='4b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 
 # Installing torch for deps in forward_pass_logit_checker.py
 python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
@@ -32,7 +35,7 @@ python3 -m MaxText.utils.ckpt_conversion.to_huggingface "${MAXTEXT_PKG_DIR:-${MA
 hf_access_token=${HF_TOKEN} \
 load_parameters_path=${CKPT_PATH} \
 base_output_directory=${LOCAL_PATH} \
-scan_layers=false
+scan_layers=false
 
 # Alternatively, if uploaded the converted ckpt, HF requires local storage of model
 # mkdir -p "${LOCAL_PATH}"
@@ -47,4 +50,4 @@ python3 -m tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_R
 scan_layers=false \
 --hf_model_path=${LOCAL_PATH} \
 --max_kl_div=0.015 \
---run_hf_model=true
+--run_hf_model=true
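In each forward pass check, --max_kl_div=0.015 is the tolerance on how far the converted checkpoint's next-token distribution may drift from the golden HF model's. For softmaxed logits p (golden) and q (converted) over vocabulary V, the quantity being bounded is presumably the standard divergence

D_{\mathrm{KL}}(p \,\|\, q) = \sum_{v \in V} p_v \log \frac{p_v}{q_v}

evaluated per token position, with the check failing if it exceeds 0.015 at any checked position.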
end_to_end/tpu/qwen3/4b/test_qwen3_to_mt.sh

Lines changed: 16 additions & 13 deletions

@@ -1,18 +1,21 @@
 #!/bin/bash
 
-# This file contains an end-to-end Airflow nightly test, designed to run once a day on a v4-8, along with documentation to guide users in getting started with Gemma2-2B.
+# This script is both an end-to-end test that runs once a day on a v4-8 and documentation for how to get started with Qwen3-4B.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run a forward pass logits check to compare with the original HF golden model
-# 2. Run decoding, finetuning of Qwen3-4B. with the converted checkpoint.
-# 3. Run decoding from the finetuned checkpoint from step 2
+# The flow of this script is as follows:
+# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText.
+# 2. Run a forward pass logits check to compare with the original HF golden model.
+# 3. Run decoding, finetuning of Qwen3-4B. with the converted checkpoint.
+# 4. Run decoding from the finetuned checkpoint from step 3.
+
+# Pre-requisites:
+# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
+# export HF_TOKEN=<Hugging Face access token>
 
 set -ex
 idx=$(date +%Y-%m-%d-%H-%M)
 MODEL_NAME='qwen3-4b'
 export MODEL_VARIATION='4b'
-HF_TOKEN='' # Important!!! Save your hf access token here
 HF_GOLDEN_MODEL='Qwen/Qwen3-4B'
 TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_REPO_ROOT:-$PWD}/assets}"'/qwen3-tokenizer'
 
@@ -29,23 +32,23 @@ python -m MaxText.utils.ckpt_conversion.to_maxtext "${MAXTEXT_PKG_DIR:-${MAXTEXT
 model_name=${MODEL_NAME} \
 hf_access_token=${HF_TOKEN} \
 base_output_directory=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx} \
-scan_layers=false
+scan_layers=false
 
 export UNSCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx}/0/items
 
 # We also test whether the forward pass logits match the original HF model
 # to get higher precision (eg. float32) run on CPU with `JAX_PLATFORMS=cpu`
 python3 -m tests.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml \
-tokenizer_path==${TOKENIZER_PATH}\
+tokenizer_path=${TOKENIZER_PATH}\
 load_parameters_path=${UNSCANNED_CKPT_PATH} \
 model_name=${MODEL_NAME} \
 scan_layers=false \
 --hf_model_path=${HF_GOLDEN_MODEL} \
 --max_kl_div=0.015 \
---run_hf_model=True
+--run_hf_model=True
 
 # We can run decoding for unscanned checkpoints.
-python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml tokenizer_path==${TOKENIZER_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} prompt="I love to"
+python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} prompt="I love to"
 
 # # Non-Googlers please remember to point `DATASET_PATH` to the GCS bucket where you have your training data
 export DATASET_PATH=gs://maxtext-dataset
@@ -55,7 +58,7 @@ export BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs
 # We can also run finetuning by using the scanned converted checkpoint.
 # Note that scanned checkpoint helps with efficient finetuning
 export FINETUNE_RUN_NAME=runner_finetune_${idx}
-python3 -m MaxText.train "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path==${TOKENIZER_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=${FINETUNE_RUN_NAME} max_target_length=8192 steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} checkpoint_period=5
+python3 -m MaxText.train "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=${FINETUNE_RUN_NAME} max_target_length=8192 steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} checkpoint_period=5
 
 # Now, run decoding on the checkpoint generated from our finetune run.
-python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml tokenizer_path==${TOKENIZER_PATH} load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${FINETUNE_RUN_NAME}/checkpoints/0/items per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} prompt="I love to"
+python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${FINETUNE_RUN_NAME}/checkpoints/0/items per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=${MODEL_NAME} prompt="I love to"
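The finetuning command above loads the unscanned checkpoint even though its comment recommends a scanned one for efficiency. A sketch of producing the scanned variant, mirroring the commented-out block in test_gemma3_to_mt.sh (same placeholder buckets, minus the multimodal flag, which qwen3-4b does not use):

python3 -m MaxText.utils.ckpt_conversion.to_maxtext "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml \
  model_name=${MODEL_NAME} \
  hf_access_token=${HF_TOKEN} \
  base_output_directory=${MODEL_BUCKET}/${MODEL_VARIATION}/scanned/${idx} \
  scan_layers=true
export SCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/scanned/${idx}/0/items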
