From 3d5efb73152d47cc482f4d2ef7d904406bd8e94b Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Tue, 16 Sep 2025 21:06:08 +0400
Subject: [PATCH 01/15] SFT for Qwen

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/qwen_training.ipynb | 258 +++++++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 src/MaxText/examples/qwen_training.ipynb

diff --git a/src/MaxText/examples/qwen_training.ipynb b/src/MaxText/examples/qwen_training.ipynb
new file mode 100644
index 0000000000..6e717c08ec
--- /dev/null
+++ b/src/MaxText/examples/qwen_training.ipynb
@@ -0,0 +1,258 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Wr4OOETu8elP"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Run this if you just have this file and nothing else\n",
+        "\n",
+        "# 1. Clone the MaxText repository (from AI‑Hypercomputer)\n",
+        "!git clone https://github.com/AI-Hypercomputer/maxtext.git\n",
+        "\n",
+        "# 2. Navigate into the cloned directory\n",
+        "%cd maxtext"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5KPyOE8e9WbO"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Do not run this if you already installed the dependencies\n",
+        "\n",
+        "# 3. Ensure setup.sh is executable\n",
+        "!chmod +x setup.sh\n",
+        "\n",
+        "# 4. Execute the setup script\n",
+        "!./setup.sh\n",
+        "\n",
+        "# force numpy version\n",
+        "!pip install --force-reinstall numpy==2.1.2\n",
+        "#install nest_asyncio\n",
+        "!pip install nest_asyncio\n",
+        "\n",
+        "import nest_asyncio\n",
+        "nest_asyncio.apply()\n",
+        "# To fix \"This event loop is already running\" error in Colab\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "INVFpRSKJ0OP"
+      },
+      "outputs": [],
+      "source": [
+        "# 🚀 Jupyter cell: Download & convert Qwen3-0.6B for MaxText\n",
+        "\n",
+        "!pip install -q huggingface_hub git-lfs\n",
+        "!git lfs install\n",
+        "\n",
+        "# 1. Download Qwen3-0.6B from Hugging Face\n",
+        "from huggingface_hub import snapshot_download\n",
+        "snapshot_download(\"Qwen/Qwen3-0.6B\", local_dir=\"/content/Qwen3-0.6B\")\n",
+        "\n",
+        "# 2. Convert HuggingFace checkpoint → MaxText format\n",
+        "!python /content/maxtext/src/MaxText/convert_qwen3_moe.py \\\n",
+        "    --base_model_path /content/Qwen3-0.6B \\\n",
+        "    --maxtext_model_path /content/qwen3_06b_maxtext_ckpt \\\n",
+        "    --model_size qwen3-0.6b\n",
+        "\n",
+        "# 3. Set the checkpoint path for training\n",
+        "MODEL_CHECKPOINT_PATH = \"/content/qwen3_06b_maxtext_ckpt\"\n",
+        "print(\"✓ MODEL_CHECKPOINT_PATH set to:\", MODEL_CHECKPOINT_PATH)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ABBg-R-OLCRT"
+      },
+      "outputs": [],
+      "source": [
+        "ls /content/maxtext/src/MaxText | grep convert\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CJnhPxUq_G6a"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import sys\n",
+        "#  Set  home directory. Change this to your home directory where maxtext is cloned\n",
+        "MAXTEXT_HOME = os.path.join(\"/content\", \"maxtext\")\n",
+        "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
+        "#set the path to the Llama3.1-8B-Instruct checkpoint you want to load, gs:// supported\n",
+        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CxzKMBQd_U5-"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "from typing import Optional, Dict, Any\n",
+        "\n",
+        "# Find MaxText directory and change working directory to it\n",
+        "current_dir = Path.cwd()\n",
+        "if current_dir.name == 'examples':\n",
+        "    # We're in the examples folder, go up one level\n",
+        "    maxtext_path = current_dir.parent.parent\n",
+        "else:\n",
+        "    # We're in the root, MaxText is a subfolder\n",
+        "    maxtext_path = Path(f'{MAXTEXT_HOME}') / 'src' / 'MaxText'\n",
+        "\n",
+        "# Change working directory to MaxText project root\n",
+        "os.chdir(maxtext_path)\n",
+        "sys.path.insert(0, str(maxtext_path))\n",
+        "\n",
+        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
+        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
+        "print(f\"✓ Added to Python path: {maxtext_path}\")\n",
+        "import jax\n",
+        "if not jax.distributed.is_initialized():\n",
+        "    jax.distributed.initialize()\n",
+        "print(f\"JAX version: {jax.__version__}\")\n",
+        "print(f\"JAX devices: {jax.devices()}\")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rKS8nVYgAbwE"
+      },
+      "outputs": [],
+      "source": [
+        "# Hugging Face Authentication Setup\n",
+        "from huggingface_hub import login\n",
+        "\n",
+        "# Set your Hugging Face token here\n",
+        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
+        "login(token=HF_TOKEN)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aR0zTWkxAs4t"
+      },
+      "outputs": [],
+      "source": [
+        "# MaxText imports\n",
+        "try:\n",
+        "    from MaxText import pyconfig\n",
+        "    from MaxText.sft.sft_trainer import train as sft_train\n",
+        "\n",
+        "    MAXTEXT_AVAILABLE = True\n",
+        "    print(\"✓ MaxText imports successful\")\n",
+        "except ImportError as e:\n",
+        "    print(f\"⚠️ MaxText not available: {e}\")\n",
+        "    MAXTEXT_AVAILABLE = False"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "In-jdp1AAwrL"
+      },
+      "outputs": [],
+      "source": [
+        "# Fixed configuration setup for Qwen-0.6B on small TPU\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    config_argv = [\n",
+        "        \"\",\n",
+        "        f\"{MAXTEXT_HOME}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
+        "        \"model_name=qwen3-0.6b\",\n",
+        "        \"steps=20\",                                     # very short run for testing\n",
+        "        \"per_device_batch_size=1\",                      # minimal to avoid OOM\n",
+        "        \"max_target_length=512\",                        # shorter context to fit memory\n",
+        "        \"learning_rate=2.0e-5\",                         # safe small LR\n",
+        "        \"eval_steps=5\",\n",
+        "        \"weight_dtype=bfloat16\",\n",
+        "        \"dtype=bfloat16\",\n",
+        "        \"hf_path=HuggingFaceH4/ultrachat_200k\",                       # HuggingFace dataset/model if needed\n",
+        "        f\"hf_access_token={HF_TOKEN}\",\n",
+        "        \"base_output_directory=/tmp/maxtext_qwen06\",\n",
+        "        \"run_name=sft_qwen0.6b_test\",\n",
+        "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
+        "        \"eval_interval=10\",\n",
+        "        \"steps=100\",\n",
+        "        \"profiler=xplane\",\n",
+        "    ]\n",
+        "\n",
+        "    # Initialize configuration using MaxText's pyconfig\n",
+        "    config = pyconfig.initialize(config_argv)\n",
+        "\n",
+        "    print(\"✓ Fixed configuration loaded:\")\n",
+        "    print(f\"  - Model: {config.model_name}\")\n",
+        "    print(f\"  - Dataset: {config.hf_path}\")\n",
+        "    print(f\"  - Steps: {config.steps}\")\n",
+        "    print(f\"  - Use SFT: {config.use_sft}\")\n",
+        "    print(f\"  - Learning Rate: {config.learning_rate}\")\n",
+        "else:\n",
+        "    print(\"MaxText not available - cannot load configuration\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "EJE1ookSAzz-"
+      },
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mgwpNgQYCJEd"
+      },
+      "outputs": [],
+      "source": [
+        "#  Execute the training using MaxText SFT trainer's train() function\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    print(\"=\"*60)\n",
+        "    print(\"EXECUTING ACTUAL TRAINING\")\n",
+        "    print(\"=\"*60)\n",
+        "\n",
+        "\n",
+        "    sft_train(config)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "TPU",
+    "colab": {
+      "gpuType": "V5E1",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

From 2cea782402833481deb97cc0f23742ddf78c47b4 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Tue, 16 Sep 2025 21:06:43 +0400
Subject: [PATCH 02/15] SFT for Qwen

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/qwen_training.ipynb | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/MaxText/examples/qwen_training.ipynb b/src/MaxText/examples/qwen_training.ipynb
index 6e717c08ec..eadd73f608 100644
--- a/src/MaxText/examples/qwen_training.ipynb
+++ b/src/MaxText/examples/qwen_training.ipynb
@@ -71,17 +71,6 @@
         "print(\"✓ MODEL_CHECKPOINT_PATH set to:\", MODEL_CHECKPOINT_PATH)\n"
       ]
     },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ABBg-R-OLCRT"
-      },
-      "outputs": [],
-      "source": [
-        "ls /content/maxtext/src/MaxText | grep convert\n"
-      ]
-    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -95,7 +84,6 @@
         "#  Set  home directory. Change this to your home directory where maxtext is cloned\n",
         "MAXTEXT_HOME = os.path.join(\"/content\", \"maxtext\")\n",
         "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
-        "#set the path to the Llama3.1-8B-Instruct checkpoint you want to load, gs:// supported\n",
         "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
       ]
     },

From e1cb7e4d43e57d7ed7e784210908c8128ed13572 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 19 Sep 2025 22:17:54 +0400
Subject: [PATCH 03/15] Refined qwen colab

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 257 ++++++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 src/MaxText/examples/sft_qwen3_demo.ipynb

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
new file mode 100644
index 0000000000..2f5da3faca
--- /dev/null
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -0,0 +1,257 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Run SFT on Qwen3-0.6B model\n",
+        "\n",
+        "This Colab notebook can run on the public TPU v5e-1.\n",
+        "\n",
+        "This notebook demonstrates how to perform Supervised Fine-Tuning (SFT) on Qwen3-0.6B using the Hugging Face ultrachat_200k dataset with Tunix integration for efficient training.\n",
+        "\n",
+        "Dataset Overview\n",
+        "https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n",
+        "\n",
+        "Dataset Information:\n",
+        "\n",
+        "Name: HuggingFaceH4/ultrachat_200k\n",
+        "Type: Supervised Fine-Tuning dataset\n",
+        "Size: ~200k conversations\n",
+        "Format: Chat conversations with human-AI pairs\n",
+        "Splits: train_sft, test_sft\n",
+        "Data columns: ['messages']\n",
+        "Dataset Structure: Each example contains a 'messages' field with:\n",
+        "\n",
+        "role: 'user' or 'assistant'\n",
+        "content: The actual message text\n",
+        "Example data format:\n",
+        "\n",
+        "    {\n",
+        "      \"messages\": [\n",
+        "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+        "        {\"role\": \"assistant\", \"content\": \"The capital of France is Paris.\"}\n",
+        "      ]\n",
+        "    }\n",
+        "\n",
+        "Prerequisites\n",
+        "HuggingFace access token for dataset download\n",
+        "Sufficient compute resources (TPU/GPU)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Wr4OOETu8elP"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Run this if you just have this file and nothing else\n",
+        "\n",
+        "# 1. Clone the MaxText repository (from AI‑Hypercomputer)\n",
+        "!git clone https://github.com/AI-Hypercomputer/maxtext.git\n",
+        "\n",
+        "# 2. Navigate into the cloned directory\n",
+        "%cd maxtext"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5KPyOE8e9WbO"
+      },
+      "outputs": [],
+      "source": [
+        "### (Optional) Do not run this if you already installed the dependencies\n",
+        "\n",
+        "# 3. Ensure setup.sh is executable\n",
+        "!chmod +x setup.sh\n",
+        "\n",
+        "# 4. Execute the setup script\n",
+        "!./setup.sh\n",
+        "\n",
+        "# force numpy version\n",
+        "!pip install --force-reinstall numpy==2.1.2\n",
+        "#install nest_asyncio\n",
+        "!pip install nest_asyncio\n",
+        "\n",
+        "import nest_asyncio\n",
+        "nest_asyncio.apply()\n",
+        "# To fix \"This event loop is already running\" error in Colab\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CJnhPxUq_G6a"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import sys\n",
+        "#  Set  home directory. Change this to your home directory where maxtext is cloned\n",
+        "MAXTEXT_HOME = os.path.join(\"/content\", \"maxtext\")\n",
+        "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
+        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CxzKMBQd_U5-"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "from typing import Optional, Dict, Any\n",
+        "\n",
+        "# Find MaxText directory and change working directory to it\n",
+        "current_dir = Path.cwd()\n",
+        "if current_dir.name == 'examples':\n",
+        "    # We're in the examples folder, go up one level\n",
+        "    maxtext_path = current_dir.parent.parent\n",
+        "else:\n",
+        "    # We're in the root, MaxText is a subfolder\n",
+        "    maxtext_path = Path(f'{MAXTEXT_HOME}') / 'src' / 'MaxText'\n",
+        "\n",
+        "# Change working directory to MaxText project root\n",
+        "os.chdir(maxtext_path)\n",
+        "sys.path.insert(0, str(maxtext_path))\n",
+        "\n",
+        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
+        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
+        "print(f\"✓ Added to Python path: {maxtext_path}\")\n",
+        "import jax\n",
+        "if not jax.distributed.is_initialized():\n",
+        "    jax.distributed.initialize()\n",
+        "print(f\"JAX version: {jax.__version__}\")\n",
+        "print(f\"JAX devices: {jax.devices()}\")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rKS8nVYgAbwE"
+      },
+      "outputs": [],
+      "source": [
+        "# Hugging Face Authentication Setup\n",
+        "from huggingface_hub import login\n",
+        "\n",
+        "# Set your Hugging Face token here\n",
+        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
+        "login(token=HF_TOKEN)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aR0zTWkxAs4t"
+      },
+      "outputs": [],
+      "source": [
+        "# MaxText imports\n",
+        "try:\n",
+        "    from MaxText import pyconfig\n",
+        "    from MaxText.sft.sft_trainer import train as sft_train\n",
+        "\n",
+        "    MAXTEXT_AVAILABLE = True\n",
+        "    print(\"✓ MaxText imports successful\")\n",
+        "except ImportError as e:\n",
+        "    print(f\"⚠️ MaxText not available: {e}\")\n",
+        "    MAXTEXT_AVAILABLE = False"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "In-jdp1AAwrL"
+      },
+      "outputs": [],
+      "source": [
+        "# Fixed configuration setup for Qwen-0.6B on small TPU\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    config_argv = [\n",
+        "        \"\",\n",
+        "        f\"{MAXTEXT_HOME}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
+        "        \"model_name=qwen3-0.6b\",\n",
+        "        \"steps=20\",                                     # very short run for testing\n",
+        "        \"per_device_batch_size=1\",                      # minimal to avoid OOM\n",
+        "        \"max_target_length=512\",                        # shorter context to fit memory\n",
+        "        \"learning_rate=2.0e-5\",                         # safe small LR\n",
+        "        \"eval_steps=5\",\n",
+        "        \"weight_dtype=bfloat16\",\n",
+        "        \"dtype=bfloat16\",\n",
+        "        \"hf_path=HuggingFaceH4/ultrachat_200k\",                       # HuggingFace dataset/model if needed\n",
+        "        f\"hf_access_token={HF_TOKEN}\",\n",
+        "        \"base_output_directory=/tmp/maxtext_qwen06\",\n",
+        "        \"run_name=sft_qwen0.6b_test\",\n",
+        "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
+        "        \"eval_interval=10\",\n",
+        "        \"steps=100\",\n",
+        "        \"profiler=xplane\",\n",
+        "    ]\n",
+        "\n",
+        "    # Initialize configuration using MaxText's pyconfig\n",
+        "    config = pyconfig.initialize(config_argv)\n",
+        "\n",
+        "    print(\"✓ Fixed configuration loaded:\")\n",
+        "    print(f\"  - Model: {config.model_name}\")\n",
+        "    print(f\"  - Dataset: {config.hf_path}\")\n",
+        "    print(f\"  - Steps: {config.steps}\")\n",
+        "    print(f\"  - Use SFT: {config.use_sft}\")\n",
+        "    print(f\"  - Learning Rate: {config.learning_rate}\")\n",
+        "else:\n",
+        "    print(\"MaxText not available - cannot load configuration\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "EJE1ookSAzz-"
+      },
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mgwpNgQYCJEd"
+      },
+      "outputs": [],
+      "source": [
+        "#  Execute the training using MaxText SFT trainer's train() function\n",
+        "if MAXTEXT_AVAILABLE:\n",
+        "    print(\"=\"*60)\n",
+        "    print(\"EXECUTING ACTUAL TRAINING\")\n",
+        "    print(\"=\"*60)\n",
+        "\n",
+        "\n",
+        "    sft_train(config)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "TPU",
+    "colab": {
+      "gpuType": "V5E1",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

From 59e8ebff4f42771092e9af70bea2787e88334e9c Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 19 Sep 2025 22:22:54 +0400
Subject: [PATCH 04/15] Del old one

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/qwen_training.ipynb | 246 -----------------------
 1 file changed, 246 deletions(-)
 delete mode 100644 src/MaxText/examples/qwen_training.ipynb

diff --git a/src/MaxText/examples/qwen_training.ipynb b/src/MaxText/examples/qwen_training.ipynb
deleted file mode 100644
index eadd73f608..0000000000
--- a/src/MaxText/examples/qwen_training.ipynb
+++ /dev/null
@@ -1,246 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Wr4OOETu8elP"
-      },
-      "outputs": [],
-      "source": [
-        "### (Optional) Run this if you just have this file and nothing else\n",
-        "\n",
-        "# 1. Clone the MaxText repository (from AI‑Hypercomputer)\n",
-        "!git clone https://github.com/AI-Hypercomputer/maxtext.git\n",
-        "\n",
-        "# 2. Navigate into the cloned directory\n",
-        "%cd maxtext"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "5KPyOE8e9WbO"
-      },
-      "outputs": [],
-      "source": [
-        "### (Optional) Do not run this if you already installed the dependencies\n",
-        "\n",
-        "# 3. Ensure setup.sh is executable\n",
-        "!chmod +x setup.sh\n",
-        "\n",
-        "# 4. Execute the setup script\n",
-        "!./setup.sh\n",
-        "\n",
-        "# force numpy version\n",
-        "!pip install --force-reinstall numpy==2.1.2\n",
-        "#install nest_asyncio\n",
-        "!pip install nest_asyncio\n",
-        "\n",
-        "import nest_asyncio\n",
-        "nest_asyncio.apply()\n",
-        "# To fix \"This event loop is already running\" error in Colab\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "INVFpRSKJ0OP"
-      },
-      "outputs": [],
-      "source": [
-        "# 🚀 Jupyter cell: Download & convert Qwen3-0.6B for MaxText\n",
-        "\n",
-        "!pip install -q huggingface_hub git-lfs\n",
-        "!git lfs install\n",
-        "\n",
-        "# 1. Download Qwen3-0.6B from Hugging Face\n",
-        "from huggingface_hub import snapshot_download\n",
-        "snapshot_download(\"Qwen/Qwen3-0.6B\", local_dir=\"/content/Qwen3-0.6B\")\n",
-        "\n",
-        "# 2. Convert HuggingFace checkpoint → MaxText format\n",
-        "!python /content/maxtext/src/MaxText/convert_qwen3_moe.py \\\n",
-        "    --base_model_path /content/Qwen3-0.6B \\\n",
-        "    --maxtext_model_path /content/qwen3_06b_maxtext_ckpt \\\n",
-        "    --model_size qwen3-0.6b\n",
-        "\n",
-        "# 3. Set the checkpoint path for training\n",
-        "MODEL_CHECKPOINT_PATH = \"/content/qwen3_06b_maxtext_ckpt\"\n",
-        "print(\"✓ MODEL_CHECKPOINT_PATH set to:\", MODEL_CHECKPOINT_PATH)\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "CJnhPxUq_G6a"
-      },
-      "outputs": [],
-      "source": [
-        "import os\n",
-        "import sys\n",
-        "#  Set  home directory. Change this to your home directory where maxtext is cloned\n",
-        "MAXTEXT_HOME = os.path.join(\"/content\", \"maxtext\")\n",
-        "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
-        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "CxzKMBQd_U5-"
-      },
-      "outputs": [],
-      "source": [
-        "from pathlib import Path\n",
-        "from typing import Optional, Dict, Any\n",
-        "\n",
-        "# Find MaxText directory and change working directory to it\n",
-        "current_dir = Path.cwd()\n",
-        "if current_dir.name == 'examples':\n",
-        "    # We're in the examples folder, go up one level\n",
-        "    maxtext_path = current_dir.parent.parent\n",
-        "else:\n",
-        "    # We're in the root, MaxText is a subfolder\n",
-        "    maxtext_path = Path(f'{MAXTEXT_HOME}') / 'src' / 'MaxText'\n",
-        "\n",
-        "# Change working directory to MaxText project root\n",
-        "os.chdir(maxtext_path)\n",
-        "sys.path.insert(0, str(maxtext_path))\n",
-        "\n",
-        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
-        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
-        "print(f\"✓ Added to Python path: {maxtext_path}\")\n",
-        "import jax\n",
-        "if not jax.distributed.is_initialized():\n",
-        "    jax.distributed.initialize()\n",
-        "print(f\"JAX version: {jax.__version__}\")\n",
-        "print(f\"JAX devices: {jax.devices()}\")\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "rKS8nVYgAbwE"
-      },
-      "outputs": [],
-      "source": [
-        "# Hugging Face Authentication Setup\n",
-        "from huggingface_hub import login\n",
-        "\n",
-        "# Set your Hugging Face token here\n",
-        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
-        "login(token=HF_TOKEN)\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "aR0zTWkxAs4t"
-      },
-      "outputs": [],
-      "source": [
-        "# MaxText imports\n",
-        "try:\n",
-        "    from MaxText import pyconfig\n",
-        "    from MaxText.sft.sft_trainer import train as sft_train\n",
-        "\n",
-        "    MAXTEXT_AVAILABLE = True\n",
-        "    print(\"✓ MaxText imports successful\")\n",
-        "except ImportError as e:\n",
-        "    print(f\"⚠️ MaxText not available: {e}\")\n",
-        "    MAXTEXT_AVAILABLE = False"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "In-jdp1AAwrL"
-      },
-      "outputs": [],
-      "source": [
-        "# Fixed configuration setup for Qwen-0.6B on small TPU\n",
-        "if MAXTEXT_AVAILABLE:\n",
-        "    config_argv = [\n",
-        "        \"\",\n",
-        "        f\"{MAXTEXT_HOME}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
-        "        \"model_name=qwen3-0.6b\",\n",
-        "        \"steps=20\",                                     # very short run for testing\n",
-        "        \"per_device_batch_size=1\",                      # minimal to avoid OOM\n",
-        "        \"max_target_length=512\",                        # shorter context to fit memory\n",
-        "        \"learning_rate=2.0e-5\",                         # safe small LR\n",
-        "        \"eval_steps=5\",\n",
-        "        \"weight_dtype=bfloat16\",\n",
-        "        \"dtype=bfloat16\",\n",
-        "        \"hf_path=HuggingFaceH4/ultrachat_200k\",                       # HuggingFace dataset/model if needed\n",
-        "        f\"hf_access_token={HF_TOKEN}\",\n",
-        "        \"base_output_directory=/tmp/maxtext_qwen06\",\n",
-        "        \"run_name=sft_qwen0.6b_test\",\n",
-        "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
-        "        \"eval_interval=10\",\n",
-        "        \"steps=100\",\n",
-        "        \"profiler=xplane\",\n",
-        "    ]\n",
-        "\n",
-        "    # Initialize configuration using MaxText's pyconfig\n",
-        "    config = pyconfig.initialize(config_argv)\n",
-        "\n",
-        "    print(\"✓ Fixed configuration loaded:\")\n",
-        "    print(f\"  - Model: {config.model_name}\")\n",
-        "    print(f\"  - Dataset: {config.hf_path}\")\n",
-        "    print(f\"  - Steps: {config.steps}\")\n",
-        "    print(f\"  - Use SFT: {config.use_sft}\")\n",
-        "    print(f\"  - Learning Rate: {config.learning_rate}\")\n",
-        "else:\n",
-        "    print(\"MaxText not available - cannot load configuration\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "EJE1ookSAzz-"
-      },
-      "source": []
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "mgwpNgQYCJEd"
-      },
-      "outputs": [],
-      "source": [
-        "#  Execute the training using MaxText SFT trainer's train() function\n",
-        "if MAXTEXT_AVAILABLE:\n",
-        "    print(\"=\"*60)\n",
-        "    print(\"EXECUTING ACTUAL TRAINING\")\n",
-        "    print(\"=\"*60)\n",
-        "\n",
-        "\n",
-        "    sft_train(config)\n"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "TPU",
-    "colab": {
-      "gpuType": "V5E1",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}

From 2742f8205a6b7a700959cc99d29863aaa10051e8 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 19 Sep 2025 22:24:29 +0400
Subject: [PATCH 05/15] Fix var name

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 83 ++++++++++++++---------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index 2f5da3faca..c3ebb82b89 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -90,12 +90,54 @@
       },
       "outputs": [],
       "source": [
+        "#Set up the variables for the script\n",
         "import os\n",
         "import sys\n",
-        "#  Set  home directory. Change this to your home directory where maxtext is cloned\n",
-        "MAXTEXT_HOME = os.path.join(\"/content\", \"maxtext\")\n",
-        "print(f\"Home directory (from Python): {MAXTEXT_HOME}\")\n",
-        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\""
+        "\n",
+        "# Set the MaxText repository root (the directory where you cloned the maxtext repo).\n",
+        "# When running in a local Jupyter environment you can use this instead:\n",
+        "# MAXTEXT_REPO_ROOT = os.path.expanduser(\"~\") + \"/maxtext\"\n",
+        "# The value below matches the Colab layout (/content/maxtext).\n",
+        "MAXTEXT_REPO_ROOT = os.path.join(\"/content\", \"maxtext\")\n",
+        "\n",
+        "print(f\"MaxText Home directory (from Python): {MAXTEXT_REPO_ROOT}\")\n",
+        "\n",
+        "DEBUG = False  # set to True to run in debug mode, for more print statements\n",
+        "#set this to the path of the checkpoint you want to load, gs:// supported \n",
+        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\" \n",
+        "# now for colab we will use the checkpoint from the HF model\n",
+        "MODEL_CHECKPOINT_PATH = f\"{MAXTEXT_REPO_ROOT}/qwen_checkpoint\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Hugging Face Authentication Setup\n",
+        "from huggingface_hub import login\n",
+        "\n",
+        "# Set your Hugging Face token here\n",
+        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
+        "login(token=HF_TOKEN)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# This is the command to convert the HF model to the MaxText format \n",
+        "# You may omit it if you already have a checkpoint\n",
+        "!python3 -m MaxText.utils.ckpt_conversion.to_maxtext \\\n",
+        "    $MAXTEXT_REPO_ROOT/src/MaxText/configs/base.yml \\\n",
+        "    model_name=qwen3-0.6b \\\n",
+        "    base_output_directory=$MODEL_CHECKPOINT_PATH \\\n",
+        "    hf_access_token=$HF_TOKEN \\\n",
+        "    use_multimodal=false \\\n",
+        "    scan_layers=false"
       ]
     },
     {
@@ -111,12 +153,8 @@
         "\n",
         "# Find MaxText directory and change working directory to it\n",
         "current_dir = Path.cwd()\n",
-        "if current_dir.name == 'examples':\n",
-        "    # We're in the examples folder, go up one level\n",
-        "    maxtext_path = current_dir.parent.parent\n",
-        "else:\n",
-        "    # We're in the root, MaxText is a subfolder\n",
-        "    maxtext_path = Path(f'{MAXTEXT_HOME}') / 'src' / 'MaxText'\n",
+        "\n",
+        "maxtext_path = Path(f'{MAXTEXT_REPO_ROOT}') / 'src' / 'MaxText'\n",
         "\n",
         "# Change working directory to MaxText project root\n",
         "os.chdir(maxtext_path)\n",
@@ -129,23 +167,7 @@
         "if not jax.distributed.is_initialized():\n",
         "    jax.distributed.initialize()\n",
         "print(f\"JAX version: {jax.__version__}\")\n",
-        "print(f\"JAX devices: {jax.devices()}\")\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "rKS8nVYgAbwE"
-      },
-      "outputs": [],
-      "source": [
-        "# Hugging Face Authentication Setup\n",
-        "from huggingface_hub import login\n",
-        "\n",
-        "# Set your Hugging Face token here\n",
-        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
-        "login(token=HF_TOKEN)\n"
+        "print(f\"JAX devices: {jax.devices()}\")"
       ]
     },
     {
@@ -180,11 +202,12 @@
         "if MAXTEXT_AVAILABLE:\n",
         "    config_argv = [\n",
         "        \"\",\n",
-        "        f\"{MAXTEXT_HOME}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
+        "        f\"{MAXTEXT_REPO_ROOT}/src/MaxText/configs/sft.yml\",   # base SFT config\n",
+        "        f\"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items/\",  # Load pre-trained weights!, replace with your checkpoint path\n",
         "        \"model_name=qwen3-0.6b\",\n",
         "        \"steps=20\",                                     # very short run for testing\n",
         "        \"per_device_batch_size=1\",                      # minimal to avoid OOM\n",
-        "        \"max_target_length=512\",                        # shorter context to fit memory\n",
+        "        \"max_target_length=1024\",                        \n",
         "        \"learning_rate=2.0e-5\",                         # safe small LR\n",
         "        \"eval_steps=5\",\n",
         "        \"weight_dtype=bfloat16\",\n",
@@ -195,7 +218,6 @@
         "        \"run_name=sft_qwen0.6b_test\",\n",
         "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
         "        \"eval_interval=10\",\n",
-        "        \"steps=100\",\n",
         "        \"profiler=xplane\",\n",
         "    ]\n",
         "\n",
@@ -233,7 +255,6 @@
         "    print(\"EXECUTING ACTUAL TRAINING\")\n",
         "    print(\"=\"*60)\n",
         "\n",
-        "\n",
         "    sft_train(config)\n"
       ]
     }

From 7e68b9a722ed895db3a46b2a43f982b63ffabc4d Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 19 Sep 2025 22:27:46 +0400
Subject: [PATCH 06/15] colab label

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index c3ebb82b89..e345e3bfcc 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -4,9 +4,16 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Run SFT on Qwen3-0.6B model\n",
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AI-Hypercomputer/maxtext/blob/main/src/MaxText/examples/sft_qwen3_demo.ipynb)\n",
         "\n",
-        "This collab can run on the public TPU 5e-1\n",
+        "# Qwen3-0.6B Supervised Fine-Tuning (SFT) Demo\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "This notebook can run on the public TPU 5e-1\n",
         "\n",
         "This notebook demonstrates how to perform Supervised Fine-Tuning (SFT) on Qwen3-0.6B using the Hugging Face ultrachat_200k dataset with Tunix integration for efficient training.\n",
         "\n",

From dce7b0dd092ae77440298bdeb683388fac300012 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 19 Sep 2025 22:29:19 +0400
Subject: [PATCH 07/15] Nicer

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 39 +++++++++++++----------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index e345e3bfcc..8e6aeba8bc 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -13,37 +13,42 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "This notebook can run on the public TPU 5e-1\n",
+        "## Overview\n",
+        "\n",
+        "This notebook can run on the **public TPU 5e-1**\n",
         "\n",
         "This notebook demonstrates how to perform Supervised Fine-Tuning (SFT) on Qwen3-0.6B using the Hugging Face ultrachat_200k dataset with Tunix integration for efficient training.\n",
         "\n",
-        "Dataset Overview\n",
-        "https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n",
+        "## Dataset Overview\n",
         "\n",
-        "Dataset Information:\n",
+        "**Dataset Link:** https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n",
         "\n",
-        "Name: HuggingFaceH4/ultrachat_200k\n",
-        "Type: Supervised Fine-Tuning dataset\n",
-        "Size: ~200k conversations\n",
-        "Format: Chat conversations with human-AI pairs\n",
-        "Splits: train_sft, test_sft\n",
-        "Data columns: ['messages']\n",
-        "Dataset Structure: Each example contains a 'messages' field with:\n",
+        "### Dataset Information:\n",
+        "- **Name:** HuggingFaceH4/ultrachat_200k\n",
+        "- **Type:** Supervised Fine-Tuning dataset\n",
+        "- **Size:** ~200k conversations\n",
+        "- **Format:** Chat conversations with human-AI pairs\n",
+        "- **Splits:** train_sft, test_sft\n",
+        "- **Data columns:** ['messages']\n",
         "\n",
-        "role: 'user' or 'assistant'\n",
-        "content: The actual message text\n",
-        "Example data format:\n",
+        "### Dataset Structure:\n",
+        "Each example contains a 'messages' field with:\n",
+        "- **role:** 'user' or 'assistant'\n",
+        "- **content:** The actual message text\n",
         "\n",
+        "### Example data format:\n",
+        "```json\n",
         "{\n",
         "  \"messages\": [\n",
         "    {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
         "    {\"role\": \"assistant\", \"content\": \"The capital of France is Paris.\"}\n",
         "  ]\n",
         "}\n",
+        "```\n",
         "\n",
-        "Prerequisites\n",
-        "HuggingFace access token for dataset download\n",
-        "Sufficient compute resources (TPU/GPU)"
+        "## Prerequisites\n",
+        "- HuggingFace access token for dataset download\n",
+        "- Sufficient compute resources (TPU/GPU)"
       ]
     },
     {

From b51d693025e65213816ba99162c8928ef8785426 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Tue, 23 Sep 2025 20:50:04 +0400
Subject: [PATCH 08/15] Making nicer

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 35 ++++++++++++++---------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index 8e6aeba8bc..40fd00d032 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -115,10 +115,15 @@
         "print(f\"MaxText Home directory (from Python): {MAXTEXT_REPO_ROOT}\")\n",
         "\n",
         "DEBUG = False  # set to True to run in debug mode, for more print statements\n",
-        "#set this to the path of the checkpoint you want to load, gs:// supported \n",
-        "#MODEL_CHECKPOINT_PATH = \"path/to/scanned/checkpoint\" \n",
-        "# now for colab we will use the checkpoint from the HF model\n",
-        "MODEL_CHECKPOINT_PATH = f\"{MAXTEXT_REPO_ROOT}/qwen_checkpoint\""
+        "\n",
+        "# Case 1: Set `MODEL_CHECKPOINT_PATH` to GCS path that already has `Qwen3-0.6B` model checkpoint\n",
+        "# Case 2: If you do not have the checkpoint, then do not update `MODEL_CHECKPOINT_PATH`\n",
+        "# and this colab will download the checkpoint from HF and store it at `{MAXTEXT_REPO_ROOT}/qwen_checkpoint`\n",
+        "MODEL_CHECKPOINT_PATH = f\"{MAXTEXT_REPO_ROOT}/qwen_checkpoint\"\n",
+        "\n",
+        "# This is the directory where the fine-tuned model will be saved\n",
+        "# You can change it to any path you want including GCS gs://...\n",
+        "BASE_OUTPUT_DIRECTORY = \"/tmp/out/maxtext_qwen06\""
       ]
     },
     {
@@ -143,13 +148,14 @@
       "source": [
         "# This is the command to convert the HF model to the MaxText format \n",
         "# You may omit it if you already have a checkpoint\n",
-        "!python3 -m MaxText.utils.ckpt_conversion.to_maxtext \\\n",
-        "    $MAXTEXT_REPO_ROOT/src/MaxText/configs/base.yml \\\n",
-        "    model_name=qwen3-0.6b \\\n",
-        "    base_output_directory=$MODEL_CHECKPOINT_PATH \\\n",
-        "    hf_access_token=$HF_TOKEN \\\n",
-        "    use_multimodal=false \\\n",
-        "    scan_layers=false"
+        "if not os.path.exists(MODEL_CHECKPOINT_PATH):\n",
+        "    !python3 -m MaxText.utils.ckpt_conversion.to_maxtext \\\n",
+        "        $MAXTEXT_REPO_ROOT/src/MaxText/configs/base.yml \\\n",
+        "        model_name=qwen3-0.6b \\\n",
+        "        base_output_directory=$MODEL_CHECKPOINT_PATH \\\n",
+        "        hf_access_token=$HF_TOKEN \\\n",
+        "        use_multimodal=false \\\n",
+        "        scan_layers=false"
       ]
     },
     {
@@ -226,7 +232,7 @@
         "        \"dtype=bfloat16\",\n",
         "        \"hf_path=HuggingFaceH4/ultrachat_200k\",                       # HuggingFace dataset/model if needed\n",
         "        f\"hf_access_token={HF_TOKEN}\",\n",
-        "        \"base_output_directory=/tmp/maxtext_qwen06\",\n",
+        "        f\"base_output_directory={BASE_OUTPUT_DIRECTORY}\",\n",
         "        \"run_name=sft_qwen0.6b_test\",\n",
         "        \"tokenizer_path=Qwen/Qwen3-0.6B\",                # Qwen tokenizer\n",
         "        \"eval_interval=10\",\n",
@@ -267,7 +273,10 @@
         "    print(\"EXECUTING ACTUAL TRAINING\")\n",
         "    print(\"=\"*60)\n",
         "\n",
-        "    sft_train(config)\n"
+        "    sft_train(config)\n",
+        "\n",
+        "print(\"Training complete!\")\n",
+        "print(\"Model saved at: \", BASE_OUTPUT_DIRECTORY)"
       ]
     }
   ],

From 5d05559db2f765abec97f9339bf680676cefbeb5 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Tue, 23 Sep 2025 21:31:29 +0400
Subject: [PATCH 09/15] Added README.md

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/README.md | 187 +++++++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 src/MaxText/examples/README.md

diff --git a/src/MaxText/examples/README.md b/src/MaxText/examples/README.md
new file mode 100644
index 0000000000..02de34d3df
--- /dev/null
+++ b/src/MaxText/examples/README.md
@@ -0,0 +1,187 @@
+# MaxText Examples - TPU Setup Guide
+
+This guide provides comprehensive instructions for setting up Jupyter Lab on TPU and connecting it to Google Colab for running MaxText examples.
+
+## 📑 Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Method 1: Google Colab with TPU (Recommended)](#method-1-google-colab-with-tpu-recommended)
+- [Method 2: Local Jupyter Lab with TPU](#method-2-local-jupyter-lab-with-tpu)
+- [Method 3: Colab + Local Jupyter Lab Hybrid](#method-3-colab--local-jupyter-lab-hybrid)
+- [Available Examples](#available-examples)
+- [Common Pitfalls & Debugging](#common-pitfalls--debugging)
+- [Support & Resources](#support--resources)
+- [Contributing](#contributing)
+
+## Prerequisites
+
+Before starting, make sure you have:
+
+- ✅ A Google Cloud Platform (GCP) account with billing enabled
+- ✅ TPU quota available in your region (check under IAM & Admin → Quotas)
+- ✅ Basic familiarity with Jupyter, Python, and Git
+- ✅ gcloud CLI installed locally if you plan to use Method 2 or 3
+- ✅ Firewall rules open for port 8888 (Jupyter) if accessing directly
+
+## Method 1: Google Colab with TPU (Recommended)
+
+This is the fastest way to run MaxText without managing infrastructure.
+
+### Step 1: Open Google Colab
+
+1. Go to [Google Colab](https://colab.research.google.com/)
+2. Sign in → New Notebook
+
+### Step 2: Enable TPU Runtime
+
+1. **Runtime** → **Change runtime type**
+2. Set **Hardware accelerator** → **TPU**
+3. Select TPU version:
+   - **v5e-8** → recommended for most MaxText examples, but it's a paid option
+   - **v5e-1** → free tier option (slower, but works for the Qwen3-0.6B demo)
+4. Click **Save**
+
+### Step 3: Upload & Prepare MaxText
+
+Upload notebooks or mount your GitHub repo
+
+> **Note:** In Colab, the repo root will usually be `/content/maxtext`
+
+**Example:**
+```python
+!git clone https://github.com/AI-Hypercomputer/maxtext.git
+%cd maxtext
+!pip install -r requirements.txt
+```
+
+### Step 4: Run Examples
+
+1. Open `src/MaxText/examples/`
+2. Try:
+   - `sft_qwen3_demo.ipynb`
+   - `sft_llama3_demo.ipynb`
+
+> ⚡ **Tip:** If Colab disconnects, re-enable TPU and re-run setup cells. Save checkpoints to GCS or Drive.
+
+## Method 2: Local Jupyter Lab with TPU
+
+This method gives you more control and is better for long training runs.
+
+### Step 1: Set Up TPU VM
+
+In Google Cloud Console:
+
+1. **Compute Engine** → **TPU** → **Create TPU Node**
+2. Example config:
+   - **Name:** `maxtext-tpu-node`
+   - **TPU type:** `v5e-8` (or `v6e-8` for newer hardware)
+   - **Runtime Version:** `tpu-ubuntu-alpha-*` (matches your VM image)
+
+### Step 2: Connect to TPU VM
+
+```bash
+gcloud compute tpus tpu-vm ssh maxtext-tpu-node --zone=YOUR_ZONE
+```
+
+### Step 3: Install Dependencies
+
+```bash
+sudo apt update && sudo apt upgrade -y
+sudo apt install python3-pip python3-dev git -y
+pip3 install jupyterlab
+git clone https://github.com/AI-Hypercomputer/maxtext.git
+cd maxtext
+chmod +x setup.sh
+./setup.sh
+```
+
+### Step 4: Start Jupyter Lab
+
+```bash
+jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root
+```
+
+Copy the URL with token from terminal
+
+### Step 5: Secure Access
+
+#### Option A: SSH Tunnel (Recommended)
+
+```bash
+gcloud compute tpus tpu-vm ssh maxtext-tpu-node --zone=YOUR_ZONE -- -L 8888:localhost:8888
+```
+
+Then open → `http://localhost:8888`
+
+#### Option B: Direct IP (Less Secure)
+
+1. Open `http://<VM_PUBLIC_IP>:8888`
+2. Use token for login
+
+## Method 3: Colab + Local Jupyter Lab Hybrid
+
+This lets you develop locally but train on Colab TPU.
+
+### Step 1: Local Dev Setup
+
+```bash
+git clone https://github.com/AI-Hypercomputer/maxtext.git
+cd maxtext
+python3 -m venv maxtext_env
+source maxtext_env/bin/activate
+pip install jupyterlab -r requirements.txt
+jupyter lab
+```
+
+Open → `http://localhost:8888`
+
+### Step 2: Upload to Colab
+
+1. **File** → **Upload notebook**
+2. Enable TPU runtime in Colab
+3. Run on Colab TPU
+
+### Step 3: Sync Changes
+
+1. Download `.ipynb` from Colab after edits
+2. Replace local version → continue dev
+
+## Available Examples
+
+### Supervised Fine-Tuning (SFT)
+
+- **`sft_qwen3_demo.ipynb`** → Qwen3-0.6B with ultrachat_200k
+- **`sft_llama3_demo.ipynb`** → Llama3 with ultrachat_200k
+
+### GRPO Training
+
+- **`grpo_llama3_demo.ipynb`** → GRPO training on math dataset
+
+## Common Pitfalls & Debugging
+
+| Issue | Solution |
+|-------|----------|
+| ❌ TPU runtime mismatch | Check TPU runtime version matches VM image (`tpu-ubuntu-alpha-*`) |
+| ❌ Colab disconnects | Save checkpoints to GCS or Drive regularly |
+| ❌ "RESOURCE_EXHAUSTED" errors | Use smaller batch size or v5e-8 instead of v5e-1 |
+| ❌ Firewall blocked | Ensure port 8888 open, or always use SSH tunneling |
+| ❌ Path confusion | In Colab use `/content/maxtext`; in TPU VM use `~/maxtext` |
+
+## Support & Resources
+
+- 📘 [MaxText Documentation](https://github.com/AI-Hypercomputer/maxtext)
+- 💻 [Google Colab](https://colab.research.google.com)
+- ⚡ [Cloud TPU Docs](https://cloud.google.com/tpu/docs)
+- 🧩 [Jupyter Lab](https://jupyterlab.readthedocs.io)
+
+## Contributing
+
+If you encounter issues or have improvements for this guide, please:
+
+1. Open an issue on the MaxText repository
+2. Submit a pull request with your improvements
+3. Share your experience in the discussions
+
+---
+
+**Happy Training! 🚀**
\ No newline at end of file

From b6264b1508a902d609f58f41afb6628386cfc9c4 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 03:46:29 +0400
Subject: [PATCH 10/15] Restructured

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 93 ++++++++++++++---------
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index 40fd00d032..dae1427d1f 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -76,22 +76,13 @@
       },
       "outputs": [],
       "source": [
-        "### (Optional) Do not run this if you already installed the dependencies\n",
+        "#Install maxtext and dependencies\n",
+        "# 1. Install uv, a fast Python package installer\n",
+        "pip install uv\n",
         "\n",
-        "# 3. Ensure setup.sh is executable\n",
-        "!chmod +x setup.sh\n",
-        "\n",
-        "# 4. Execute the setup script\n",
-        "!./setup.sh\n",
-        "\n",
-        "# force numpy version\n",
-        "!pip install --force-reinstall numpy==2.1.2\n",
-        "#install nest_asyncio\n",
-        "!pip install nest_asyncio\n",
-        "\n",
-        "import nest_asyncio\n",
-        "nest_asyncio.apply()\n",
-        "# To fix \"This event loop is already running\" error in Colab\n"
+        "# 2. Install MaxText and its dependencies\n",
+        "uv pip install maxtext --resolution=lowest\n",
+        "install_maxtext_github_deps"
       ]
     },
     {
@@ -102,7 +93,7 @@
       },
       "outputs": [],
       "source": [
-        "#Set up the variables for the script\n",
+        "## Set up the maxtext environment\n",
         "import os\n",
         "import sys\n",
         "\n",
@@ -114,7 +105,16 @@
         "\n",
         "print(f\"MaxText Home directory (from Python): {MAXTEXT_REPO_ROOT}\")\n",
         "\n",
-        "DEBUG = False  # set to True to run in debug mode, for more print statements\n",
+        "DEBUG = False  # set to True to run in debug mode, for more print statements"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "## Set the model checkpoint path and output directory\n",
         "\n",
         "# Case 1: Set `MODEL_CHECKPOINT_PATH` to GCS path that already has `Qwen3-0.6B` model checkpoint\n",
         "# Case 2: If you do not have the checkpoint, then do not update `MODEL_CHECKPOINT_PATH`\n",
@@ -126,6 +126,29 @@
         "BASE_OUTPUT_DIRECTORY = \"/tmp/out/maxtext_qwen06\""
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "from typing import Optional, Dict, Any\n",
+        "\n",
+        "# Find MaxText directory and change working directory to it\n",
+        "current_dir = Path.cwd()\n",
+        "\n",
+        "maxtext_path = Path(f'{MAXTEXT_REPO_ROOT}') / 'src' / 'MaxText'\n",
+        "\n",
+        "# Change working directory to MaxText project root\n",
+        "os.chdir(maxtext_path)\n",
+        "sys.path.insert(0, str(maxtext_path))\n",
+        "\n",
+        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
+        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
+        "print(f\"✓ Added to Python path: {maxtext_path}\")"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -135,8 +158,10 @@
         "# Hugging Face Authentication Setup\n",
         "from huggingface_hub import login\n",
         "\n",
-        "# Set your Hugging Face token here\n",
-        "HF_TOKEN = \"your_actual_token_here\"  # Replace with your actual token\n",
+        "# Set your Hugging Face token as a secret in the Google Colab   \n",
+        "from google.colab import userdata\n",
+        "HF_TOKEN = userdata.get(\"HF_TOKEN\")\n",
+        "# HF_TOKEN = \"your_actual_token_here\" - use this for a private jupyter lab\n",
         "login(token=HF_TOKEN)"
       ]
     },
@@ -166,21 +191,7 @@
       },
       "outputs": [],
       "source": [
-        "from pathlib import Path\n",
-        "from typing import Optional, Dict, Any\n",
-        "\n",
-        "# Find MaxText directory and change working directory to it\n",
-        "current_dir = Path.cwd()\n",
-        "\n",
-        "maxtext_path = Path(f'{MAXTEXT_REPO_ROOT}') / 'src' / 'MaxText'\n",
-        "\n",
-        "# Change working directory to MaxText project root\n",
-        "os.chdir(maxtext_path)\n",
-        "sys.path.insert(0, str(maxtext_path))\n",
-        "\n",
-        "print(f\"✓ Changed working directory to: {os.getcwd()}\")\n",
-        "print(f\"✓ MaxText project root: {maxtext_path}\")\n",
-        "print(f\"✓ Added to Python path: {maxtext_path}\")\n",
+        "# this is the code to initialize jax if it's not initialized in the cell above\n",
         "import jax\n",
         "if not jax.distributed.is_initialized():\n",
         "    jax.distributed.initialize()\n",
@@ -287,11 +298,21 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3",
+      "display_name": "base",
+      "language": "python",
       "name": "python3"
     },
     "language_info": {
-      "name": "python"
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.2"
     }
   },
   "nbformat": 4,

From 81f20a50fa53f73992125703ac02274d3ccdca37 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 03:57:44 +0400
Subject: [PATCH 11/15] Fix

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 docs/tutorials/grpo.md         |  2 +-
 src/MaxText/examples/README.md | 37 ++++------------------------------
 2 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/docs/tutorials/grpo.md b/docs/tutorials/grpo.md
index 3030b1714b..cbf615f25b 100644
--- a/docs/tutorials/grpo.md
+++ b/docs/tutorials/grpo.md
@@ -45,7 +45,7 @@ Next, run the following bash script to get all the necessary installations insid
 This will take few minutes. Follow along the installation logs and look out for any issues!
 
 ```
-bash ~/maxtext/MaxText/examples/install_tunix_vllm_requirement.sh
+bash ~/maxtext/src/MaxText/examples/install_tunix_vllm_requirement.sh
 ```
 
 1. It installs `pip install keyring keyrings.google-artifactregistry-auth` which enables pip to authenticate with Google Artifact Registry automatically.
diff --git a/src/MaxText/examples/README.md b/src/MaxText/examples/README.md
index 02de34d3df..cdade52a32 100644
--- a/src/MaxText/examples/README.md
+++ b/src/MaxText/examples/README.md
@@ -60,6 +60,8 @@ Upload notebooks or mount your GitHub repo
 2. Try:
    - `sft_qwen3_demo.ipynb`
    - `sft_llama3_demo.ipynb`
+   - `grpo_llama3_demo.ipynb`
+
 
 > ⚡ **Tip:** If Colab disconnects, re-enable TPU and re-run setup cells. Save checkpoints to GCS or Drive.
 
@@ -89,10 +91,6 @@ gcloud compute tpus tpu-vm ssh maxtext-tpu-node --zone=YOUR_ZONE
 sudo apt update && sudo apt upgrade -y
 sudo apt install python3-pip python3-dev git -y
 pip3 install jupyterlab
-git clone https://github.com/AI-Hypercomputer/maxtext.git
-cd maxtext
-chmod +x setup.sh
-./setup.sh
 ```
 
 ### Step 4: Start Jupyter Lab
@@ -113,38 +111,11 @@ gcloud compute tpus tpu-vm ssh maxtext-tpu-node --zone=YOUR_ZONE -- -L 8888:loca
 
 Then open → `http://localhost:8888`
 
-#### Option B: Direct IP (Less Secure)
-
-1. Open `http://<VM_PUBLIC_IP>:8888`
-2. Use token for login
 
 ## Method 3: Colab + Local Jupyter Lab Hybrid
 
-This lets you develop locally but train on Colab TPU.
-
-### Step 1: Local Dev Setup
-
-```bash
-git clone https://github.com/AI-Hypercomputer/maxtext.git
-cd maxtext
-python3 -m venv maxtext_env
-source maxtext_env/bin/activate
-pip install jupyterlab -r requirements.txt
-jupyter lab
-```
-
-Open → `http://localhost:8888`
-
-### Step 2: Upload to Colab
-
-1. **File** → **Upload notebook**
-2. Enable TPU runtime in Colab
-3. Run on Colab TPU
-
-### Step 3: Sync Changes
-
-1. Download `.ipynb` from Colab after edits
-2. Replace local version → continue dev
+Set up Jupyter Lab on the TPU VM as described in Method 2.
+Then, in Colab's runtime dropdown, choose "Connect to a local runtime" and paste the Jupyter Lab URL (including its token).
 
 ## Available Examples
 

From 59f9f6f38ba50764e7e0d26c7b93c74d5ae76771 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 04:10:57 +0400
Subject: [PATCH 12/15] Fix

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/MaxText/examples/README.md b/src/MaxText/examples/README.md
index cdade52a32..292fce0825 100644
--- a/src/MaxText/examples/README.md
+++ b/src/MaxText/examples/README.md
@@ -1,4 +1,4 @@
-# MaxText Examples - TPU Setup Guide
+# MaxText Examples - Setting up Jupyter Lab or Colab to run them on TPU
 
 This guide provides comprehensive instructions for setting up Jupyter Lab on TPU and connecting it to Google Colab for running MaxText examples.
 
@@ -51,7 +51,6 @@ Upload notebooks or mount your GitHub repo
 ```python
 !git clone https://github.com/AI-Hypercomputer/maxtext.git
 %cd maxtext
-!pip install -r requirements.txt
 ```
 
 ### Step 4: Run Examples

From 053b98086b4d6125d895d0ea4a22113ab6a32935 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 04:15:22 +0400
Subject: [PATCH 13/15] Fix

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/MaxText/examples/README.md b/src/MaxText/examples/README.md
index 292fce0825..d0e2dbfe12 100644
--- a/src/MaxText/examples/README.md
+++ b/src/MaxText/examples/README.md
@@ -63,6 +63,7 @@ Upload notebooks or mount your GitHub repo
 
 
 > ⚡ **Tip:** If Colab disconnects, re-enable TPU and re-run setup cells. Save checkpoints to GCS or Drive.
+> ⚡ **Tip:** If Colab asks you to restart the session, do so and then continue running the remaining cells.
 
 ## Method 2: Local Jupyter Lab with TPU
 
@@ -120,8 +121,8 @@ Use the link for Jupyter Lab as a link for "Connect to a local runtime" in Colla
 
 ### Supervised Fine-Tuning (SFT)
 
-- **`sft_qwen3_demo.ipynb`** → Qwen3-0.6B with ultrachat_200k
-- **`sft_llama3_demo.ipynb`** → Llama3 with ultrachat_200k
+- **`sft_qwen3_demo.ipynb`** → Qwen3-0.6B with Hugging Face ultrachat_200k dataset
+- **`sft_llama3_demo.ipynb`** → Llama3.1-8B with Hugging Face ultrachat_200k dataset
 
 ### GRPO Training
 

From 5d7aea5497739c3c0eddba5d6449259d5568ec0a Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 04:27:51 +0400
Subject: [PATCH 14/15] Fix

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_qwen3_demo.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/MaxText/examples/sft_qwen3_demo.ipynb b/src/MaxText/examples/sft_qwen3_demo.ipynb
index dae1427d1f..7941a33f37 100644
--- a/src/MaxText/examples/sft_qwen3_demo.ipynb
+++ b/src/MaxText/examples/sft_qwen3_demo.ipynb
@@ -78,11 +78,11 @@
       "source": [
         "#Install maxtext and dependencies\n",
         "# 1. Install uv, a fast Python package installer\n",
-        "pip install uv\n",
+        "!pip install uv\n",
         "\n",
         "# 2. Install MaxText and its dependencies\n",
-        "uv pip install maxtext --resolution=lowest\n",
-        "install_maxtext_github_deps"
+        "!uv pip install maxtext --resolution=lowest\n",
+        "!install_maxtext_github_deps"
       ]
     },
     {

From bfa714891d53408e78491b194a08cb0c292e8af8 Mon Sep 17 00:00:00 2001
From: Vladimir Suvorov <suvorovv@google.com>
Date: Fri, 26 Sep 2025 04:30:48 +0400
Subject: [PATCH 15/15] Fix install

Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
---
 src/MaxText/examples/sft_llama3_demo.ipynb | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/MaxText/examples/sft_llama3_demo.ipynb b/src/MaxText/examples/sft_llama3_demo.ipynb
index 457f346593..e12e1f5e20 100644
--- a/src/MaxText/examples/sft_llama3_demo.ipynb
+++ b/src/MaxText/examples/sft_llama3_demo.ipynb
@@ -69,22 +69,14 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "### (Optional) Do not run this if you already installed the dependencies\n",
         "\n",
-        "# 3. Ensure setup.sh is executable\n",
-        "!chmod +x setup.sh\n",
+        "#Install maxtext and dependencies\n",
+        "# 1. Install uv, a fast Python package installer\n",
+        "!pip install uv\n",
         "\n",
-        "# 4. Execute the setup script\n",
-        "!./setup.sh\n",
-        "\n",
-        "# force numpy version\n",
-        "!pip install --force-reinstall numpy==2.1.2\n",
-        "#install nest_asyncio\n",
-        "!pip install nest_asyncio\n",
-        "\n",
-        "import nest_asyncio\n",
-        "nest_asyncio.apply()\n",
-        "# To fix \"This event loop is already running\" error in Colab\n"
+        "# 2. Install MaxText and its dependencies\n",
+        "!uv pip install maxtext --resolution=lowest\n",
+        "!install_maxtext_github_deps\n"
       ]
     },
     {