huggingface · zaringleb · Sep 1, 2025 · Sep 2, 2025 · Sep 3, 2025 · Sep 4, 2025
diff --git a/configs/default_config.json b/configs/default_config.json
@@ -0,0 +1,57 @@
+{
+    "dataset": {
+        "repo_id": "lerobot/pusht"
+    },
+    "env": {
+        "type": "pusht",
+        "task": "PushT-v0",
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array"
+    },
+    "policy": {
+        "type": "smolandfast",
+        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+        "n_obs_steps": 1,
+        "device": "cuda",
+        "use_amp": false,
+        "push_to_hub": false,
+        "chunk_size": 10,
+        "n_action_steps": 5,
+        "vision_model_optimizer_lr": 0.0002,
+        "connector_optimizer_lr": 0.0002,
+        "text_model_optimizer_lr": 0.0002,
+        "optimizer_lr": 0.0002,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 0.01,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "precision": "float32",
+        "freeze_vision_encoder": true,
+        "freeze_connector": true,
+        "scale_factor": 4,
+        "do_image_splitting": false,
+        "drop_n_last_frames": true,
+        "grad_clip_norm": 1,
+        "relaxed_action_decoding": true
+    },
+    "num_workers": 4,
+    "batch_size": 16,
+    "steps": 10000,
+    "eval_freq": 1000,
+    "log_freq": 100,
+    "save_checkpoint": true,
+    "save_freq": 10000,
+    "eval": {
+        "n_episodes": 32,
+        "batch_size": 16
+    },
+    "wandb": {
+        "enable": true,
+        "project": "lerobot"
+    }
+}
diff --git a/configs/smol_and_fast_30000.json b/configs/smol_and_fast_30000.json
@@ -0,0 +1,59 @@
+{
+    "dataset": {
+        "repo_id": "lerobot/pusht"
+    },
+    "env": {
+        "type": "pusht",
+        "task": "PushT-v0",
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array"
+    },
+    "policy": {
+        "type": "smolandfast",
+        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+        "n_obs_steps": 1,
+        "device": "cuda",
+        "use_amp": false,
+        "push_to_hub": false,
+        "chunk_size": 15,
+        "n_action_steps": 5,
+        "vision_model_optimizer_lr": 0.0002,
+        "connector_optimizer_lr": 0.0002,
+        "text_model_optimizer_lr": 0.0002,
+        "optimizer_lr": 0.0002,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 0.01,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "precision": "float32",
+        "freeze_vision_encoder": true,
+        "freeze_connector": false,
+        "scale_factor": 4,
+        "do_image_splitting": false,
+        "drop_n_last_frames": 10,
+        "grad_clip_norm": 1,
+        "relaxed_action_decoding": true,
+        "crop_shape": [84, 84]
+    },
+    "num_workers": 4,
+    "batch_size": 16,
+    "steps": 30000,
+    "eval_freq": 1000,
+    "log_freq": 50,
+    "save_checkpoint": true,
+    "save_freq": 10000,
+    "profile_step_num": -1,
+    "eval": {
+        "n_episodes": 64,
+        "batch_size": 16
+    },
+    "wandb": {
+        "enable": true,
+        "project": "lerobot"
+    }
+}
diff --git a/configs/smolvlm2_500.json b/configs/smolvlm2_500.json
@@ -0,0 +1,57 @@
+{
+    "dataset": {
+        "repo_id": "lerobot/pusht"
+    },
+    "env": {
+        "type": "pusht",
+        "task": "PushT-v0",
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array"
+    },
+    "policy": {
+        "type": "smolandfast",
+        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+        "n_obs_steps": 1,
+        "device": "cuda",
+        "use_amp": false,
+        "push_to_hub": false,
+        "chunk_size": 15,
+        "n_action_steps": 5,
+        "vision_model_optimizer_lr": 0.0002,
+        "connector_optimizer_lr": 0.0002,
+        "text_model_optimizer_lr": 0.0002,
+        "optimizer_lr": 0.0002,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 0.01,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "precision": "float32",
+        "freeze_vision_encoder": true,
+        "freeze_connector": false,
+        "scale_factor": 2,
+        "do_image_splitting": false,
+        "drop_n_last_frames": true,
+        "grad_clip_norm": 1,
+        "relaxed_action_decoding": true
+    },
+    "num_workers": 4,
+    "batch_size": 16,
+    "steps": 10000,
+    "eval_freq": 1000,
+    "log_freq": 100,
+    "save_checkpoint": true,
+    "save_freq": 10000,
+    "eval": {
+        "n_episodes": 32,
+        "batch_size": 16
+    },
+    "wandb": {
+        "enable": true,
+        "project": "lerobot"
+    }
+}
diff --git a/examples/smolandfast_test.ipynb b/examples/smolandfast_test.ipynb
@@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fdfec91b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "import torch\n",
+    "from lerobot.datasets.utils import cycle\n",
+    "\n",
+    "from lerobot.configs.types import FeatureType\n",
+    "from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata\n",
+    "from lerobot.datasets.utils import dataset_to_policy_features\n",
+    "from lerobot.policies.smolandfast.configuration_smolandfast import SMOLANDFASTConfig\n",
+    "from lerobot.policies.smolandfast.modeling_smolandfast import SMOLANDFASTPolicy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a05fb55b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_directory = Path(\"outputs/train/example_pusht\")\n",
+    "output_directory.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "device = torch.device(\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fad8c876",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET_PATH = \"lerobot/pusht_keypoints\"\n",
+    "\n",
+    "dataset_metadata = LeRobotDatasetMetadata(DATASET_PATH)\n",
+    "features = dataset_to_policy_features(dataset_metadata.features)\n",
+    "output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}\n",
+    "input_features = {key: ft for key, ft in features.items() if key not in output_features}\n",
+    "\n",
+    "cfg = SMOLANDFASTConfig(input_features=input_features,\n",
+    "                        output_features=output_features)\n",
+    "\n",
+    "delta_timestamps = {\n",
+    "        \"action\": [i / dataset_metadata.fps for i in cfg.action_delta_indices],\n",
+    "    }\n",
+    "\n",
+    "# We can then instantiate the dataset with this delta_timestamps configuration.\n",
+    "dataset = LeRobotDataset(DATASET_PATH, delta_timestamps=delta_timestamps)\n",
+    "\n",
+    "dataloader = torch.utils.data.DataLoader(\n",
+    "    dataset,\n",
+    "    num_workers=0,\n",
+    "    batch_size=4,\n",
+    "    shuffle=True,\n",
+    "    pin_memory=device.type != \"cpu\",\n",
+    "    drop_last=True,\n",
+    ")\n",
+    "dl_iter = cycle(dataloader)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbaf9fd2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy = SMOLANDFASTPolicy(cfg,\n",
+    "                           dataset_stats=dataset_metadata.stats)\n",
+    "policy.train()\n",
+    "policy.to(device)\n",
+    "\n",
+    "optimizer = torch.optim.Adam(policy.parameters(), lr=5e-5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ab062ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch = next(dl_iter)\n",
+    "\n",
+    "for step in tqdm(range(50)):\n",
+    "\n",
+    "    batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}\n",
+    "    loss, _ = policy.forward(batch)\n",
+    "    \n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "    optimizer.zero_grad()\n",
+    "\n",
+    "    print(f\"step: {step} loss: {loss.item():.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22b833de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_norm = policy.normalize_inputs(batch)\n",
+    "batch_norm = policy.normalize_targets(batch_norm)\n",
+    "# Decode actions for the batch used in the training loop and compare them with the normalized targets.\n",
+    "decoded_actions = policy.model.generate_actions(batch_norm)\n",
+    "error: torch.Tensor = torch.sqrt(((decoded_actions - batch_norm[\"action\"]) ** 2).mean(dim=1))\n",
+    "# RMSE takes the mean before the sqrt (sqrt(x**2) alone is only |x|); dim=1 is presumably the action-chunk axis - TODO confirm.\n",
+    "print(f\"RMSE {(error * 100).tolist()}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".lerobot",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}