Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a128760
feat: autoregressive transformer poc
balakhnov Sep 1, 2025
2f74d5d
fix: n_obs = 1 only
balakhnov Sep 2, 2025
a88971e
fix: device
balakhnov Sep 3, 2025
e292eaf
feat: limited generation
balakhnov Sep 4, 2025
29e4b07
fix: env normalisation fixed
balakhnov Sep 5, 2025
7e0bac2
feat: dependencies
balakhnov Sep 5, 2025
0996a00
feat: smollm added
balakhnov Sep 5, 2025
2f96d3a
chore: llm checkpoint refactoring
balakhnov Sep 6, 2025
2990a74
feat: special tokens in forward
balakhnov Sep 6, 2025
f17798e
fix non blocking
balakhnov Sep 13, 2025
fced732
feat: gemma
balakhnov Sep 14, 2025
9cadf3f
fix: obs padding
balakhnov Sep 19, 2025
ede8c41
feat: padding obs
balakhnov Sep 19, 2025
131c752
feat: smolvlm
balakhnov Sep 21, 2025
8811597
feat:no grad
balakhnov Sep 24, 2025
fee8d19
feat: no image resize
balakhnov Sep 25, 2025
ac39326
chore: refactor
balakhnov Sep 26, 2025
7dc836b
chore: refactor
balakhnov Sep 29, 2025
022fac5
chore: doc
balakhnov Sep 30, 2025
a93e79e
feat: docs
balakhnov Sep 30, 2025
991fd8d
feat: optim groups
balakhnov Oct 2, 2025
a67f5e5
feat: fast tokenizer, drop n last frames
balakhnov Oct 3, 2025
487faf6
feat: rebase fork + processor
balakhnov Oct 4, 2025
3b299f8
feat: configs
balakhnov Oct 4, 2025
a1d28ac
feat: smolvlm2 500m config
balakhnov Oct 5, 2025
948f90b
Add profiling and fast image processor
Oct 5, 2025
32e2360
Small changes
Oct 5, 2025
ae9e456
Add random crops
Oct 5, 2025
c10ed42
Merge pull request #3 from Robot-Learning-Collective/southfreebird/sm…
southfreebird Oct 6, 2025
9a78657
drop_last_frames bool -> int; fix wandb warning
zaringleb Oct 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions configs/default_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
    "dataset": {
        "repo_id": "lerobot/pusht"
    },
    "env": {
        "type": "pusht",
        "task": "PushT-v0",
        "obs_type": "pixels_agent_pos",
        "render_mode": "rgb_array"
    },
    "policy": {
        "type": "smolandfast",
        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
        "n_obs_steps": 1,
        "device": "cuda",
        "use_amp": false,
        "push_to_hub": false,
        "chunk_size": 10,
        "n_action_steps": 5,
        "vision_model_optimizer_lr": 0.0002,
        "connector_optimizer_lr": 0.0002,
        "text_model_optimizer_lr": 0.0002,
        "optimizer_lr": 0.0002,
        "optimizer_betas": [
            0.9,
            0.95
        ],
        "optimizer_eps": 1e-08,
        "optimizer_weight_decay": 0.01,
        "scheduler_warmup_steps": 1000,
        "scheduler_decay_steps": 30000,
        "scheduler_decay_lr": 2.5e-06,
        "precision": "float32",
        "freeze_vision_encoder": true,
        "freeze_connector": true,
        "scale_factor": 4,
        "do_image_splitting": false,
        "drop_n_last_frames": 5,
        "grad_clip_norm": 1,
        "relaxed_action_decoding": true
    },
    "num_workers": 4,
    "batch_size": 16,
    "steps": 10000,
    "eval_freq": 1000,
    "log_freq": 100,
    "save_checkpoint": true,
    "save_freq": 10000,
    "eval": {
        "n_episodes": 32,
        "batch_size": 16
    },
    "wandb": {
        "enable": true,
        "project": "lerobot"
    }
}
59 changes: 59 additions & 0 deletions configs/smol_and_fast_30000.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
    "dataset": {
        "repo_id": "lerobot/pusht"
    },
    "env": {
        "type": "pusht",
        "task": "PushT-v0",
        "obs_type": "pixels_agent_pos",
        "render_mode": "rgb_array"
    },
    "policy": {
        "type": "smolandfast",
        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
        "n_obs_steps": 1,
        "device": "cuda",
        "use_amp": false,
        "push_to_hub": false,
        "chunk_size": 15,
        "n_action_steps": 5,
        "vision_model_optimizer_lr": 0.0002,
        "connector_optimizer_lr": 0.0002,
        "text_model_optimizer_lr": 0.0002,
        "optimizer_lr": 0.0002,
        "optimizer_betas": [
            0.9,
            0.95
        ],
        "optimizer_eps": 1e-08,
        "optimizer_weight_decay": 0.01,
        "scheduler_warmup_steps": 1000,
        "scheduler_decay_steps": 30000,
        "scheduler_decay_lr": 2.5e-06,
        "precision": "float32",
        "freeze_vision_encoder": true,
        "freeze_connector": false,
        "scale_factor": 4,
        "do_image_splitting": false,
        "drop_n_last_frames": 10,
        "grad_clip_norm": 1,
        "relaxed_action_decoding": true,
        "crop_shape": [
            84,
            84
        ]
    },
    "num_workers": 4,
    "batch_size": 16,
    "steps": 30000,
    "eval_freq": 1000,
    "log_freq": 50,
    "save_checkpoint": true,
    "save_freq": 10000,
    "profile_step_num": -1,
    "eval": {
        "n_episodes": 64,
        "batch_size": 16
    },
    "wandb": {
        "enable": true,
        "project": "lerobot"
    }
}
57 changes: 57 additions & 0 deletions configs/smolvlm2_500.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
    "dataset": {
        "repo_id": "lerobot/pusht"
    },
    "env": {
        "type": "pusht",
        "task": "PushT-v0",
        "obs_type": "pixels_agent_pos",
        "render_mode": "rgb_array"
    },
    "policy": {
        "type": "smolandfast",
        "vlm_checkpoint": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
        "n_obs_steps": 1,
        "device": "cuda",
        "use_amp": false,
        "push_to_hub": false,
        "chunk_size": 15,
        "n_action_steps": 5,
        "vision_model_optimizer_lr": 0.0002,
        "connector_optimizer_lr": 0.0002,
        "text_model_optimizer_lr": 0.0002,
        "optimizer_lr": 0.0002,
        "optimizer_betas": [
            0.9,
            0.95
        ],
        "optimizer_eps": 1e-08,
        "optimizer_weight_decay": 0.01,
        "scheduler_warmup_steps": 1000,
        "scheduler_decay_steps": 30000,
        "scheduler_decay_lr": 2.5e-06,
        "precision": "float32",
        "freeze_vision_encoder": true,
        "freeze_connector": false,
        "scale_factor": 2,
        "do_image_splitting": false,
        "drop_n_last_frames": 10,
        "grad_clip_norm": 1,
        "relaxed_action_decoding": true
    },
    "num_workers": 4,
    "batch_size": 16,
    "steps": 10000,
    "eval_freq": 1000,
    "log_freq": 100,
    "save_checkpoint": true,
    "save_freq": 10000,
    "eval": {
        "n_episodes": 32,
        "batch_size": 16
    },
    "wandb": {
        "enable": true,
        "project": "lerobot"
    }
}
144 changes: 144 additions & 0 deletions examples/smolandfast_test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdfec91b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "import torch\n",
    "from lerobot.datasets.utils import cycle\n",
    "\n",
    "from lerobot.configs.types import FeatureType\n",
    "from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata\n",
    "from lerobot.datasets.utils import dataset_to_policy_features\n",
    "from lerobot.policies.smolandfast.configuration_smolandfast import SMOLANDFASTConfig\n",
    "from lerobot.policies.smolandfast.modeling_smolandfast import SMOLANDFASTPolicy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a05fb55b",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_directory = Path(\"outputs/train/example_pusht\")\n",
    "output_directory.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "device = torch.device(\"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fad8c876",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_PATH = \"lerobot/pusht_keypoints\"\n",
    "\n",
    "dataset_metadata = LeRobotDatasetMetadata(DATASET_PATH)\n",
    "features = dataset_to_policy_features(dataset_metadata.features)\n",
    "output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}\n",
    "input_features = {key: ft for key, ft in features.items() if key not in output_features}\n",
    "\n",
    "cfg = SMOLANDFASTConfig(input_features=input_features,\n",
    " output_features=output_features)\n",
    "\n",
    "delta_timestamps = {\n",
    " \"action\": [i / dataset_metadata.fps for i in cfg.action_delta_indices],\n",
    " }\n",
    "\n",
    "# We can then instantiate the dataset with these delta_timestamps configuration.\n",
    "dataset = LeRobotDataset(DATASET_PATH, delta_timestamps=delta_timestamps)\n",
    "\n",
    "dataloader = torch.utils.data.DataLoader(\n",
    " dataset,\n",
    " num_workers=0,\n",
    " batch_size=4,\n",
    " shuffle=True,\n",
    " pin_memory=device.type != \"cpu\",\n",
    " drop_last=True,\n",
    ")\n",
    "dl_iter = cycle(dataloader)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbaf9fd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "policy = SMOLANDFASTPolicy(cfg,\n",
    " dataset_stats=dataset_metadata.stats)\n",
    "policy.train()\n",
    "policy.to(device)\n",
    "\n",
    "optimizer = torch.optim.Adam(policy.parameters(), lr=5e-5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ab062ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch = next(dl_iter)\n",
    "\n",
    "for step in tqdm(range(50)):\n",
    "\n",
    " batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}\n",
    " loss, _ = policy.forward(batch)\n",
    " \n",
    " loss.backward()\n",
    " optimizer.step()\n",
    " optimizer.zero_grad()\n",
    "\n",
    " print(f\"step: {step} loss: {loss.item():.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22b833de",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_norm = policy.normalize_inputs(batch)\n",
    "batch_norm = policy.normalize_targets(batch_norm)\n",
    "\n",
    "decoded_actions = policy.model.generate_actions(batch_norm)\n",
    "# RMSE: take the mean of the squared error over dim 1 *before* the square root;\n",
    "# sqrt of the element-wise square is only the absolute error.\n",
    "squared_error: torch.Tensor = (decoded_actions - batch_norm[\"action\"]) ** 2\n",
    "rmse = torch.sqrt(squared_error.mean(dim=1))\n",
    "\n",
    "print(f\"RMSE {(rmse*100).tolist()}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".lerobot",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Loading