End-to-end dexterous manipulation: predict trajectories from RGB-D + language, then execute with RL policy.
[RGB-D + Text] → IntentTracker → [Trajectory] → RL Policy → [Motor Commands]
sudo apt install -y libegl1-mesa-dev libgles2-mesa-dev libosmesa6-dev libopenexr-dev build-essential git
Install Miniforge if conda is not available, then:
conda create -n sam python=3.11 -y
conda activate sam
# PyTorch (CUDA 12.8 for RTX 5090 / Blackwell)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# CUDA toolkit (needed to build ViPE C++ extensions)
conda install cuda-toolkit=12.8 -c nvidia -y
git submodule update --init --force \
thirdparty/sam3 \
thirdparty/DenseTrack3Dv2 \
thirdparty/WiLoR \
thirdparty/vipe \
thirdparty/tapnext \
thirdparty/sam2-realtime \
thirdparty/LEAP_Hand_API
# SAM3
pip install -e thirdparty/sam3
pip install pycocotools
# ViPE
CUDA_HOME=$CONDA_PREFIX pip install -e thirdparty/vipe
# sam2-realtime
pip install -e thirdparty/sam2-realtime
# DenseTrack3Dv2 (no setup.py, uses sys.path at runtime)
pip install jaxtyping diffusers==0.29.1 accelerate mediapy fire decord OpenEXR yacs natsort
# WiLoR (no setup.py, uses sys.path at runtime)
pip install pyrender smplx==0.1.28 ultralytics==8.1.34 pytorch-lightning yacs PyOpenGL PyOpenGL-accelerate
pip install --no-build-isolation "chumpy @ git+https://github.com/mattloper/chumpy"
pip install "setuptools<81"
pip install \
opencv-python Pillow "imageio[ffmpeg]" pandas h5py \
matplotlib tqdm natsort timm einops wandb omegaconf hydra-core \
diffusers transformers sentencepiece supervision pyyaml scipy \
scikit-image kornia gdown flask duckdb pyarrow
Required for SAM3 model downloads and dataset downloads.
pip install "huggingface_hub[cli]"
huggingface-cli login
Request access to SAM3 checkpoints at https://huggingface.co/facebook/sam3
# DenseTrack3Dv2
gdown --fuzzy "https://drive.google.com/file/d/1Qa9YFAjBFIzrrHHWf8NZMln5YkLfw4Qa/view" \
-O thirdparty/DenseTrack3Dv2/densetrack3dv2.pth
# WiLoR
wget https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/detector.pt \
-O thirdparty/WiLoR/pretrained_models/detector.pt
wget https://huggingface.co/spaces/rolpotamias/WiLoR/resolve/main/pretrained_models/wilor_final.ckpt \
-O thirdparty/WiLoR/pretrained_models/wilor_final.ckpt
# SAM3 auto-downloads on first use (requires HuggingFace auth above)
# MANO_RIGHT.pkl is included in thirdparty/WiLoR/mano_data/ via our fork
All large outputs live under Y2R_DATA_ROOT (defaults to repo root if unset).
# Optional: point to a drive with more space
export Y2R_DATA_ROOT="/path/to/data/drive"
$Y2R_DATA_ROOT/
├── y2r/
│ ├── checkpoints/ # IntentTracker model checkpoints
│ └── wandb/ # W&B run logs
├── data/
│ └── datasets/ # Training data (HDF5, pipeline intermediates)
└── IsaacLab/
└── logs/rl_games/ # RL training logs
| Component | How it resolves |
|---|---|
| `y2r/train.py` | `os.environ.get("Y2R_DATA_ROOT", repo_root)` prepended to config paths |
| `data/dataset_scripts/*.py` | `load_config()` prepends to `base_data_dir` |
| `isaac_scripts/common.sh` | `Y2R_DATA_ROOT="${Y2R_DATA_ROOT:-$REPO_ROOT}"` |
conda activate sam
python y2r/train.py --config y2r/configs/train_direct.yaml # Direct prediction
python y2r/train.py --config y2r/configs/train_diffusion.yaml # Diffusion-based
python y2r/train.py --config y2r/configs/train_autoreg.yaml # Autoregressive
# Resume
python y2r/train.py --config y2r/configs/train_direct.yaml --resume path/to/ckpt.pt
Scripts in data/dataset_scripts/ process raw videos into training data. Config: data/dataset_scripts/config.yaml.
conda activate sam
python data/dataset_scripts/preprocess.py # Extract frames at target FPS
python data/dataset_scripts/process_gsam.py # Segment objects (SAM3)
python data/dataset_scripts/process_vipe.py # Depth maps + camera poses (ViPE)
python data/dataset_scripts/process_densetrack3d.py # 3D point tracking
python data/dataset_scripts/process_wilor.py # Hand pose extraction
python data/dataset_scripts/create_h5_dataset.py      # Package into HDF5
Flask + DuckDB app for browsing training datasets. See data/dataset_explorer/README.md for full details.
All commands run from data/dataset_explorer/.
cd data/dataset_explorer
mkdir -p data/source data/videos
# Panda-70M metadata (~17 parquet shards)
huggingface-cli download SUSTech/panda-70m --repo-type dataset \
--local-dir data/source/panda70m
# SSv2 labels
# Download train.json + validation.json from Something-Something-V2
# Place in data/source/ssv2_meta/
# SSv2 videos (~19 GB)
huggingface-cli download morpheushoc/something-something-v2 \
--repo-type dataset --local-dir data/videos/ssv2_download
mkdir -p data/videos/ssv2
cd data/videos/ssv2_download/videos
cat 20bn-something-something-v2-* | tar -xz --strip-components=1 -C ../../ssv2/
cd ../../../..
# Action100M (single CSV)
# Place action100m_actions.csv in data/source/
# LVP metadata (3 CSVs)
# Place in data/source/lvp/{pandas,something_something_v2,epic_kitchens}/
python scripts/setup_data.py   # Source data -> parquet files
python app.py                  # http://localhost:5555
RL policy training uses conda env y2r and lives in IsaacLab/ + isaac_scripts/. See CLAUDE.md for details.