131 changes: 131 additions & 0 deletions download_dataset_locally.py
@@ -0,0 +1,131 @@
#!/usr/bin/env python

"""
Download script for the ywu67/keychain dataset using LeRobot.
This script downloads all data and files of a Hugging Face Hub dataset.
"""

import logging
import sys
from pathlib import Path

from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.utils.constants import HF_LEROBOT_HOME

# Configure logging to both stdout and a log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('download_log.txt')
    ]
)


def download_keychain_dataset(
    repo_id: str = "ywu67/keychain",
    root_dir: str | Path | None = None,
    download_videos: bool = True,
    force_cache_sync: bool = False
):
    """
    Download a LeRobot dataset from the Hugging Face Hub (defaults to ywu67/keychain).

    Args:
        repo_id: The repository ID on the Hugging Face Hub.
        root_dir: Local directory to store the dataset (defaults to HF_LEROBOT_HOME).
        download_videos: Whether to download video files.
        force_cache_sync: Whether to force a sync and refresh local files.
    """
    if root_dir is None:
        root_dir = HF_LEROBOT_HOME / repo_id.replace("/", "_")
    else:
        root_dir = Path(root_dir)

    logging.info(f"Starting download of dataset: {repo_id}")
    logging.info(f"Download location: {root_dir}")
    logging.info(f"Download videos: {download_videos}")
    logging.info(f"Force cache sync: {force_cache_sync}")

    try:
        # Instantiating the dataset object triggers the download
        dataset = LeRobotDataset(
            repo_id=repo_id,
            root=root_dir,
            download_videos=download_videos,
            force_cache_sync=force_cache_sync
        )

        logging.info("Dataset downloaded successfully!")
        logging.info("Dataset info:")
        logging.info(f"  - Total episodes: {dataset.num_episodes}")
        logging.info(f"  - Total frames: {dataset.num_frames}")
        logging.info(f"  - FPS: {dataset.fps}")
        logging.info(f"  - Features: {list(dataset.features.keys())}")
        logging.info(f"  - Camera keys: {dataset.meta.camera_keys}")
        logging.info(f"  - Video keys: {dataset.meta.video_keys}")
        logging.info(f"  - Dataset size on disk: {get_directory_size(root_dir):.2f} MB")

        return dataset

    except Exception as e:
        logging.error(f"Error downloading dataset: {e}")
        logging.error(f"Error type: {type(e).__name__}")
        raise


def get_directory_size(path: Path) -> float:
    """Calculate the total size of a directory in MB."""
    total_size = 0
    try:
        for file_path in path.rglob('*'):
            if file_path.is_file():
                total_size += file_path.stat().st_size
        return total_size / (1024 * 1024)  # Convert bytes to MB
    except Exception as e:
        logging.warning(f"Could not calculate directory size: {e}")
        return 0.0


def main():
    """Main function to run the download script."""
    # Configuration
    REPO_ID = "ywu67/record-test30"
    DOWNLOAD_DIR = Path("./downloaded_dataset")  # Download into the current directory

    logging.info("=" * 60)
    logging.info("LeRobot Dataset Downloader")
    logging.info("=" * 60)

    try:
        # Download the dataset
        dataset = download_keychain_dataset(
            repo_id=REPO_ID,
            root_dir=DOWNLOAD_DIR,
            download_videos=True,
            force_cache_sync=False  # Set to True to re-download even if files exist
        )

        logging.info("=" * 60)
        logging.info("Download completed successfully!")
        logging.info(f"Dataset available at: {DOWNLOAD_DIR}")
        logging.info("=" * 60)

        # Print a final summary
        print(f"\nβœ“ Successfully downloaded {REPO_ID}")
        print(f"πŸ“ Location: {DOWNLOAD_DIR.absolute()}")
        print(f"πŸ“Š Episodes: {dataset.num_episodes}")
        print(f"🎬 Frames: {dataset.num_frames}")
        print(f"πŸ“Ή Videos included: {len(dataset.meta.video_keys) > 0}")

    except KeyboardInterrupt:
        logging.info("Download interrupted by user")
        print("\n⚠️ Download interrupted by user")
        sys.exit(1)
    except Exception as e:
        logging.error(f"Download failed: {e}")
        print(f"\n❌ Download failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
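
For quick checks, the helper above can also be driven from a Python session instead of the CLI. A minimal usage sketch, assuming download_dataset_locally.py sits on the import path; all function and parameter names come from the script above, and skipping videos is just an option the signature already exposes:

# Usage sketch (assumes the script is importable as a module).
from download_dataset_locally import download_keychain_dataset

# Fetch the tabular data only; videos can be pulled later by re-running
# with download_videos=True.
dataset = download_keychain_dataset(
    repo_id="ywu67/keychain",
    root_dir="./keychain_data",
    download_videos=False,
)
print(f"{dataset.num_episodes} episodes at {dataset.fps} FPS")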
59 changes: 59 additions & 0 deletions src/lerobot/datasets/aggregate.py
@@ -493,3 +493,62 @@ def finalize_aggregation(aggr_meta, all_metadata):
logging.info("write stats")
aggr_meta.stats = aggregate_stats([m.stats for m in all_metadata])
write_stats(aggr_meta.stats, aggr_meta.root)


def merge_grab_datasets():
"""Merge grab1-record-test and grab2-record-test datasets using aggregate_datasets."""

# Dataset paths
grab1_path = Path("/Users/chenz/.cache/huggingface/lerobot/ywu67/grab1-record-test")
grab2_path = Path("/Users/chenz/.cache/huggingface/lerobot/ywu67/grab2-record-test")
output_path = Path("/Users/chenz/.cache/huggingface/lerobot/ywu67/grab-merged")

# Repository IDs
repo_ids = ["ywu67/grab1-record-test", "ywu67/grab2-record-test"]
aggr_repo_id = "ywu67/grab-merged"

# Root paths for the datasets
roots = [grab1_path, grab2_path]

logging.info(f"Merging grab datasets:")
logging.info(f" Dataset 1: {grab1_path}")
logging.info(f" Dataset 2: {grab2_path}")
logging.info(f" Output: {output_path}")
logging.info(f" Merged repo ID: {aggr_repo_id}")

# Validate that both datasets exist
if not grab1_path.exists():
raise FileNotFoundError(f"Dataset grab1-record-test not found at: {grab1_path}")
if not grab2_path.exists():
raise FileNotFoundError(f"Dataset grab2-record-test not found at: {grab2_path}")

# Call the existing aggregate_datasets function
aggregate_datasets(
repo_ids=repo_ids,
aggr_repo_id=aggr_repo_id,
roots=roots,
aggr_root=output_path,
)

logging.info(f"Successfully merged datasets to: {output_path}")


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(description="Aggregate or merge LeRobot datasets")
parser.add_argument("--merge-grab", action="store_true", help="Merge grab1 and grab2 datasets")
parser.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"])

args = parser.parse_args()

# Setup logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)

if args.merge_grab:
merge_grab_datasets()
else:
parser.print_help()
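
The merge helper above hard-codes /Users/chenz/... cache paths, which ties it to one machine. A more portable variant could derive the roots from HF_LEROBOT_HOME, as the download script does. A minimal sketch under that assumption: merge_datasets is a hypothetical name, it presumes the nested cache/namespace/dataset layout visible in the hard-coded paths, and it is written as if it lived in this module so that aggregate_datasets is in scope.

from pathlib import Path

from lerobot.utils.constants import HF_LEROBOT_HOME


def merge_datasets(repo_ids: list[str], aggr_repo_id: str) -> Path:
    """Hypothetical generalization of merge_grab_datasets for any list of repos."""
    # Assumes datasets are cached under HF_LEROBOT_HOME/<namespace>/<dataset>.
    roots = [Path(HF_LEROBOT_HOME) / repo_id for repo_id in repo_ids]
    output_path = Path(HF_LEROBOT_HOME) / aggr_repo_id

    for repo_id, root in zip(repo_ids, roots):
        if not root.exists():
            raise FileNotFoundError(f"Dataset {repo_id} not found at: {root}")

    # aggregate_datasets is the function defined earlier in this module.
    aggregate_datasets(
        repo_ids=repo_ids,
        aggr_repo_id=aggr_repo_id,
        roots=roots,
        aggr_root=output_path,
    )
    return output_path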
131 changes: 131 additions & 0 deletions src/lerobot/datasets/data_processing/download_dataset_locally.py
@@ -0,0 +1,131 @@
#!/usr/bin/env python

"""
Download script for the ywu67/keychain dataset using LeRobot.
This script downloads all data and files of a Hugging Face Hub dataset.
"""

import logging
import sys
from pathlib import Path

from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.utils.constants import HF_LEROBOT_HOME

# Configure logging to both stdout and a log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('download_log.txt')
    ]
)


def download_keychain_dataset(
    repo_id: str = "ywu67/keychain",
    root_dir: str | Path | None = None,
    download_videos: bool = True,
    force_cache_sync: bool = False
):
    """
    Download a LeRobot dataset from the Hugging Face Hub (defaults to ywu67/keychain).

    Args:
        repo_id: The repository ID on the Hugging Face Hub.
        root_dir: Local directory to store the dataset (defaults to HF_LEROBOT_HOME).
        download_videos: Whether to download video files.
        force_cache_sync: Whether to force a sync and refresh local files.
    """
    if root_dir is None:
        root_dir = HF_LEROBOT_HOME / repo_id.replace("/", "_")
    else:
        root_dir = Path(root_dir)

    logging.info(f"Starting download of dataset: {repo_id}")
    logging.info(f"Download location: {root_dir}")
    logging.info(f"Download videos: {download_videos}")
    logging.info(f"Force cache sync: {force_cache_sync}")

    try:
        # Instantiating the dataset object triggers the download
        dataset = LeRobotDataset(
            repo_id=repo_id,
            root=root_dir,
            download_videos=download_videos,
            force_cache_sync=force_cache_sync
        )

        logging.info("Dataset downloaded successfully!")
        logging.info("Dataset info:")
        logging.info(f"  - Total episodes: {dataset.num_episodes}")
        logging.info(f"  - Total frames: {dataset.num_frames}")
        logging.info(f"  - FPS: {dataset.fps}")
        logging.info(f"  - Features: {list(dataset.features.keys())}")
        logging.info(f"  - Camera keys: {dataset.meta.camera_keys}")
        logging.info(f"  - Video keys: {dataset.meta.video_keys}")
        logging.info(f"  - Dataset size on disk: {get_directory_size(root_dir):.2f} MB")

        return dataset

    except Exception as e:
        logging.error(f"Error downloading dataset: {e}")
        logging.error(f"Error type: {type(e).__name__}")
        raise


def get_directory_size(path: Path) -> float:
    """Calculate the total size of a directory in MB."""
    total_size = 0
    try:
        for file_path in path.rglob('*'):
            if file_path.is_file():
                total_size += file_path.stat().st_size
        return total_size / (1024 * 1024)  # Convert bytes to MB
    except Exception as e:
        logging.warning(f"Could not calculate directory size: {e}")
        return 0.0


def main():
    """Main function to run the download script."""
    # Configuration
    REPO_ID = "ywu67/record-test30"
    DOWNLOAD_DIR = Path("./downloaded_dataset")  # Download into the current directory

    logging.info("=" * 60)
    logging.info("LeRobot Dataset Downloader")
    logging.info("=" * 60)

    try:
        # Download the dataset
        dataset = download_keychain_dataset(
            repo_id=REPO_ID,
            root_dir=DOWNLOAD_DIR,
            download_videos=True,
            force_cache_sync=False  # Set to True to re-download even if files exist
        )

        logging.info("=" * 60)
        logging.info("Download completed successfully!")
        logging.info(f"Dataset available at: {DOWNLOAD_DIR}")
        logging.info("=" * 60)

        # Print a final summary
        print(f"\nβœ“ Successfully downloaded {REPO_ID}")
        print(f"πŸ“ Location: {DOWNLOAD_DIR.absolute()}")
        print(f"πŸ“Š Episodes: {dataset.num_episodes}")
        print(f"🎬 Frames: {dataset.num_frames}")
        print(f"πŸ“Ή Videos included: {len(dataset.meta.video_keys) > 0}")

    except KeyboardInterrupt:
        logging.info("Download interrupted by user")
        print("\n⚠️ Download interrupted by user")
        sys.exit(1)
    except Exception as e:
        logging.error(f"Download failed: {e}")
        print(f"\n❌ Download failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
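
Because LeRobotDataset is a PyTorch-style dataset, a quick way to sanity-check the downloaded copy is to read one sample back. A minimal sketch, assuming indexed access returns a mapping keyed by the feature names the script logs, and that passing the same root as main() re-opens the local copy rather than re-downloading:

from pathlib import Path

from lerobot.datasets.lerobot_dataset import LeRobotDataset

# Re-open the local copy written by main() above (paths must match).
dataset = LeRobotDataset(
    repo_id="ywu67/record-test30",
    root=Path("./downloaded_dataset"),
)

# Assumption: __getitem__ returns a dict of tensors/values per frame.
sample = dataset[0]
for key, value in sample.items():
    shape = getattr(value, "shape", None)
    print(f"{key}: {shape if shape is not None else type(value).__name__}")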