NVIDIA · rapids-bot · May 13, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
@@ -113,22 +113,8 @@ The steps below demonstrate how to download, install, and run benchmarks on a su
     # (1) Prepare dataset.
     python -m cuvs_bench.get_dataset --dataset deep-image-96-angular --normalize
 
-.. code-block:: python
-
     # (2) Build and search index.
-    from cuvs_bench.orchestrator import BenchmarkOrchestrator
-
-    orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench")
-    results = orchestrator.run_benchmark(
-        dataset="deep-image-96-inner",
-        algorithms="cuvs_cagra",
-        count=10,
-        batch_size=10,
-        build=True,
-        search=True,
-    )
-
-.. code-block:: bash
+    python -m cuvs_bench.run --dataset deep-image-96-inner --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search
 
     # (3) Export data.
     python -m cuvs_bench.run --data-export --dataset deep-image-96-inner
@@ -210,22 +196,10 @@ The steps below demonstrate how to download, install, and run benchmarks on a su
     python -m cuvs_bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
     # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
-.. code-block:: python
+.. code-block:: bash
 
     # (2) Build and search index.
-    from cuvs_bench.orchestrator import BenchmarkOrchestrator
-
-    orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench")
-    results = orchestrator.run_benchmark(
-        dataset="deep-1B",
-        algorithms="cuvs_cagra",
-        count=10,
-        batch_size=10,
-        build=True,
-        search=True,
-    )
-
-.. code-block:: bash
+    python -m cuvs_bench.run --dataset deep-1B --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search
 
     # (3) Export data.
     python -m cuvs_bench.run --data-export --dataset deep-1B

@@ -18,18 +18,28 @@
     from ..orchestrator.config_loaders import IndexConfig
 
 
-@dataclass
 class Dataset:
     """
     Dataset representation for benchmarking.
 
-    Attributes
+    Supports two usage patterns:
+
+    1. File-path based (C++ backend): The orchestrator passes file paths
+       and the backend reads them directly via ``base_file``, ``query_file``,
+       etc. Vectors are never loaded into Python.
+
+    2. Array based (Python-native backends like OpenSearch, Elasticsearch):
+       Backends access ``base_vectors``, ``query_vectors``, etc. If vectors
+       were not provided directly but a file path exists, they are loaded
+       lazily on first access. This keeps file I/O invisible to backends.
+
+    Parameters
     ----------
     name : str
         Dataset name (e.g., "glove-100-inner")
-    base_vectors : np.ndarray
+    base_vectors : Optional[np.ndarray]
         Base vectors for index building, shape (n_vectors, dims)
-    query_vectors : np.ndarray
+    query_vectors : Optional[np.ndarray]
         Query vectors for search, shape (n_queries, dims)
     groundtruth_neighbors : Optional[np.ndarray]
         Ground truth neighbor IDs, shape (n_queries, k_gt)
@@ -38,25 +48,108 @@ class Dataset:
     distance_metric : str
         Distance metric ("euclidean", "inner_product", "cosine")
     base_file : Optional[str]
-        Path to base vectors file (for C++ backend compatibility)
+        Path to base vectors file
     query_file : Optional[str]
-        Path to query vectors file (for C++ backend compatibility)
+        Path to query vectors file
     groundtruth_neighbors_file : Optional[str]
-        Path to ground truth neighbors file (for C++ backend compatibility)
-    metadata : Dict[str, Any]
-        Additional dataset metadata like {"source": "ann-benchmarks"}
+        Path to ground truth neighbors file
+    metadata : Optional[Dict[str, Any]]
+        Extra metadata; "subset_size" (e.g. 10000) is used to load base_file
     """
 
-    name: str
-    base_vectors: np.ndarray
-    query_vectors: np.ndarray
-    groundtruth_neighbors: Optional[np.ndarray] = None
-    groundtruth_distances: Optional[np.ndarray] = None
-    distance_metric: str = "euclidean"
-    base_file: Optional[str] = None
-    query_file: Optional[str] = None
-    groundtruth_neighbors_file: Optional[str] = None
-    metadata: Dict[str, Any] = field(default_factory=dict)
+    def __init__(
+        self,
+        name: str,
+        base_vectors: Optional[np.ndarray] = None,
+        query_vectors: Optional[np.ndarray] = None,
+        groundtruth_neighbors: Optional[np.ndarray] = None,
+        groundtruth_distances: Optional[np.ndarray] = None,
+        distance_metric: str = "euclidean",
+        base_file: Optional[str] = None,
+        query_file: Optional[str] = None,
+        groundtruth_neighbors_file: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        self.name = name
+        # Vectors live in private attributes so the public properties can
+        # load them lazily. Arrays passed by the caller are used as-is; a
+        # missing base/query array becomes an empty (0, 0) placeholder, and
+        # missing ground truth neighbors stay None. While a slot is still
+        # unfilled and its *_file path is set, the matching property reads
+        # the file on access, so backends get a numpy array either way.
+        # groundtruth_distances is a plain attribute with no lazy loading.
+        self._base_vectors = (
+            base_vectors if base_vectors is not None else np.empty((0, 0))
+        )
+        self._query_vectors = (
+            query_vectors if query_vectors is not None else np.empty((0, 0))
+        )
+        self._groundtruth_neighbors = groundtruth_neighbors
+        self.groundtruth_distances = groundtruth_distances
+        self.distance_metric = distance_metric
+        self.base_file = base_file
+        self.query_file = query_file
+        self.groundtruth_neighbors_file = groundtruth_neighbors_file
+        self.metadata = metadata or {}
+
+    @property
+    def base_vectors(self) -> np.ndarray:
+        """Base vectors for index building.
+
+        If empty and base_file is set, loads it with metadata["subset_size"].
+        """
+        if self._base_vectors.size == 0 and self.base_file:
+            from .utils import load_vectors
+
+            self._base_vectors = load_vectors(
+                self.base_file, self.metadata.get("subset_size")
+            )
+        return self._base_vectors
+
+    @base_vectors.setter
+    def base_vectors(self, value: Optional[np.ndarray]) -> None:
+        """Set base vectors; None resets to the empty (0, 0) placeholder."""
+        self._base_vectors = value if value is not None else np.empty((0, 0))
+
+    @property
+    def query_vectors(self) -> np.ndarray:
+        """Query vectors for search.
+
+        If empty and query_file is set, loads it (no subset_size is applied).
+        """
+        if self._query_vectors.size == 0 and self.query_file:
+            from .utils import load_vectors
+
+            self._query_vectors = load_vectors(self.query_file)
+        return self._query_vectors
+
+    @query_vectors.setter
+    def query_vectors(self, value: Optional[np.ndarray]) -> None:
+        """Set query vectors; None resets to the empty (0, 0) placeholder."""
+        self._query_vectors = value if value is not None else np.empty((0, 0))
+
+    @property
+    def groundtruth_neighbors(self) -> Optional[np.ndarray]:
+        """Ground truth neighbor IDs, or None if unavailable.
+
+        Loaded from groundtruth_neighbors_file on first access when no
+        array was provided; stays None if that path is also unset.
+        """
+        if (
+            self._groundtruth_neighbors is None
+            and self.groundtruth_neighbors_file
+        ):
+            from .utils import load_vectors
+
+            self._groundtruth_neighbors = load_vectors(
+                self.groundtruth_neighbors_file
+            )
+        return self._groundtruth_neighbors
+
+    @groundtruth_neighbors.setter
+    def groundtruth_neighbors(self, value: Optional[np.ndarray]) -> None:
+        """Set ground truth neighbors directly (None clears them)."""
+        self._groundtruth_neighbors = value
 
     @property
     def dims(self) -> int: