Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions rust/cuvs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ doc-only = ["cuvs-sys/doc-only"]

[dependencies]
cuvs-sys = { workspace = true }
ndarray = "0.15"
thiserror = "2"
tinyvec = { version = "1", features = ["alloc"] }

[dev-dependencies]
ndarray-rand = "0.14"
ndarray = "0.17"
ndarray-rand = "0.16"

[package.metadata.docs.rs]
features = ["doc-only"]
218 changes: 188 additions & 30 deletions rust/cuvs/examples/cagra.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,220 @@
* SPDX-License-Identifier: Apache-2.0
*/

//! CAGRA example with a user-provided GPU tensor.
//!
//! This demonstrates how to feed your own device memory into cuVS by
//! implementing the public [`IntoDlTensor`]/[`IntoDlTensorMut`] traits. The
//! [`CudaTensor`] type manages device memory directly through the CUDA runtime
//! (`cudaMalloc`/`cudaFree`) and copies to/from host arrays with `cudaMemcpyAsync`
//! on the cuVS stream, reusing the resources handle's `get_cuda_stream`/
//! `sync_stream` for stream access and synchronization.
//!
//! A real application would likely rely on a helper crate such as `cudarc`
//! and its `CudaSlice`.

use std::ffi::c_void;
use std::marker::PhantomData;
use std::os::raw::c_int;

use cuvs::Resources;
use cuvs::cagra::{Index, IndexParams, SearchParams};
use cuvs::{ManagedTensor, Resources, Result};
use cuvs::dlpack::{
DLDevice, DLDeviceType, DLPackError, DLTensorView, DLTensorViewMut, DType, IntoDlTensor,
IntoDlTensorMut,
};

use ndarray::s;
use ndarray_rand::RandomExt;
use ndarray_rand::rand_distr::Uniform;

/// Example showing how to index and search data with CAGRA
fn cagra_example() -> Result<()> {
type ExampleResult<T> = std::result::Result<T, Box<dyn std::error::Error>>;

// ---------------------------------------------------------------------------
// Minimal CUDA runtime FFI
// ---------------------------------------------------------------------------

#[allow(non_camel_case_types)]
type cudaError_t = c_int;
const CUDA_SUCCESS: cudaError_t = 0;
const CUDA_MEMCPY_HOST_TO_DEVICE: c_int = 1;
const CUDA_MEMCPY_DEVICE_TO_HOST: c_int = 2;

#[link(name = "cudart")]
unsafe extern "C" {
fn cudaMalloc(ptr: *mut *mut c_void, size: usize) -> cudaError_t;
fn cudaFree(ptr: *mut c_void) -> cudaError_t;
fn cudaMemcpyAsync(
dst: *mut c_void,
src: *const c_void,
count: usize,
kind: c_int,
stream: cuvs_sys::cudaStream_t,
) -> cudaError_t;
}

fn check_cuda(status: cudaError_t) -> ExampleResult<()> {
if status == CUDA_SUCCESS {
Ok(())
} else {
Err(format!("CUDA runtime error: {status}").into())
}
}

// ---------------------------------------------------------------------------
// A custom device tensor backed by the CUDA runtime
// ---------------------------------------------------------------------------

struct CudaTensor<T: DType> {
data: *mut c_void,
shape: Vec<i64>,
bytes: usize,
_marker: PhantomData<T>,
}

impl<T: DType> CudaTensor<T> {
/// Allocate an uninitialized device buffer (used for search outputs).
fn alloc(shape: &[usize]) -> ExampleResult<Self> {
let bytes = shape.iter().product::<usize>() * std::mem::size_of::<T>();
let mut data: *mut c_void = std::ptr::null_mut();
check_cuda(unsafe { cudaMalloc(&mut data, bytes) })?;
Ok(Self {
data,
shape: shape.iter().map(|&d| d as i64).collect(),
bytes,
_marker: PhantomData,
})
}

/// Copy a contiguous host array onto the device.
fn from_host<D>(res: &Resources, host: &ndarray::ArrayRef<T, D>) -> ExampleResult<Self>
where
D: ndarray::Dimension,
{
if !host.is_standard_layout() {
return Err("host array must be contiguous (row-major)".into());
}
let tensor = Self::alloc(host.shape())?;

let stream = res.get_cuda_stream()?;
check_cuda(unsafe {
cudaMemcpyAsync(
tensor.data,
host.as_ptr() as *const c_void,
tensor.bytes,
CUDA_MEMCPY_HOST_TO_DEVICE,
stream,
)
})?;
res.sync_stream()?;

Ok(tensor)
}

/// Copy the device buffer back into a contiguous host array.
fn to_host<D>(&self, res: &Resources, host: &mut ndarray::ArrayRef<T, D>) -> ExampleResult<()>
where
D: ndarray::Dimension,
{
if !host.is_standard_layout() {
return Err("host array must be contiguous (row-major)".into());
}

let stream = res.get_cuda_stream()?;
check_cuda(unsafe {
cudaMemcpyAsync(
host.as_mut_ptr() as *mut c_void,
self.data,
self.bytes,
CUDA_MEMCPY_DEVICE_TO_HOST,
stream,
)
})?;
res.sync_stream()?;

Ok(())
}
}

impl<T: DType> Drop for CudaTensor<T> {
fn drop(&mut self) {
if !self.data.is_null() {
unsafe { cudaFree(self.data) };
}
}
}

impl<'a, T: DType> IntoDlTensor<'a> for &'a CudaTensor<T> {
fn into_dl_tensor(self) -> std::result::Result<DLTensorView<'a>, DLPackError> {
unsafe {
DLTensorView::from_raw_parts(
self.data,
DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
&self.shape,
None,
T::dl_dtype(),
)
}
}
}

impl<'a, T: DType> IntoDlTensorMut<'a> for &'a mut CudaTensor<T> {
fn into_dl_tensor_mut(self) -> std::result::Result<DLTensorViewMut<'a>, DLPackError> {
unsafe {
DLTensorViewMut::from_raw_parts(
self.data,
DLDevice { device_type: DLDeviceType::kDLCUDA, device_id: 0 },
&self.shape,
None,
T::dl_dtype(),
)
}
}
}

/// Example showing how to index and search data with CAGRA.
fn cagra_example() -> ExampleResult<()> {
let res = Resources::new()?;

// Create a new random dataset to index
// Create a new random dataset to index and copy it to the device.
let n_datapoints = 65536;
let n_features = 512;
let dataset =
ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
let dataset_host = ndarray::Array::<f32, _>::random(
(n_datapoints, n_features),
Uniform::new(0., 1.0).unwrap(),
);
let dataset = CudaTensor::from_host(&res, &dataset_host)?;

// build the cagra index
// Build the CAGRA index.
let build_params = IndexParams::new()?;
let index = Index::build(&res, &build_params, &dataset)?;
println!("Indexed {}x{} datapoints into cagra index", n_datapoints, n_features);
println!("Indexed {n_datapoints}x{n_features} datapoints into cagra index");

// use the first 4 points from the dataset as queries : will test that we get them back
// as their own nearest neighbor
// Use the first 4 points as queries; each should be its own nearest neighbor.
let n_queries = 4;
let queries = dataset.slice(s![0..n_queries, ..]);

let k = 10;
let queries_host = dataset_host.slice(s![0..n_queries, ..]).to_owned();
let queries = CudaTensor::from_host(&res, &queries_host)?;

// CAGRA search API requires queries and outputs to be on device memory
// copy query data over, and allocate new device memory for the distances/ neighbors
// outputs
let queries = ManagedTensor::from(&queries).to_device(&res)?;
let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
let neighbors = ManagedTensor::from(&neighbors_host).to_device(&res)?;

let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
let distances = ManagedTensor::from(&distances_host).to_device(&res)?;
let mut neighbors = CudaTensor::<u32>::alloc(&[n_queries, k])?;
let mut distances = CudaTensor::<f32>::alloc(&[n_queries, k])?;

let search_params = SearchParams::new()?;
index.search(&res, &search_params, &queries, &mut neighbors, &mut distances)?;

index.search(&res, &search_params, &queries, &neighbors, &distances)?;

// Copy back to host memory
distances.to_host(&res, &mut distances_host)?;
// Copy the results back to the host.
let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
neighbors.to_host(&res, &mut neighbors_host)?;
distances.to_host(&res, &mut distances_host)?;

// nearest neighbors should be themselves, since queries are from the
// dataset
println!("Neighbors {:?}", neighbors_host);
println!("Distances {:?}", distances_host);
println!("Neighbors {neighbors_host:?}");
println!("Distances {distances_host:?}");
Ok(())
}

fn main() {
if let Err(e) = cagra_example() {
println!("Failed to run CAGRA: {:?}", e);
println!("Failed to run CAGRA: {e:?}");
}
}
Loading
Loading