From 9c7da996b18dda8d5030a30c63c5140114af54f6 Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Thu, 13 Feb 2025 12:48:57 +0100 Subject: [PATCH] WIP --- .editorconfig | 3 + Cargo.toml | 8 +- README.md | 21 ++-- benches/bq_bench.rs | 106 +++++++++++++++++++ benches/my_benchmarks.rs | 1 - benches/pq_bench.rs | 83 +++++++++++++++ benches/sq_bench.rs | 107 +++++++++++++++++++ benches/utils.rs | 21 ++++ pyproject.toml | 95 +++++++++++++++++ src/logging.rs | 3 +- src/tsvq.rs | 4 +- src/utils.rs | 52 ++++++++++ src/vector.rs | 7 +- tests/distances_tests.rs | 215 +++++++++++++++++++++++++++++++++++++++ tests/vector_tests.rs | 186 +++++++++++++++++++++++++++++++++ 15 files changed, 892 insertions(+), 20 deletions(-) create mode 100644 benches/bq_bench.rs delete mode 100644 benches/my_benchmarks.rs create mode 100644 benches/pq_bench.rs create mode 100644 benches/sq_bench.rs create mode 100644 benches/utils.rs create mode 100644 pyproject.toml create mode 100644 tests/distances_tests.rs create mode 100644 tests/vector_tests.rs diff --git a/.editorconfig b/.editorconfig index b769733..000ba5a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -29,3 +29,6 @@ indent_size = 2 [*.{yaml,yml}] indent_size = 2 +# Python files +[*.py] +max_line_length = 120 \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 0ccba51..a7dc987 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "vq" -version = "0.1.1" +version = "0.1.2" description = "A vector quantization library for Rust" repository = "https://github.com/habedi/vq" license = "MIT OR Apache-2.0" readme = "README.md" -keywords = ["vq", "vector-quantization", "clustering", "nearest-neighbor", "data-compression"] +keywords = ["vector-quantization", "quantization", "nearest-neighbor", "data-compression", "embeddings"] authors = ["Hassan Abedi "] homepage = "https://github.com/habedi/vq" documentation = "https://docs.rs/vq" -#categories = ["development-tools"] +categories = ["algorithms", "compression", "data-structures"] edition = "2021" [lib] @@ -33,5 +33,5 @@ rayon = "1.10" criterion = { version = "0.5", features = ["html_reports"] } [[bench]] -name = "my_benchmarks" +name = "bq_bench" harness = false diff --git a/README.md b/README.md index c6e0718..75e0ca3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [Crates.io](https://crates.io/crates/vq) [Docs.rs](https://docs.rs/vq) [Downloads](https://crates.io/crates/vq) +
[Docs](docs) [License](https://github.com/habedi/vq) @@ -17,18 +18,18 @@ It provides a simple, efficient API for data compression that help reduce memory ## Features - Implemented Algorithms: - - [**Binary Quantization (BQ)**](src/bq.rs) - - [**Scalar Quantization (SQ)**](src/sq.rs) - - [**Product Quantization (PQ)**](https://ieeexplore.ieee.org/document/5432202) - - [**Optimized Product Quantization (OPQ)**](https://ieeexplore.ieee.org/document/6619223) - - [**Tree-structured Vector Quantization (TSVQ)**](https://ieeexplore.ieee.org/document/515493) - - [**Residual Vector Quantization (RVQ)**](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/) + - [Binary Quantization (BQ)](src/bq.rs) + - [Scalar Quantization (SQ)](src/sq.rs) + - [Product Quantization (PQ)](https://ieeexplore.ieee.org/document/5432202) + - [Optimized Product Quantization (OPQ)](https://ieeexplore.ieee.org/document/6619223) + - [Tree-structured Vector Quantization (TSVQ)](https://ieeexplore.ieee.org/document/515493) + - [Residual Vector Quantization (RVQ)](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/) - Parallelized vector operations for large vectors using [Rayon](https://crates.io/crates/rayon). -- Flexible quantization algorithm implementations that support custom distance functions (e.g., Euclidean, Cosine, - Chebyshev, etc.). -- Support for quantizing vectors of `f32` to `f16` (using [half](https://github.com/starkat99/half-rs/tree/main/src)) or `u8` data types. -- Simple and intuitive API for all quantization algorithms. +- Flexible quantization algorithm implementations that support using various distance metrics such as Euclidean, Cosine, + Manhattan distances. +- Support for quantizing vectors of `f32` to `f16` (using [half](https://crates.io/crates/half)) or `u8` data types. +- Simple, intuitive, and uniform API for all quantization algorithms. ## Installation diff --git a/benches/bq_bench.rs b/benches/bq_bench.rs new file mode 100644 index 0000000..d87c561 --- /dev/null +++ b/benches/bq_bench.rs @@ -0,0 +1,106 @@ +#[path = "utils.rs"] +mod utils; + +use crate::utils::{BENCH_TIMEOUT, NUM_VECTORS}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rayon::prelude::*; +use vq::bq::BinaryQuantizer; +use vq::vector::{Vector, PARALLEL_THRESHOLD}; + +/// Benchmark quantization on a single vector that is small enough to trigger sequential processing. +fn bench_quantize_sequential(_c: &mut Criterion) { + // Create a vector with length less than PARALLEL_THRESHOLD. + let n = PARALLEL_THRESHOLD / 2; + let data: Vec = (0..n).map(|i| (i as f32) / (n as f32)).collect(); + let vector = Vector::new(data); + let quantizer = BinaryQuantizer::new(0.5, 0, 1); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("quantize_sequential", |b| { + b.iter(|| { + let result = quantizer.quantize(black_box(&vector)); + black_box(result) + }) + }); +} + +/// Benchmark quantization on a single vector that is large enough to trigger parallel processing. +fn bench_quantize_parallel(_c: &mut Criterion) { + // Create a vector with length greater than PARALLEL_THRESHOLD. 
+ let n = PARALLEL_THRESHOLD + 1000; + let data: Vec = (0..n).map(|i| (i as f32) / (n as f32)).collect(); + let vector = Vector::new(data); + let quantizer = BinaryQuantizer::new(0.5, 0, 1); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("quantize_parallel", |b| { + b.iter(|| { + let result = quantizer.quantize(black_box(&vector)); + black_box(result) + }) + }); +} + +/// Benchmark quantization of many small vectors (each processed sequentially) using a sequential outer loop. +fn bench_quantize_multiple_vectors_sequential(_c: &mut Criterion) { + // Each vector is small enough to use sequential quantization internally. + let vector_size = PARALLEL_THRESHOLD / 2; + let vectors: Vec> = (0..NUM_VECTORS) + .map(|_| { + let data: Vec = (0..vector_size) + .map(|i| (i as f32) / (vector_size as f32)) + .collect(); + Vector::new(data) + }) + .collect(); + + let quantizer = BinaryQuantizer::new(0.5, 0, 1); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("quantize_multiple_vectors_sequential", |b| { + b.iter(|| { + let results: Vec> = vectors + .iter() + .map(|v| quantizer.quantize(black_box(v))) + .collect(); + black_box(results); + }) + }); +} + +/// Benchmark quantization of many large vectors (each using parallel quantization) +/// and process them concurrently using a parallel outer loop. +fn bench_quantize_multiple_vectors_parallel_outer(_c: &mut Criterion) { + // Each vector is large enough to use parallel quantization internally. + let vector_size = PARALLEL_THRESHOLD + 100; + let vectors: Vec> = (0..NUM_VECTORS) + .map(|_| { + let data: Vec = (0..vector_size) + .map(|i| (i as f32) / (vector_size as f32)) + .collect(); + Vector::new(data) + }) + .collect(); + + let quantizer = BinaryQuantizer::new(0.5, 0, 1); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("quantize_multiple_vectors_parallel_outer", |b| { + b.iter(|| { + let results: Vec> = vectors + .par_iter() + .map(|v| quantizer.quantize(black_box(v))) + .collect(); + black_box(results); + }) + }); +} + +criterion_group!( + benches, + bench_quantize_sequential, + bench_quantize_parallel, + bench_quantize_multiple_vectors_sequential, + bench_quantize_multiple_vectors_parallel_outer +); +criterion_main!(benches); diff --git a/benches/my_benchmarks.rs b/benches/my_benchmarks.rs deleted file mode 100644 index 8b13789..0000000 --- a/benches/my_benchmarks.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/benches/pq_bench.rs b/benches/pq_bench.rs new file mode 100644 index 0000000..ad5612d --- /dev/null +++ b/benches/pq_bench.rs @@ -0,0 +1,83 @@ +#[path = "utils.rs"] +mod utils; + +use crate::utils::{ + generate_training_data, BENCH_TIMEOUT, DIM, K, M, MAX_ITERS, NUM_VECTORS, SEED, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rayon::prelude::*; +use vq::distances::Distance; +use vq::pq::ProductQuantizer; +use vq::vector::Vector; + +/// Benchmark the construction of a ProductQuantizer using LBG quantization over training data. +fn bench_pq_construction(_c: &mut Criterion) { + // Generate synthetic training data. + let training_data = generate_training_data(NUM_VECTORS, DIM); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("pq_construction", |b| { + b.iter(|| { + // Measure the time to construct the quantizer. 
+ let pq = ProductQuantizer::new( + black_box(&training_data), + M, + K, + MAX_ITERS, + Distance::Euclidean, + SEED, + ); + black_box(pq) + }) + }); +} + +/// Benchmark quantizing a single vector using an already constructed ProductQuantizer. +fn bench_pq_quantize_single(_c: &mut Criterion) { + let training_data = generate_training_data(NUM_VECTORS, DIM); + + let pq = ProductQuantizer::new(&training_data, M, K, MAX_ITERS, Distance::Euclidean, SEED); + + // Create a test vector (must have dimension m * (dim/m) = 64). + let test_vector = Vector::new((0..DIM).map(|i| (i as f32) / (DIM as f32)).collect()); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("pq_quantize_single_vector", |b| { + b.iter(|| { + let result = pq.quantize(black_box(&test_vector)); + black_box(result) + }) + }); +} + +/// Benchmark quantizing a batch of vectors. +fn bench_pq_quantize_multiple_vectors(_c: &mut Criterion) { + let training_data = generate_training_data(NUM_VECTORS, DIM); + + let pq = ProductQuantizer::new(&training_data, M, K, MAX_ITERS, Distance::Euclidean, SEED); + + // Generate a batch of test vectors. + let test_vectors: Vec> = (0..NUM_VECTORS) + .map(|_| Vector::new((0..DIM).map(|i| (i as f32) / (DIM as f32)).collect())) + .collect(); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("pq_quantize_multiple_vectors", |b| { + b.iter(|| { + // Quantize each vector in the batch. + let results: Vec<_> = test_vectors + .iter() + .map(|v| pq.quantize(black_box(v))) + .collect(); + black_box(results); + }) + }); +} + +criterion_group!( + benches, + bench_pq_construction, + bench_pq_quantize_single, + bench_pq_quantize_multiple_vectors +); +criterion_main!(benches); diff --git a/benches/sq_bench.rs b/benches/sq_bench.rs new file mode 100644 index 0000000..9ac02c0 --- /dev/null +++ b/benches/sq_bench.rs @@ -0,0 +1,107 @@ +#[path = "utils.rs"] +mod utils; + +use crate::utils::{BENCH_TIMEOUT, NUM_VECTORS}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rayon::prelude::*; +use vq::sq::ScalarQuantizer; +use vq::vector::{Vector, PARALLEL_THRESHOLD}; + +/// Benchmark quantization on a single vector that is small enough to trigger sequential processing. +fn bench_sq_quantize_sequential(_c: &mut Criterion) { + // Create a vector with length less than SQ_PARALLEL_THRESHOLD. + let n = PARALLEL_THRESHOLD / 2; + let data: Vec = (0..n).map(|i| (i as f32) / (n as f32)).collect(); + let vector = Vector::new(data); + // Configure the quantizer with a range from 0.0 to 1.0 and 256 levels. + let quantizer = ScalarQuantizer::new(0.0, 1.0, 256); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("sq_quantize_sequential", |b| { + b.iter(|| { + let result = quantizer.quantize(black_box(&vector)); + black_box(result) + }) + }); +} + +/// Benchmark quantization on a single vector that is large enough to trigger parallel processing. +fn bench_sq_quantize_parallel(_c: &mut Criterion) { + // Create a vector with length greater than SQ_PARALLEL_THRESHOLD. 
+ let n = PARALLEL_THRESHOLD + 1000; + let data: Vec = (0..n).map(|i| (i as f32) / (n as f32)).collect(); + let vector = Vector::new(data); + let quantizer = ScalarQuantizer::new(0.0, 1.0, 256); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("sq_quantize_parallel", |b| { + b.iter(|| { + let result = quantizer.quantize(black_box(&vector)); + black_box(result) + }) + }); +} + +/// Benchmark quantization of many small vectors (each processed sequentially) using a sequential outer loop. +fn bench_sq_quantize_multiple_vectors_sequential(_c: &mut Criterion) { + // Each vector is small enough to be processed sequentially. + let vector_size = PARALLEL_THRESHOLD / 2; + let vectors: Vec> = (0..NUM_VECTORS) + .map(|_| { + let data: Vec = (0..vector_size) + .map(|i| (i as f32) / (vector_size as f32)) + .collect(); + Vector::new(data) + }) + .collect(); + + let quantizer = ScalarQuantizer::new(0.0, 1.0, 256); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("sq_quantize_multiple_vectors_sequential", |b| { + b.iter(|| { + let results: Vec> = vectors + .iter() + .map(|v| quantizer.quantize(black_box(v))) + .collect(); + black_box(results); + }) + }); +} + +/// Benchmark quantization of many large vectors (each using parallel quantization) +/// and process them concurrently using a parallel outer loop. +fn bench_sq_quantize_multiple_vectors_parallel_outer(_c: &mut Criterion) { + // Each vector is large enough to trigger parallel quantization internally. + let vector_size = PARALLEL_THRESHOLD + 100; + let vectors: Vec> = (0..NUM_VECTORS) + .map(|_| { + let data: Vec = (0..vector_size) + .map(|i| (i as f32) / (vector_size as f32)) + .collect(); + Vector::new(data) + }) + .collect(); + + let quantizer = ScalarQuantizer::new(0.0, 1.0, 256); + + let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT); + cc.bench_function("sq_quantize_multiple_vectors_parallel_outer", |b| { + b.iter(|| { + let results: Vec> = vectors + .par_iter() + .map(|v| quantizer.quantize(black_box(v))) + .collect(); + black_box(results); + }) + }); +} + +criterion_group!( + benches, + bench_sq_quantize_sequential, + bench_sq_quantize_parallel, + bench_sq_quantize_multiple_vectors_sequential, + bench_sq_quantize_multiple_vectors_parallel_outer +); +criterion_main!(benches); diff --git a/benches/utils.rs b/benches/utils.rs new file mode 100644 index 0000000..cddc144 --- /dev/null +++ b/benches/utils.rs @@ -0,0 +1,21 @@ +#![allow(dead_code)] + +use vq::vector::Vector; + +pub const NUM_VECTORS: usize = 10; +pub const BENCH_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10); +pub const DIM: usize = 64; +pub const M: usize = 4; +pub const K: usize = 16; +pub const MAX_ITERS: usize = 10; +pub const SEED: u64 = 42; + +/// Generates a synthetic training dataset of `num` vectors, each of dimension `dim`. 
+pub fn generate_training_data(num: usize, dim: usize) -> Vec> { + (0..num) + .map(|_| { + let data: Vec = (0..dim).map(|i| (i as f32) / (dim as f32)).collect(); + Vector::new(data) + }) + .collect() +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9183ab2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,95 @@ +[tool.poetry] +name = "vq" +version = "0.1.0" +description = "A vector quantization library for Rust" +authors = ["Hassan Abedi "] +maintainers = ["Hassan Abedi "] +readme = "README.md" +repository = "https://github.com/habedi/vq" +license = "MIT OR Apache-2.0" +#packages = [{ include = "src", from = "." }] + +[tool.poetry.dependencies] +python = "^3.11" + +[tool.poetry.group.dev.dependencies] +poetry-dynamic-versioning = "^1.4.0" +pytest = "^8.0.1" +pytest-cov = "^6.0.0" +pytest-mock = "^3.14.0" +mypy = "^1.11.1" +ruff = "^0.9.3" + +#[tool.poetry.scripts] +#cli_script = "src.cli:main" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [".", "src", 'bin', 'notebooks'] + +[tool.mypy] +python_version = "3.11" +ignore_missing_imports = true +disallow_untyped_calls = true +strict_optional = true +warn_redundant_casts = true + +[tool.poetry-dynamic-versioning] +enable = true +vcs = "git" +versioning = "semver" # Semantic Versioning + +# Ruff configuration +[tool.ruff] +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv" +] +line-length = 120 +indent-width = 4 +src = ["src", "tests", "bin", "notebooks"] +target-version = "py311" + +[tool.ruff.lint] +select = ["ANN", "D", "E", "F", "I"] +ignore = [ + # Ignore missing docstrings + "D100", "D101", "D102", "D103", "D104", "D105", "D106", "D107", +] +fixable = ["ALL"] +unfixable = [] +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [] diff --git a/src/logging.rs b/src/logging.rs index 3dc36df..3d69bed 100644 --- a/src/logging.rs +++ b/src/logging.rs @@ -1,7 +1,6 @@ use ctor::ctor; use std::env; use tracing::Level; -use tracing_subscriber; #[ctor] fn set_debug_level() { @@ -12,7 +11,7 @@ fn set_debug_level() { // Normalize the string for case-insensitive comparison. let v = v.trim().to_lowercase(); // Consider these values as "false". - !(v == "0" || v == "false" || v == "no" || v == "off" || v == "") + !(v == "0" || v == "false" || v == "no" || v == "off" || v.is_empty()) }) .unwrap_or(false); diff --git a/src/tsvq.rs b/src/tsvq.rs index 572e90b..d51b829 100644 --- a/src/tsvq.rs +++ b/src/tsvq.rs @@ -7,7 +7,7 @@ use rayon::prelude::*; /// /// Each node holds a centroid (the mean of the training data at that node) /// and optionally left/right child nodes representing further splits. -pub struct TSVQNode { +struct TSVQNode { /// The centroid of the training data at this node. pub centroid: Vector, /// Left subtree (if any). @@ -143,7 +143,7 @@ impl TSVQNode { /// (centroid) of its data, and leaf nodes provide the final quantized representations. pub struct TSVQ { /// The root node of the TSVQ tree. 
- pub root: TSVQNode, + root: TSVQNode, /// The distance metric used for traversing the tree. pub distance: Distance, } diff --git a/src/utils.rs b/src/utils.rs index c374c51..46dbc10 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -74,3 +74,55 @@ pub fn lbg_quantize( } centroids } + +#[cfg(test)] +mod tests { + use super::*; + use crate::vector::Vector; + + // Helper function to create the test data. + fn get_data() -> Vec> { + vec![ + Vector::new(vec![1.0, 2.0]), + Vector::new(vec![2.0, 3.0]), + Vector::new(vec![3.0, 4.0]), + Vector::new(vec![4.0, 5.0]), + ] + } + + #[test] + fn lbg_quantize_basic_functionality() { + let data = get_data(); + let centroids = lbg_quantize(&data, 2, 10, 42); + assert_eq!(centroids.len(), 2); + } + + #[test] + #[should_panic(expected = "k must be greater than 0")] + fn lbg_quantize_k_zero() { + let data = vec![Vector::new(vec![1.0, 2.0]), Vector::new(vec![2.0, 3.0])]; + lbg_quantize(&data, 0, 10, 42); + } + + #[test] + #[should_panic(expected = "Not enough data points for k clusters")] + fn lbg_quantize_not_enough_data_points() { + let data = vec![Vector::new(vec![1.0, 2.0])]; + lbg_quantize(&data, 2, 10, 42); + } + + #[test] + fn lbg_quantize_single_data_point() { + let data = vec![Vector::new(vec![1.0, 2.0])]; + let centroids = lbg_quantize(&data, 1, 10, 42); + assert_eq!(centroids.len(), 1); + assert_eq!(centroids[0], Vector::new(vec![1.0, 2.0])); + } + + #[test] + fn lbg_quantize_multiple_iterations() { + let data = get_data(); + let centroids = lbg_quantize(&data, 2, 100, 42); + assert_eq!(centroids.len(), 2); + } +} diff --git a/src/vector.rs b/src/vector.rs index f2d90f6..18e968f 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -3,7 +3,7 @@ use rayon::prelude::*; use std::fmt; use std::ops::{Add, Div, Mul, Sub}; -// Size threshold for enabling parallel computation. +/// Size threshold for enabling parallel computation. pub const PARALLEL_THRESHOLD: usize = 1024; /// Abstraction for real numbers. @@ -152,6 +152,11 @@ impl Vector { self.data.len() } + // Returns true if the vector is empty. + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + /// Returns a slice of the data. pub fn data(&self) -> &[T] { &self.data diff --git a/tests/distances_tests.rs b/tests/distances_tests.rs new file mode 100644 index 0000000..74f2572 --- /dev/null +++ b/tests/distances_tests.rs @@ -0,0 +1,215 @@ +#[path = "utils.rs"] +mod utils; + +use vq::distances::Distance; +use vq::vector::PARALLEL_THRESHOLD; + +// We'll test using f32 as our Real type. + +// A helper function to compare two floating point numbers within a given tolerance. +fn approx_eq(a: f32, b: f32, eps: f32) -> bool { + (a - b).abs() < eps +} + +// ---------------------------- +// Squared Euclidean Distance +// ---------------------------- +#[test] +fn test_squared_euclidean_sequential() { + let a = vec![1.0f32, 2.0, 3.0]; + let b = vec![4.0f32, 6.0, 8.0]; + // (1-4)² + (2-6)² + (3-8)² = 9 + 16 + 25 = 50 + let d = Distance::SquaredEuclidean; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 50.0, 1e-6)); +} + +#[test] +fn test_squared_euclidean_parallel() { + let len = PARALLEL_THRESHOLD + 10; + // Each difference is (i - (i+1)) = -1 so square is 1. 
+ let a: Vec = (0..len).map(|i| i as f32).collect(); + let b: Vec = (0..len).map(|i| (i as f32) + 1.0).collect(); + let d = Distance::SquaredEuclidean; + let result = d.compute(&a, &b); + assert!(approx_eq(result, len as f32, 1e-6)); +} + +// ---------------------------- +// Euclidean Distance +// ---------------------------- +#[test] +fn test_euclidean_sequential() { + let a = vec![1.0f32, 2.0, 3.0]; + let b = vec![4.0f32, 6.0, 8.0]; + // Squared distance is 50, so Euclidean distance = sqrt(50) + let expected = 50.0f32.sqrt(); + let d = Distance::Euclidean; + let result = d.compute(&a, &b); + assert!(approx_eq(result, expected, 1e-6)); +} + +#[test] +fn test_euclidean_parallel() { + let len = PARALLEL_THRESHOLD + 10; + let a: Vec = (0..len).map(|i| i as f32).collect(); + let b: Vec = (0..len).map(|i| (i as f32) + 1.0).collect(); + // Each pair differs by 1 so squared differences add to len. + let expected = (len as f32).sqrt(); + let d = Distance::Euclidean; + let result = d.compute(&a, &b); + assert!(approx_eq(result, expected, 1e-6)); +} + +// ---------------------------- +// Cosine Distance +// ---------------------------- +#[test] +fn test_cosine_distance_sequential() { + // Orthogonal vectors: cosine similarity = 0, so distance = 1. + let a = vec![1.0f32, 0.0]; + let b = vec![0.0f32, 1.0]; + let d = Distance::CosineDistance; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 1.0, 1e-6)); + + // Identical vectors: cosine similarity = 1, so distance = 0. + let a = vec![1.0f32, 1.0]; + let b = vec![1.0f32, 1.0]; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 0.0, 1e-6)); +} + +#[test] +fn test_cosine_distance_parallel() { + let len = PARALLEL_THRESHOLD + 10; + // Use identical vectors so that cosine similarity is 1 and distance is 0. + let a = vec![1.0f32; len]; + let b = vec![1.0f32; len]; + let d = Distance::CosineDistance; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 0.0, 1e-6)); +} + +// ---------------------------- +// Manhattan Distance +// ---------------------------- +#[test] +fn test_manhattan_sequential() { + let a = vec![1.0f32, 2.0, 3.0]; + let b = vec![4.0f32, 6.0, 8.0]; + // |1-4| + |2-6| + |3-8| = 3 + 4 + 5 = 12 + let d = Distance::Manhattan; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 12.0, 1e-6)); +} + +#[test] +fn test_manhattan_parallel() { + let len = PARALLEL_THRESHOLD + 10; + let a: Vec = (0..len).map(|i| i as f32).collect(); + let b: Vec = (0..len).map(|i| (i as f32) + 2.0).collect(); + // Each difference is 2, so sum = 2 * len. + let expected = 2.0 * (len as f32); + let d = Distance::Manhattan; + let result = d.compute(&a, &b); + assert!(approx_eq(result, expected, 1e-6)); +} + +// ---------------------------- +// Chebyshev Distance +// ---------------------------- +#[test] +fn test_chebyshev_sequential() { + let a = vec![1.0f32, 5.0, 3.0]; + let b = vec![4.0f32, 2.0, 9.0]; + // Differences: |1-4|=3, |5-2|=3, |3-9|=6, so maximum is 6. + let d = Distance::Chebyshev; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 6.0, 1e-6)); +} + +#[test] +fn test_chebyshev_parallel() { + let len = PARALLEL_THRESHOLD + 10; + // Create two vectors with nearly identical values except one coordinate. + let mut a: Vec = (0..len).map(|i| i as f32).collect(); + let mut b: Vec = (0..len).map(|i| i as f32).collect(); + // Introduce a large difference at the last element. 
+ a[len - 1] = 1000.0; + b[len - 1] = 0.0; + let d = Distance::Chebyshev; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 1000.0, 1e-6)); +} + +// ---------------------------- +// Minkowski Distance (p = 3) +// ---------------------------- +#[test] +fn test_minkowski_sequential() { + let a = vec![1.0f32, 2.0, 3.0]; + let b = vec![4.0f32, 6.0, 8.0]; + // For p = 3: + // |1-4|^3 = 27, |2-6|^3 = 64, |3-8|^3 = 125, sum = 216, cube root = 6. + let d = Distance::Minkowski(3.0); + let result = d.compute(&a, &b); + assert!(approx_eq(result, 6.0, 1e-6)); +} + +#[test] +fn test_minkowski_parallel() { + let p = 3.0; + let d = Distance::Minkowski(p); + let len = PARALLEL_THRESHOLD + 10; + let a: Vec = (0..len).map(|i| i as f32).collect(); + let b: Vec = (0..len).map(|i| (i as f32) + 1.0).collect(); + // Each difference is 1: |1|^3 = 1. Sum = len, then result = len^(1/3) + let expected = (len as f32).powf(1.0 / 3.0); + let result = d.compute(&a, &b); + assert!(approx_eq(result, expected, 1e-6)); +} + +// ---------------------------- +// Hamming Distance +// ---------------------------- +#[test] +fn test_hamming_sequential() { + let a = vec![1.0f32, 2.0, 3.0, 4.0]; + let b = vec![1.0f32, 0.0, 3.0, 0.0]; + // Differences occur at index 1 and 3, so count = 2. + let d = Distance::Hamming; + let result = d.compute(&a, &b); + assert!(approx_eq(result, 2.0, 1e-6)); +} + +#[test] +fn test_hamming_parallel() { + let len = PARALLEL_THRESHOLD + 10; + let a: Vec = vec![1.0f32; len]; + // Make b differ on every odd index. + let b: Vec = (0..len) + .map(|i| if i % 2 == 0 { 1.0f32 } else { 0.0f32 }) + .collect(); + // Expected differences: about half the indices. + let expected = if len % 2 == 0 { + (len / 2) as f32 + } else { + ((len / 2) + 1) as f32 + }; + let d = Distance::Hamming; + let result = d.compute(&a, &b); + assert!(approx_eq(result, expected, 1e-6)); +} + +// ---------------------------- +// Mismatched Lengths +// ---------------------------- +#[test] +#[should_panic(expected = "Input slices must have the same length")] +fn test_compute_mismatched_lengths() { + let a = vec![1.0f32, 2.0]; + let b = vec![1.0f32]; + let d = Distance::Euclidean; + let _ = d.compute(&a, &b); +} diff --git a/tests/vector_tests.rs b/tests/vector_tests.rs new file mode 100644 index 0000000..a027a77 --- /dev/null +++ b/tests/vector_tests.rs @@ -0,0 +1,186 @@ +#[path = "utils.rs"] +mod utils; + +use half::{bf16, f16}; +use vq::vector::{mean_vector, Vector, PARALLEL_THRESHOLD}; + +// A small helper to compare floating point numbers with an epsilon. +fn approx_eq(a: f32, b: f32, eps: f32) -> bool { + (a - b).abs() < eps +} + +#[test] +fn test_addition() { + let a = Vector::new(vec![1.0f32, 2.0, 3.0]); + let b = Vector::new(vec![4.0f32, 5.0, 6.0]); + let result = &a + &b; + assert_eq!(result.data, vec![5.0, 7.0, 9.0]); +} + +#[test] +fn test_subtraction() { + let a = Vector::new(vec![4.0f32, 5.0, 6.0]); + let b = Vector::new(vec![1.0f32, 2.0, 3.0]); + let result = &a - &b; + assert_eq!(result.data, vec![3.0, 3.0, 3.0]); +} + +#[test] +fn test_scalar_multiplication() { + let a = Vector::new(vec![1.0f32, 2.0, 3.0]); + let result = &a * 2.0f32; + assert_eq!(result.data, vec![2.0, 4.0, 6.0]); +} + +#[test] +fn test_dot_product_sequential() { + // Use a small vector to force sequential dot product. 
+ let a = Vector::new(vec![1.0f32, 2.0, 3.0]); + let b = Vector::new(vec![4.0f32, 5.0, 6.0]); + // 1*4 + 2*5 + 3*6 = 32 + let dot = a.dot(&b); + assert!(approx_eq(dot, 32.0, 1e-6)); +} + +#[test] +fn test_dot_product_parallel() { + // Create vectors longer than PARALLEL_THRESHOLD so that parallel code is used. + let len = PARALLEL_THRESHOLD + 1; + let a = Vector::new((0..len).map(|i| i as f32).collect()); + let b = Vector::new((0..len).map(|i| (i as f32) * 2.0).collect()); + // dot = 2 * sum(i^2) for i in 0..len + let expected: f32 = 2.0 * (0..len).map(|i| (i as f32).powi(2)).sum::(); + let dot = a.dot(&b); + println!("Expected: {}, Actual: {}", expected, dot); + assert!(approx_eq(dot, expected, 1e3)); // Using a larger epsilon due to error accumulation. +} + +#[test] +fn test_norm() { + // For a vector [3,4], norm should be 5. + let a = Vector::new(vec![3.0f32, 4.0]); + let norm = a.norm(); + assert!(approx_eq(norm, 5.0, 1e-6)); +} + +#[test] +fn test_distance2() { + // Distance squared between [1,2,3] and [4,5,6]: + // (1-4)^2 + (2-5)^2 + (3-6)^2 = 9 + 9 + 9 = 27 + let a = Vector::new(vec![1.0f32, 2.0, 3.0]); + let b = Vector::new(vec![4.0f32, 5.0, 6.0]); + let dist2 = a.distance2(&b); + assert!(approx_eq(dist2, 27.0, 1e-6)); +} + +#[test] +fn test_mean_vector_sequential() { + let vectors = vec![ + Vector::new(vec![1.0f32, 2.0, 3.0]), + Vector::new(vec![4.0f32, 5.0, 6.0]), + Vector::new(vec![7.0f32, 8.0, 9.0]), + ]; + let mean = mean_vector(&vectors); + // Expected mean: ([1+4+7, 2+5+8, 3+6+9] / 3) = [4, 5, 6] + assert!(approx_eq(mean.data[0], 4.0, 1e-6)); + assert!(approx_eq(mean.data[1], 5.0, 1e-6)); + assert!(approx_eq(mean.data[2], 6.0, 1e-6)); +} + +#[test] +fn test_mean_vector_parallel() { + // Create more than PARALLEL_THRESHOLD vectors of identical content. + let n = PARALLEL_THRESHOLD + 10; + let dim = 5; + let vectors: Vec<_> = (0..n) + .map(|_| Vector::new((0..dim).map(|i| i as f32).collect())) + .collect(); + let mean = mean_vector(&vectors); + // Each vector is the same so the mean should be identical. + let expected: Vec = (0..dim).map(|i| i as f32).collect(); + for (m, e) in mean.data.iter().zip(expected.iter()) { + assert!(approx_eq(*m, *e, 1e-6)); + } +} + +#[test] +#[should_panic(expected = "Vectors must be same length")] +fn test_addition_mismatched_dimensions() { + let a = Vector::new(vec![1.0f32, 2.0]); + let b = Vector::new(vec![1.0f32, 2.0, 3.0]); + let _ = &a + &b; +} + +#[test] +#[should_panic(expected = "Vectors must be same length")] +fn test_dot_product_mismatched_dimensions() { + let a = Vector::new(vec![1.0f32, 2.0]); + let b = Vector::new(vec![1.0f32]); + let _ = a.dot(&b); +} + +#[test] +#[should_panic(expected = "Cannot compute mean of empty slice")] +fn test_mean_vector_empty() { + let vectors: Vec> = vec![]; + let _ = mean_vector(&vectors); +} + +#[test] +fn test_display() { + let a = Vector::new(vec![1.0f32, 2.0, 3.0]); + let s = format!("{}", a); + // Check that the string starts with "Vector [" and ends with "]" + assert!(s.starts_with("Vector [")); + assert!(s.ends_with("]")); +} + +// --- Tests using other Real types --- + +#[test] +fn test_f64_operations() { + let a = Vector::new(vec![1.0f64, 2.0, 3.0]); + let b = Vector::new(vec![4.0f64, 5.0, 6.0]); + let dot = a.dot(&b); + assert!((dot - 32.0).abs() < 1e-6); + let norm = a.norm(); + let expected_norm = (1.0f64 * 1.0 + 2.0 * 2.0 + 3.0 * 3.0).sqrt(); + assert!((norm - expected_norm).abs() < 1e-6); +} + +#[test] +fn test_f16_operations() { + // Test basic operations using half-precision (f16). 
+ let a = Vector::new(vec![ + f16::from_f32(1.0), + f16::from_f32(2.0), + f16::from_f32(3.0), + ]); + let b = Vector::new(vec![ + f16::from_f32(4.0), + f16::from_f32(5.0), + f16::from_f32(6.0), + ]); + let dot = a.dot(&b); + // Convert dot to f32 for comparison. + let dot_f32 = f32::from(dot); + assert!((dot_f32 - 32.0).abs() < 1e-1); +} + +#[test] +fn test_bf16_operations() { + // Test basic operations using bf16. + let a = Vector::new(vec![ + bf16::from_f32(1.0), + bf16::from_f32(2.0), + bf16::from_f32(3.0), + ]); + let b = Vector::new(vec![ + bf16::from_f32(4.0), + bf16::from_f32(5.0), + bf16::from_f32(6.0), + ]); + let dot = a.dot(&b); + let dot_f32 = f32::from(dot); + assert!((dot_f32 - 32.0).abs() < 1e-1); +}
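
For reference, a minimal usage sketch of the API exercised by the new benchmarks and tests. The constructor arguments mirror the values used in `benches/bq_bench.rs`, `benches/sq_bench.rs`, and `benches/pq_bench.rs`; the example `main`, the literal PQ parameters (2 sub-quantizers, 4 centroids, 10 iterations, seed 42), and the synthetic training data are illustrative only, and the concrete return types of `quantize` are left to type inference since they are not visible in this patch.

```rust
use vq::bq::BinaryQuantizer;
use vq::distances::Distance;
use vq::pq::ProductQuantizer;
use vq::sq::ScalarQuantizer;
use vq::vector::Vector;

fn main() {
    // Sketch only: mirrors the constructor calls exercised in the new benchmarks.
    // An input vector with values spread over [0, 1).
    let input = Vector::new((0..8).map(|i| i as f32 / 8.0).collect());

    // Binary quantization: threshold 0.5, mapping each value to the codes 0 or 1.
    let bq = BinaryQuantizer::new(0.5, 0, 1);
    let _bq_codes = bq.quantize(&input);

    // Scalar quantization: 256 levels over the range [0.0, 1.0].
    let sq = ScalarQuantizer::new(0.0, 1.0, 256);
    let _sq_codes = sq.quantize(&input);

    // Product quantization: 2 sub-quantizers with 4 centroids each, trained with
    // at most 10 LBG iterations, Euclidean distance, and a fixed seed.
    let training: Vec<Vector<f32>> = (0..32)
        .map(|j| Vector::new((0..8).map(|i| ((i + j) % 8) as f32 / 8.0).collect()))
        .collect();
    let pq = ProductQuantizer::new(&training, 2, 4, 10, Distance::Euclidean, 42);
    let _pq_codes = pq.quantize(&input);
}
```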