WIP

habedi · Feb 13, 2025 · 071330f · 071330f
1 parent 8b0c66a
commit 071330f
Show file tree

Hide file tree

Showing 8 changed files with 474 additions and 16 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vq"
-version = "0.1.1"
+version = "0.1.2"
 description = "A vector quantization library for Rust"
 repository = "https://github.com/habedi/vq"
 license = "MIT OR Apache-2.0"

diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@
 [<img alt="Crates.io" src="https://img.shields.io/crates/v/vq.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/vq)
 [<img alt="Docs.rs" src="https://img.shields.io/badge/docs.rs-vq-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs" height="20">](https://docs.rs/vq)
 [<img alt="Downloads" src="https://img.shields.io/crates/d/vq?style=for-the-badge&labelColor=555555&logo=rust" height="20">](https://crates.io/crates/vq)
+<br>
 [<img alt="Docs" src="https://img.shields.io/badge/docs-latest-3776ab?style=for-the-badge&labelColor=555555&logo=readthedocs" height="20">](docs)
 [<img alt="License" src="https://img.shields.io/badge/license-MIT%2FApache--2.0-007ec6?style=for-the-badge&labelColor=555555&logo=open-source-initiative" height="20">](https://github.com/habedi/vq)
 
@@ -17,18 +18,18 @@ It provides a simple, efficient API for data compression that help reduce memory
 ## Features
 
 - Implemented Algorithms:
-    - [**Binary Quantization (BQ)**](src/bq.rs)
-    - [**Scalar Quantization (SQ)**](src/sq.rs)
-    - [**Product Quantization (PQ)**](https://ieeexplore.ieee.org/document/5432202)
-    - [**Optimized Product Quantization (OPQ)**](https://ieeexplore.ieee.org/document/6619223)
-    - [**Tree-structured Vector Quantization (TSVQ)**](https://ieeexplore.ieee.org/document/515493)
-    - [**Residual Vector Quantization (RVQ)**](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)
+    - [Binary Quantization (BQ)](src/bq.rs)
+    - [Scalar Quantization (SQ)](src/sq.rs)
+    - [Product Quantization (PQ)](https://ieeexplore.ieee.org/document/5432202)
+    - [Optimized Product Quantization (OPQ)](https://ieeexplore.ieee.org/document/6619223)
+    - [Tree-structured Vector Quantization (TSVQ)](https://ieeexplore.ieee.org/document/515493)
+    - [Residual Vector Quantization (RVQ)](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)
 
 - Parallelized vector operations for large vectors using [Rayon](https://crates.io/crates/rayon).
-- Flexible quantization algorithm implementations that support custom distance functions (e.g., Euclidean, Cosine,
-  Chebyshev, etc.).
-- Support for quantizing vectors of `f32` to `f16` (using [half](https://github.com/starkat99/half-rs/tree/main/src)) or `u8` data types.
-- Simple and intuitive API for all quantization algorithms.
+- Flexible quantization algorithm implementations that support various distance metrics, such as Euclidean, Cosine,
+  and Manhattan distances.
+- Support for quantizing vectors of `f32` to `f16` (using [half](https://crates.io/crates/half)) or `u8` data types.
+- Simple, intuitive, and uniform API for all quantization algorithms.
 
 ## Installation
 

diff --git a/src/logging.rs b/src/logging.rs
@@ -1,7 +1,6 @@
 use ctor::ctor;
 use std::env;
 use tracing::Level;
-use tracing_subscriber;
 
 #[ctor]
 fn set_debug_level() {
@@ -12,7 +11,7 @@ fn set_debug_level() {
             // Normalize the string for case-insensitive comparison.
             let v = v.trim().to_lowercase();
             // Consider these values as "false".
-            !(v == "0" || v == "false" || v == "no" || v == "off" || v == "")
+            !(v == "0" || v == "false" || v == "no" || v == "off" || v.is_empty())
         })
         .unwrap_or(false);
 

diff --git a/src/tsvq.rs b/src/tsvq.rs
@@ -7,7 +7,7 @@ use rayon::prelude::*;
 ///
 /// Each node holds a centroid (the mean of the training data at that node)
 /// and optionally left/right child nodes representing further splits.
-pub struct TSVQNode {
+struct TSVQNode {
     /// The centroid of the training data at this node.
     pub centroid: Vector<f32>,
     /// Left subtree (if any).
@@ -143,7 +143,7 @@ impl TSVQNode {
 /// (centroid) of its data, and leaf nodes provide the final quantized representations.
 pub struct TSVQ {
     /// The root node of the TSVQ tree.
-    pub root: TSVQNode,
+    root: TSVQNode,
     /// The distance metric used for traversing the tree.
     pub distance: Distance,
 }

diff --git a/src/utils.rs b/src/utils.rs
@@ -74,3 +74,55 @@ pub fn lbg_quantize(
     }
     centroids
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::vector::Vector;
+
+    // Helper function to create the test data.
+    fn get_data() -> Vec<Vector<f32>> {
+        vec![
+            Vector::new(vec![1.0, 2.0]),
+            Vector::new(vec![2.0, 3.0]),
+            Vector::new(vec![3.0, 4.0]),
+            Vector::new(vec![4.0, 5.0]),
+        ]
+    }
+
+    #[test]
+    fn lbg_quantize_basic_functionality() {
+        let data = get_data();
+        let centroids = lbg_quantize(&data, 2, 10, 42);
+        assert_eq!(centroids.len(), 2);
+    }
+
+    #[test]
+    #[should_panic(expected = "k must be greater than 0")]
+    fn lbg_quantize_k_zero() {
+        let data = vec![Vector::new(vec![1.0, 2.0]), Vector::new(vec![2.0, 3.0])];
+        lbg_quantize(&data, 0, 10, 42);
+    }
+
+    #[test]
+    #[should_panic(expected = "Not enough data points for k clusters")]
+    fn lbg_quantize_not_enough_data_points() {
+        let data = vec![Vector::new(vec![1.0, 2.0])];
+        lbg_quantize(&data, 2, 10, 42);
+    }
+
+    #[test]
+    fn lbg_quantize_single_data_point() {
+        let data = vec![Vector::new(vec![1.0, 2.0])];
+        let centroids = lbg_quantize(&data, 1, 10, 42);
+        assert_eq!(centroids.len(), 1);
+        assert_eq!(centroids[0], Vector::new(vec![1.0, 2.0]));
+    }
+
+    #[test]
+    fn lbg_quantize_multiple_iterations() {
+        let data = get_data();
+        let centroids = lbg_quantize(&data, 2, 100, 42);
+        assert_eq!(centroids.len(), 2);
+    }
+}
diff --git a/src/vector.rs b/src/vector.rs
@@ -3,7 +3,7 @@ use rayon::prelude::*;
 use std::fmt;
 use std::ops::{Add, Div, Mul, Sub};
 
-// Size threshold for enabling parallel computation.
+/// Size threshold for enabling parallel computation.
 pub const PARALLEL_THRESHOLD: usize = 1024;
 
 /// Abstraction for real numbers.
@@ -152,6 +152,11 @@ impl<T: Real> Vector<T> {
         self.data.len()
     }
 
+    /// Returns `true` if the vector contains no elements.
+    pub fn is_empty(&self) -> bool {
+        self.data.is_empty()
+    }
+
     /// Returns a slice of the data.
     pub fn data(&self) -> &[T] {
         &self.data

diff --git a/tests/distances_tests.rs b/tests/distances_tests.rs
@@ -0,0 +1,215 @@
+#[path = "utils.rs"]
+mod utils;
+
+use vq::distances::Distance;
+use vq::vector::PARALLEL_THRESHOLD;
+
+// We'll test using f32 as our Real type.
+
+// A helper function to compare two floating point numbers within a given tolerance.
+fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
+    (a - b).abs() < eps
+}
+
+// ----------------------------
+// Squared Euclidean Distance
+// ----------------------------
+#[test]
+fn test_squared_euclidean_sequential() {
+    let a = vec![1.0f32, 2.0, 3.0];
+    let b = vec![4.0f32, 6.0, 8.0];
+    // (1-4)² + (2-6)² + (3-8)² = 9 + 16 + 25 = 50
+    let d = Distance::SquaredEuclidean;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 50.0, 1e-6));
+}
+
+#[test]
+fn test_squared_euclidean_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    // Each difference is (i - (i+1)) = -1 so square is 1.
+    let a: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..len).map(|i| (i as f32) + 1.0).collect();
+    let d = Distance::SquaredEuclidean;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, len as f32, 1e-6));
+}
+
+// ----------------------------
+// Euclidean Distance
+// ----------------------------
+#[test]
+fn test_euclidean_sequential() {
+    let a = vec![1.0f32, 2.0, 3.0];
+    let b = vec![4.0f32, 6.0, 8.0];
+    // Squared distance is 50, so Euclidean distance = sqrt(50)
+    let expected = 50.0f32.sqrt();
+    let d = Distance::Euclidean;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, expected, 1e-6));
+}
+
+#[test]
+fn test_euclidean_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    let a: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..len).map(|i| (i as f32) + 1.0).collect();
+    // Each pair differs by 1 so squared differences add to len.
+    let expected = (len as f32).sqrt();
+    let d = Distance::Euclidean;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, expected, 1e-6));
+}
+
+// ----------------------------
+// Cosine Distance
+// ----------------------------
+#[test]
+fn test_cosine_distance_sequential() {
+    // Orthogonal vectors: cosine similarity = 0, so distance = 1.
+    let a = vec![1.0f32, 0.0];
+    let b = vec![0.0f32, 1.0];
+    let d = Distance::CosineDistance;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 1.0, 1e-6));
+
+    // Identical vectors: cosine similarity = 1, so distance = 0.
+    let a = vec![1.0f32, 1.0];
+    let b = vec![1.0f32, 1.0];
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 0.0, 1e-6));
+}
+
+#[test]
+fn test_cosine_distance_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    // Use identical vectors so that cosine similarity is 1 and distance is 0.
+    let a = vec![1.0f32; len];
+    let b = vec![1.0f32; len];
+    let d = Distance::CosineDistance;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 0.0, 1e-6));
+}
+
+// ----------------------------
+// Manhattan Distance
+// ----------------------------
+#[test]
+fn test_manhattan_sequential() {
+    let a = vec![1.0f32, 2.0, 3.0];
+    let b = vec![4.0f32, 6.0, 8.0];
+    // |1-4| + |2-6| + |3-8| = 3 + 4 + 5 = 12
+    let d = Distance::Manhattan;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 12.0, 1e-6));
+}
+
+#[test]
+fn test_manhattan_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    let a: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..len).map(|i| (i as f32) + 2.0).collect();
+    // Each difference is 2, so sum = 2 * len.
+    let expected = 2.0 * (len as f32);
+    let d = Distance::Manhattan;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, expected, 1e-6));
+}
+
+// ----------------------------
+// Chebyshev Distance
+// ----------------------------
+#[test]
+fn test_chebyshev_sequential() {
+    let a = vec![1.0f32, 5.0, 3.0];
+    let b = vec![4.0f32, 2.0, 9.0];
+    // Differences: |1-4|=3, |5-2|=3, |3-9|=6, so maximum is 6.
+    let d = Distance::Chebyshev;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 6.0, 1e-6));
+}
+
+#[test]
+fn test_chebyshev_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    // Create two identical vectors; a single coordinate is then changed below.
+    let mut a: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    let mut b: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    // Introduce a large difference at the last element.
+    a[len - 1] = 1000.0;
+    b[len - 1] = 0.0;
+    let d = Distance::Chebyshev;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 1000.0, 1e-6));
+}
+
+// ----------------------------
+// Minkowski Distance (p = 3)
+// ----------------------------
+#[test]
+fn test_minkowski_sequential() {
+    let a = vec![1.0f32, 2.0, 3.0];
+    let b = vec![4.0f32, 6.0, 8.0];
+    // For p = 3:
+    // |1-4|^3 = 27, |2-6|^3 = 64, |3-8|^3 = 125, sum = 216, cube root = 6.
+    let d = Distance::Minkowski(3.0);
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 6.0, 1e-6));
+}
+
+#[test]
+fn test_minkowski_parallel() {
+    let p = 3.0;
+    let d = Distance::Minkowski(p);
+    let len = PARALLEL_THRESHOLD + 10;
+    let a: Vec<f32> = (0..len).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..len).map(|i| (i as f32) + 1.0).collect();
+    // Each difference is 1: |1|^3 = 1. Sum = len, then result = len^(1/3)
+    let expected = (len as f32).powf(1.0 / 3.0);
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, expected, 1e-6));
+}
+
+// ----------------------------
+// Hamming Distance
+// ----------------------------
+#[test]
+fn test_hamming_sequential() {
+    let a = vec![1.0f32, 2.0, 3.0, 4.0];
+    let b = vec![1.0f32, 0.0, 3.0, 0.0];
+    // Differences occur at index 1 and 3, so count = 2.
+    let d = Distance::Hamming;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, 2.0, 1e-6));
+}
+
+#[test]
+fn test_hamming_parallel() {
+    let len = PARALLEL_THRESHOLD + 10;
+    let a: Vec<f32> = vec![1.0f32; len];
+    // Make b differ from a on every odd index (even indices keep 1.0).
+    let b: Vec<f32> = (0..len)
+        .map(|i| if i % 2 == 0 { 1.0f32 } else { 0.0f32 })
+        .collect();
+    // The odd indices in 0..len are 1, 3, 5, ...; because index 0 is even
+    // there are exactly len / 2 of them (integer division) whether len is
+    // even or odd. Adding 1 for an odd len would over-count by one, so the
+    // same formula is used for both parities.
+    let mismatches = len / 2;
+    let expected = mismatches as f32;
+    let d = Distance::Hamming;
+    let result = d.compute(&a, &b);
+    assert!(approx_eq(result, expected, 1e-6));
+}
+
+// ----------------------------
+// Mismatched Lengths
+// ----------------------------
+#[test]
+#[should_panic(expected = "Input slices must have the same length")]
+fn test_compute_mismatched_lengths() {
+    let a = vec![1.0f32, 2.0];
+    let b = vec![1.0f32];
+    let d = Distance::Euclidean;
+    let _ = d.compute(&a, &b);
+}