Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
habedi committed Feb 13, 2025
1 parent 8b0c66a commit 071330f
Show file tree
Hide file tree
Showing 8 changed files with 474 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vq"
version = "0.1.1"
version = "0.1.2"
description = "A vector quantization library for Rust"
repository = "https://github.com/habedi/vq"
license = "MIT OR Apache-2.0"
Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
[<img alt="Crates.io" src="https://img.shields.io/crates/v/vq.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/vq)
[<img alt="Docs.rs" src="https://img.shields.io/badge/docs.rs-vq-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs" height="20">](https://docs.rs/vq)
[<img alt="Downloads" src="https://img.shields.io/crates/d/vq?style=for-the-badge&labelColor=555555&logo=rust" height="20">](https://crates.io/crates/vq)
<br>
[<img alt="Docs" src="https://img.shields.io/badge/docs-latest-3776ab?style=for-the-badge&labelColor=555555&logo=readthedocs" height="20">](docs)
[<img alt="License" src="https://img.shields.io/badge/license-MIT%2FApache--2.0-007ec6?style=for-the-badge&labelColor=555555&logo=open-source-initiative" height="20">](https://github.com/habedi/vq)

Expand All @@ -17,18 +18,18 @@ It provides a simple, efficient API for data compression that helps reduce memory
## Features

- Implemented Algorithms:
- [**Binary Quantization (BQ)**](src/bq.rs)
- [**Scalar Quantization (SQ)**](src/sq.rs)
- [**Product Quantization (PQ)**](https://ieeexplore.ieee.org/document/5432202)
- [**Optimized Product Quantization (OPQ)**](https://ieeexplore.ieee.org/document/6619223)
- [**Tree-structured Vector Quantization (TSVQ)**](https://ieeexplore.ieee.org/document/515493)
- [**Residual Vector Quantization (RVQ)**](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)
- [Binary Quantization (BQ)](src/bq.rs)
- [Scalar Quantization (SQ)](src/sq.rs)
- [Product Quantization (PQ)](https://ieeexplore.ieee.org/document/5432202)
- [Optimized Product Quantization (OPQ)](https://ieeexplore.ieee.org/document/6619223)
- [Tree-structured Vector Quantization (TSVQ)](https://ieeexplore.ieee.org/document/515493)
- [Residual Vector Quantization (RVQ)](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)

- Parallelized vector operations for large vectors using [Rayon](https://crates.io/crates/rayon).
- Flexible quantization algorithm implementations that support custom distance functions (e.g., Euclidean, Cosine,
Chebyshev, etc.).
- Support for quantizing vectors of `f32` to `f16` (using [half](https://github.com/starkat99/half-rs/tree/main/src)) or `u8` data types.
- Simple and intuitive API for all quantization algorithms.
- Flexible quantization algorithm implementations that support various distance metrics, such as Euclidean, Cosine,
and Manhattan distances.
- Support for quantizing vectors of `f32` to `f16` (using [half](https://crates.io/crates/half)) or `u8` data types.
- Simple, intuitive, and uniform API for all quantization algorithms.

## Installation

Expand Down
3 changes: 1 addition & 2 deletions src/logging.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use ctor::ctor;
use std::env;
use tracing::Level;
use tracing_subscriber;

#[ctor]
fn set_debug_level() {
Expand All @@ -12,7 +11,7 @@ fn set_debug_level() {
// Normalize the string for case-insensitive comparison.
let v = v.trim().to_lowercase();
// Consider these values as "false".
!(v == "0" || v == "false" || v == "no" || v == "off" || v == "")
!(v == "0" || v == "false" || v == "no" || v == "off" || v.is_empty())
})
.unwrap_or(false);

Expand Down
4 changes: 2 additions & 2 deletions src/tsvq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use rayon::prelude::*;
///
/// Each node holds a centroid (the mean of the training data at that node)
/// and optionally left/right child nodes representing further splits.
pub struct TSVQNode {
struct TSVQNode {
/// The centroid of the training data at this node.
pub centroid: Vector<f32>,
/// Left subtree (if any).
Expand Down Expand Up @@ -143,7 +143,7 @@ impl TSVQNode {
/// (centroid) of its data, and leaf nodes provide the final quantized representations.
pub struct TSVQ {
/// The root node of the TSVQ tree.
pub root: TSVQNode,
root: TSVQNode,
/// The distance metric used for traversing the tree.
pub distance: Distance,
}
Expand Down
52 changes: 52 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,55 @@ pub fn lbg_quantize(
}
centroids
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::vector::Vector;

    /// Builds the four 2-D points shared by the multi-point tests.
    fn sample_points() -> Vec<Vector<f32>> {
        [[1.0f32, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]]
            .iter()
            .map(|xy| Vector::new(xy.to_vec()))
            .collect()
    }

    /// Asking for two centroids from four points yields exactly two centroids.
    #[test]
    fn lbg_quantize_basic_functionality() {
        let points = sample_points();
        let codebook = lbg_quantize(&points, 2, 10, 42);
        assert_eq!(codebook.len(), 2);
    }

    /// A request for zero centroids is expected to panic with the message below.
    #[test]
    #[should_panic(expected = "k must be greater than 0")]
    fn lbg_quantize_k_zero() {
        let points = vec![Vector::new(vec![1.0, 2.0]), Vector::new(vec![2.0, 3.0])];
        lbg_quantize(&points, 0, 10, 42);
    }

    /// Requesting more centroids than there are points is expected to panic.
    #[test]
    #[should_panic(expected = "Not enough data points for k clusters")]
    fn lbg_quantize_not_enough_data_points() {
        let points = vec![Vector::new(vec![1.0, 2.0])];
        lbg_quantize(&points, 2, 10, 42);
    }

    /// With a single point and a single centroid, the centroid is that point.
    #[test]
    fn lbg_quantize_single_data_point() {
        let points = vec![Vector::new(vec![1.0, 2.0])];
        let codebook = lbg_quantize(&points, 1, 10, 42);
        assert_eq!(codebook.len(), 1);
        let expected = Vector::new(vec![1.0, 2.0]);
        assert_eq!(codebook[0], expected);
    }

    /// Same as the basic test, but with the third argument raised from 10 to 100.
    #[test]
    fn lbg_quantize_multiple_iterations() {
        let points = sample_points();
        let codebook = lbg_quantize(&points, 2, 100, 42);
        assert_eq!(codebook.len(), 2);
    }
}
7 changes: 6 additions & 1 deletion src/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use rayon::prelude::*;
use std::fmt;
use std::ops::{Add, Div, Mul, Sub};

// Size threshold for enabling parallel computation.
/// Size threshold for enabling parallel computation.
pub const PARALLEL_THRESHOLD: usize = 1024;

/// Abstraction for real numbers.
Expand Down Expand Up @@ -152,6 +152,11 @@ impl<T: Real> Vector<T> {
self.data.len()
}

/// Returns `true` if the vector contains no elements.
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}

/// Returns a slice of the data.
pub fn data(&self) -> &[T] {
&self.data
Expand Down
215 changes: 215 additions & 0 deletions tests/distances_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#[path = "utils.rs"]
mod utils;

use vq::distances::Distance;
use vq::vector::PARALLEL_THRESHOLD;

// We'll test using f32 as our Real type.

// A helper function to compare two floating point numbers within a given tolerance.
//
// `eps` is an absolute tolerance for values of magnitude up to 1 and a relative
// tolerance beyond that: the allowed difference is `eps * max(1, |a|, |b|)`.
// A purely absolute `eps` such as 1e-6 is smaller than the spacing between
// adjacent `f32` values once magnitudes reach the hundreds, which would turn
// the check into exact equality for the large sums used by the parallel tests.
//
// Returns `false` whenever either operand is NaN.
fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
    // `f32::max` ignores a NaN operand, so the scale is always a finite-or-inf
    // number >= 1; NaN inputs are still rejected by the comparison below.
    let scale = a.abs().max(b.abs()).max(1.0);
    (a - b).abs() < eps * scale
}

// ----------------------------
// Squared Euclidean Distance
// ----------------------------
#[test]
fn test_squared_euclidean_sequential() {
    // Component differences are -3, -4 and -5; their squares sum to 9 + 16 + 25 = 50.
    let lhs = vec![1.0f32, 2.0, 3.0];
    let rhs = vec![4.0f32, 6.0, 8.0];
    let got = Distance::SquaredEuclidean.compute(&lhs, &rhs);
    assert!(approx_eq(got, 50.0, 1e-6));
}

#[test]
fn test_squared_euclidean_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    // rhs[i] = lhs[i] + 1, so every squared difference is 1 and the total is n.
    let rhs: Vec<f32> = lhs.iter().map(|x| x + 1.0).collect();
    let got = Distance::SquaredEuclidean.compute(&lhs, &rhs);
    assert!(approx_eq(got, n as f32, 1e-6));
}

// ----------------------------
// Euclidean Distance
// ----------------------------
#[test]
fn test_euclidean_sequential() {
    let (lhs, rhs) = (vec![1.0f32, 2.0, 3.0], vec![4.0f32, 6.0, 8.0]);
    let got = Distance::Euclidean.compute(&lhs, &rhs);
    // The squared distance between these inputs is 50, so the distance is sqrt(50).
    assert!(approx_eq(got, 50.0f32.sqrt(), 1e-6));
}

#[test]
fn test_euclidean_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let rhs: Vec<f32> = lhs.iter().map(|x| x + 1.0).collect();
    let got = Distance::Euclidean.compute(&lhs, &rhs);
    // Every coordinate differs by exactly 1, so the squared differences sum to n.
    let want = (n as f32).sqrt();
    assert!(approx_eq(got, want, 1e-6));
}

// ----------------------------
// Cosine Distance
// ----------------------------
#[test]
fn test_cosine_distance_sequential() {
    let metric = Distance::CosineDistance;

    // Perpendicular inputs have cosine similarity 0, hence distance 1.
    let unit_x = vec![1.0f32, 0.0];
    let unit_y = vec![0.0f32, 1.0];
    assert!(approx_eq(metric.compute(&unit_x, &unit_y), 1.0, 1e-6));

    // Equal inputs have cosine similarity 1, hence distance 0.
    let diag = vec![1.0f32, 1.0];
    let diag_copy = diag.clone();
    assert!(approx_eq(metric.compute(&diag, &diag_copy), 0.0, 1e-6));
}

#[test]
fn test_cosine_distance_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    // Two equal all-ones vectors: similarity 1, so the distance should be 0.
    let ones = vec![1.0f32; n];
    let ones_copy = ones.clone();
    let got = Distance::CosineDistance.compute(&ones, &ones_copy);
    assert!(approx_eq(got, 0.0, 1e-6));
}

// ----------------------------
// Manhattan Distance
// ----------------------------
#[test]
fn test_manhattan_sequential() {
    // Absolute differences are 3, 4 and 5, which add up to 12.
    let lhs = vec![1.0f32, 2.0, 3.0];
    let rhs = vec![4.0f32, 6.0, 8.0];
    let got = Distance::Manhattan.compute(&lhs, &rhs);
    assert!(approx_eq(got, 12.0, 1e-6));
}

#[test]
fn test_manhattan_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    // Shift every coordinate by 2 so the absolute differences sum to 2 * n.
    let rhs: Vec<f32> = lhs.iter().map(|x| x + 2.0).collect();
    let want = (n as f32) * 2.0;
    let got = Distance::Manhattan.compute(&lhs, &rhs);
    assert!(approx_eq(got, want, 1e-6));
}

// ----------------------------
// Chebyshev Distance
// ----------------------------
#[test]
fn test_chebyshev_sequential() {
    // Absolute differences are 3, 3 and 6; the largest is 6.
    let lhs = vec![1.0f32, 5.0, 3.0];
    let rhs = vec![4.0f32, 2.0, 9.0];
    let got = Distance::Chebyshev.compute(&lhs, &rhs);
    assert!(approx_eq(got, 6.0, 1e-6));
}

#[test]
fn test_chebyshev_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    // Start from two equal ramps 0, 1, 2, ...
    let mut lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let mut rhs = lhs.clone();
    // ...then make the final coordinate the only (and therefore largest) difference.
    let last = n - 1;
    lhs[last] = 1000.0;
    rhs[last] = 0.0;
    let got = Distance::Chebyshev.compute(&lhs, &rhs);
    assert!(approx_eq(got, 1000.0, 1e-6));
}

// ----------------------------
// Minkowski Distance (p = 3)
// ----------------------------
#[test]
fn test_minkowski_sequential() {
    let lhs = vec![1.0f32, 2.0, 3.0];
    let rhs = vec![4.0f32, 6.0, 8.0];
    // With p = 3: 3^3 + 4^3 + 5^3 = 27 + 64 + 125 = 216, and the cube root of 216 is 6.
    let got = Distance::Minkowski(3.0).compute(&lhs, &rhs);
    assert!(approx_eq(got, 6.0, 1e-6));
}

#[test]
fn test_minkowski_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let n = PARALLEL_THRESHOLD + 10;
    let lhs: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let rhs: Vec<f32> = lhs.iter().map(|x| x + 1.0).collect();
    // Every |difference|^3 is 1, so the sum is n and the result is n^(1/3).
    let want = (n as f32).powf(1.0 / 3.0);
    let got = Distance::Minkowski(3.0).compute(&lhs, &rhs);
    assert!(approx_eq(got, want, 1e-6));
}

// ----------------------------
// Hamming Distance
// ----------------------------
#[test]
fn test_hamming_sequential() {
    // The inputs disagree at positions 1 and 3 only, so the count is 2.
    let lhs = vec![1.0f32, 2.0, 3.0, 4.0];
    let rhs = vec![1.0f32, 0.0, 3.0, 0.0];
    let got = Distance::Hamming.compute(&lhs, &rhs);
    assert!(approx_eq(got, 2.0, 1e-6));
}

#[test]
fn test_hamming_parallel() {
    // Input longer than PARALLEL_THRESHOLD.
    let len = PARALLEL_THRESHOLD + 10;
    let a: Vec<f32> = vec![1.0f32; len];
    // b equals a on even indices and differs from it on every odd index.
    let b: Vec<f32> = (0..len)
        .map(|i| if i % 2 == 0 { 1.0f32 } else { 0.0f32 })
        .collect();
    // The odd indices in 0..len are 1, 3, 5, ..., and there are exactly
    // floor(len / 2) of them whether len is even or odd (e.g. len = 5 -> {1, 3}).
    // Integer division already truncates, so no parity special case is needed.
    let expected = (len / 2) as f32;
    let d = Distance::Hamming;
    let result = d.compute(&a, &b);
    assert!(approx_eq(result, expected, 1e-6));
}

// ----------------------------
// Mismatched Lengths
// ----------------------------
#[test]
#[should_panic(expected = "Input slices must have the same length")]
fn test_compute_mismatched_lengths() {
    // Two elements against one: the call is expected to panic with the message above.
    let longer = vec![1.0f32, 2.0];
    let shorter = vec![1.0f32];
    let _ = Distance::Euclidean.compute(&longer, &shorter);
}
Loading

0 comments on commit 071330f

Please sign in to comment.