diff --git a/examples/relevancy.rs b/examples/relevancy.rs index 633c5d58..1558a6a5 100644 --- a/examples/relevancy.rs +++ b/examples/relevancy.rs @@ -1,6 +1,8 @@ use rand::seq::SliceRandom; -use arroy::distances::{Angular, BinaryQuantizedEuclidean, DotProduct, Euclidean, Manhattan}; +use arroy::distances::{ + Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, Manhattan, +}; use arroy::internals::{self, Leaf, NodeCodec, UnalignedVector}; use arroy::{Database, Distance, ItemId, Result, Writer}; use heed::{EnvOpenOptions, RwTxn}; @@ -9,12 +11,12 @@ use rand::{Rng, SeedableRng}; const TWENTY_HUNDRED_MIB: usize = 2 * 1024 * 1024 * 1024; -const NUMBER_VECTORS: usize = 4_000; +const NUMBER_VECTORS: usize = 10_000; // The openAI dimensions -const VECTOR_DIMENSIONS: usize = 256; +// const VECTOR_DIMENSIONS: usize = 256; // const VECTOR_DIMENSIONS: usize = 512; // const VECTOR_DIMENSIONS: usize = 1024; -// const VECTOR_DIMENSIONS: usize = 1536; +const VECTOR_DIMENSIONS: usize = 1536; // const VECTOR_DIMENSIONS: usize = 3072; fn main() { @@ -27,6 +29,10 @@ fn main() { BinaryQuantizedEuclidean::name(), &measure_distance:: as &dyn Fn(usize), ), + ( + BinaryQuantizedManhattan::name(), + &measure_distance:: as &dyn Fn(usize), + ), ] { println!("{distance_name}"); for number_fetched in [1, 10, 50, 100] { diff --git a/src/distance/binary_quantized_manhattan.rs b/src/distance/binary_quantized_manhattan.rs new file mode 100644 index 00000000..55d3fd89 --- /dev/null +++ b/src/distance/binary_quantized_manhattan.rs @@ -0,0 +1,93 @@ +use std::borrow::Cow; + +use bytemuck::{Pod, Zeroable}; +use rand::Rng; + +use super::two_means; +use crate::distance::Distance; +use crate::node::Leaf; +use crate::parallel::ImmutableSubsetLeafs; +use crate::unaligned_vector::{self, BinaryQuantized, UnalignedVector}; + +/// A taxicab geometry or a Manhattan geometry is a geometry whose usual distance function +/// or metric of Euclidean geometry is replaced by a new metric 
in which the distance between +/// two points is the sum of the absolute differences of their Cartesian coordinates. +#[derive(Debug, Clone)] +pub enum BinaryQuantizedManhattan {} + +/// The header of BinaryQuantizedManhattan leaf nodes. +#[repr(C)] +#[derive(Pod, Zeroable, Debug, Clone, Copy)] +pub struct NodeHeaderBinaryQuantizedManhattan { + /// An extra constant term to determine the offset of the plane + bias: f32, +} + +impl Distance for BinaryQuantizedManhattan { + type Header = NodeHeaderBinaryQuantizedManhattan; + type VectorCodec = unaligned_vector::BinaryQuantized; + + fn name() -> &'static str { + "binary quantized manhattan" + } + + fn new_header(_vector: &UnalignedVector) -> Self::Header { + NodeHeaderBinaryQuantizedManhattan { bias: 0.0 } + } + + fn built_distance(p: &Leaf, q: &Leaf) -> f32 { + manhattan_distance(&p.vector, &q.vector) + } + + /// Normalizes the distance returned by the distance method. + fn normalized_distance(d: f32, dimensions: usize) -> f32 { + d.max(0.0) / dimensions as f32 + } + + fn norm_no_header(v: &UnalignedVector) -> f32 { + let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::() as f32; + ones.sqrt() + } + + fn init(_node: &mut Leaf) {} + + fn create_split<'a, R: Rng>( + children: &'a ImmutableSubsetLeafs, + rng: &mut R, + ) -> heed::Result>> { + let [node_p, node_q] = two_means(rng, children, false)?; + let vector: Vec = + node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); + let mut normal = Leaf { + header: NodeHeaderBinaryQuantizedManhattan { bias: 0.0 }, + vector: UnalignedVector::from_slice(&vector), + }; + Self::normalize(&mut normal); + + Ok(Cow::Owned(normal.vector.into_owned())) + } + + fn margin(p: &Leaf, q: &Leaf) -> f32 { + p.header.bias + dot_product(&p.vector, &q.vector) + } + + fn margin_no_header( + p: &UnalignedVector, + q: &UnalignedVector, + ) -> f32 { + dot_product(p, q) + } +} + +fn dot_product(u: &UnalignedVector, v: &UnalignedVector) -> f32 { + // /!\ If the number 
of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. + // This may or may not impact relevancy since the 1s will be added to every vector. + u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 +} + +fn manhattan_distance( + u: &UnalignedVector, + v: &UnalignedVector, +) -> f32 { + u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 +} diff --git a/src/distance/mod.rs b/src/distance/mod.rs index 237ba246..a8d8df57 100644 --- a/src/distance/mod.rs +++ b/src/distance/mod.rs @@ -5,6 +5,9 @@ pub use angular::{Angular, NodeHeaderAngular}; pub use binary_quantized_euclidean::{ BinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedEuclidean, }; +pub use binary_quantized_manhattan::{ + BinaryQuantizedManhattan, NodeHeaderBinaryQuantizedManhattan, +}; use bytemuck::{Pod, Zeroable}; pub use dot_product::{DotProduct, NodeHeaderDotProduct}; pub use euclidean::{Euclidean, NodeHeaderEuclidean}; @@ -20,6 +23,7 @@ use crate::NodeCodec; mod angular; mod binary_quantized_euclidean; +mod binary_quantized_manhattan; mod dot_product; mod euclidean; mod manhattan; diff --git a/src/lib.rs b/src/lib.rs index 0c0e873b..c9a1bb49 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -105,8 +105,8 @@ pub mod internals { use rand::Rng; pub use crate::distance::{ - NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderDotProduct, - NodeHeaderEuclidean, NodeHeaderManhattan, + NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedManhattan, + NodeHeaderDotProduct, NodeHeaderEuclidean, NodeHeaderManhattan, }; pub use crate::key::KeyCodec; pub use crate::node::{Leaf, NodeCodec}; @@ -136,7 +136,8 @@ pub mod internals { /// The set of distances implementing the [`Distance`] and supported by arroy. 
pub mod distances { pub use crate::distance::{ - Angular, BinaryQuantizedEuclidean, DotProduct, Euclidean, Manhattan, + Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, + Manhattan, }; }