From 9f664489d045139b8d2be442cb4895e3b956e350 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 5 Aug 2024 15:45:17 +0200 Subject: [PATCH] compute two_means on non binary quantized distances --- examples/relevancy.rs | 63 +++++++++++--------- src/distance/binary_quantized_euclidean.rs | 46 +++++++++++++-- src/distance/binary_quantized_manhattan.rs | 45 ++++++++++++-- src/distance/euclidean.rs | 3 +- src/distance/mod.rs | 69 +++++++++++++++++++++- src/lib.rs | 9 +-- src/unaligned_vector/binary_quantized.rs | 7 ++- 7 files changed, 194 insertions(+), 48 deletions(-) diff --git a/examples/relevancy.rs b/examples/relevancy.rs index 5e7edf85..cc464d4c 100644 --- a/examples/relevancy.rs +++ b/examples/relevancy.rs @@ -3,7 +3,8 @@ use std::fmt; use rand::seq::SliceRandom; use arroy::distances::{ - Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, Manhattan, + Angular, BinaryQuantizedAngular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, + DotProduct, Euclidean, Manhattan, }; use arroy::internals::{self, Leaf, NodeCodec, UnalignedVector}; use arroy::{Database, Distance, ItemId, Result, Writer}; @@ -28,28 +29,32 @@ fn main() { println!(); for (distance_name, func) in &[ + ( + BinaryQuantizedAngular::name(), + &measure_distance:: as &dyn Fn(usize, usize) -> f32, + ), (Angular::name(), &measure_distance:: as &dyn Fn(usize, usize) -> f32), ( - Euclidean::name(), - &measure_distance:: as &dyn Fn(usize, usize) -> f32, + BinaryQuantizedManhattan::name(), + &measure_distance:: + as &dyn Fn(usize, usize) -> f32, ), ( Manhattan::name(), &measure_distance:: as &dyn Fn(usize, usize) -> f32, ), - ( - DotProduct::name(), - &measure_distance:: as &dyn Fn(usize, usize) -> f32, - ), ( BinaryQuantizedEuclidean::name(), &measure_distance:: as &dyn Fn(usize, usize) -> f32, ), ( - BinaryQuantizedManhattan::name(), - &measure_distance:: - as &dyn Fn(usize, usize) -> f32, + Euclidean::name(), + &measure_distance:: as &dyn Fn(usize, usize) -> f32, + ), + ( + DotProduct::name(), + &measure_distance:: as &dyn Fn(usize, usize) -> f32, ), ] { let now = std::time::Instant::now(); @@ -110,29 +115,31 @@ fn measure_distance( let reader = arroy::Reader::open(&wtxn, 0, database).unwrap(); - let querying = points.choose(&mut rng).unwrap(); - - let relevant = partial_sort_by::( - points.iter().map(|(i, v)| (*i, v.as_slice())), - &querying.1, - number_fetched, - ); - - let mut arroy = reader - .nns_by_item(&wtxn, querying.0, number_fetched * OVERSAMPLING, None, None) - .unwrap() - .unwrap(); - arroy.truncate(number_fetched); - let mut correctly_retrieved = 0; - for ret in arroy { - if relevant.iter().any(|(id, _, _)| *id == ret.0) { - correctly_retrieved += 1; + for _ in 0..100 { + let querying = points.choose(&mut rng).unwrap(); + + let relevant = partial_sort_by::( + points.iter().map(|(i, v)| (*i, v.as_slice())), + &querying.1, + number_fetched, + ); + + let mut arroy = reader + .nns_by_item(&wtxn, querying.0, number_fetched * OVERSAMPLING, None, None) + .unwrap() + .unwrap(); + arroy.truncate(number_fetched); + + for ret in arroy { + if relevant.iter().any(|(id, _, _)| *id == ret.0) { + correctly_retrieved += 1; + } } } // println!("recall@{number_fetched}: {}", correctly_retrieved as f32 / relevant.len() as f32); - correctly_retrieved as f32 / relevant.len() as f32 + correctly_retrieved as f32 / (number_fetched as f32 * 100.0) } fn partial_sort_by<'a, D: Distance>( diff --git a/src/distance/binary_quantized_euclidean.rs b/src/distance/binary_quantized_euclidean.rs index 278a77ce..724fb0ed 100644 --- a/src/distance/binary_quantized_euclidean.rs +++ b/src/distance/binary_quantized_euclidean.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use bytemuck::{Pod, Zeroable}; use rand::Rng; -use super::two_means; +use super::{two_means_binary_quantized as two_means, Euclidean}; use crate::distance::Distance; use crate::node::Leaf; use crate::parallel::ImmutableSubsetLeafs; @@ -46,8 +46,7 @@ impl Distance for BinaryQuantizedEuclidean { } fn norm_no_header(v: &UnalignedVector) -> f32 { - let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::() as f32; - ones.sqrt() + dot_product(v, v).sqrt() } fn init(_node: &mut Leaf) {} @@ -56,7 +55,7 @@ impl Distance for BinaryQuantizedEuclidean { children: &'a ImmutableSubsetLeafs, rng: &mut R, ) -> heed::Result>> { - let [node_p, node_q] = two_means(rng, children, false)?; + let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); let mut normal = Leaf { @@ -80,10 +79,35 @@ impl Distance for BinaryQuantizedEuclidean { } } +fn bits(mut word: u8) -> [f32; 8] { + let mut ret = [0.0; 8]; + for i in 0..8 { + let bit = word & 1; + word >>= 1; + if bit == 0 { + ret[i] = -1.0; + } else { + ret[i] = 1.0; + } + } + + ret +} + fn dot_product(u: &UnalignedVector, v: &UnalignedVector) -> f32 { // /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. // This may or may not impact relevancy since the 1s will be added to every vector. - u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| u * v) + }) + .sum::() } fn squared_euclidean_distance( @@ -92,5 +116,15 @@ fn squared_euclidean_distance( ) -> f32 { // /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. // This may or may not impact relevancy since the 1s will be added to every vector. - u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| (u - v) * (u - v)) + }) + .sum::() } diff --git a/src/distance/binary_quantized_manhattan.rs b/src/distance/binary_quantized_manhattan.rs index 55d3fd89..3a724f10 100644 --- a/src/distance/binary_quantized_manhattan.rs +++ b/src/distance/binary_quantized_manhattan.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use bytemuck::{Pod, Zeroable}; use rand::Rng; -use super::two_means; +use super::{two_means_binary_quantized as two_means, Manhattan}; use crate::distance::Distance; use crate::node::Leaf; use crate::parallel::ImmutableSubsetLeafs; @@ -45,7 +45,7 @@ impl Distance for BinaryQuantizedManhattan { } fn norm_no_header(v: &UnalignedVector) -> f32 { - let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::() as f32; + let ones = v.as_bytes().iter().flat_map(|b| bits(*b)).sum::(); ones.sqrt() } @@ -55,7 +55,7 @@ impl Distance for BinaryQuantizedManhattan { children: &'a ImmutableSubsetLeafs, rng: &mut R, ) -> heed::Result>> { - let [node_p, node_q] = two_means(rng, children, false)?; + let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); let mut normal = Leaf { @@ -79,15 +79,50 @@ impl Distance for BinaryQuantizedManhattan { } } +fn bits(mut word: u8) -> [f32; 8] { + let mut ret = [0.0; 8]; + for i in 0..8 { + let bit = word & 1; + word >>= 1; + if bit == 0 { + ret[i] = -1.0; + } else { + ret[i] = 1.0; + } + } + + ret +} + fn dot_product(u: &UnalignedVector, v: &UnalignedVector) -> f32 { // /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s. // This may or may not impact relevancy since the 1s will be added to every vector. - u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| u * v) + }) + .sum::() } fn manhattan_distance( u: &UnalignedVector, v: &UnalignedVector, ) -> f32 { - u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 + // u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::() as f32 + + u.as_bytes() + .iter() + .zip(v.as_bytes()) + .flat_map(|(u, v)| { + let u = bits(*u); + let v = bits(*v); + u.into_iter().zip(v).map(|(u, v)| (u - v).abs()) + }) + .sum::() } diff --git a/src/distance/euclidean.rs b/src/distance/euclidean.rs index ae503e83..1c0b2f54 100644 --- a/src/distance/euclidean.rs +++ b/src/distance/euclidean.rs @@ -54,11 +54,10 @@ impl Distance for Euclidean { let [node_p, node_q] = two_means(rng, children, false)?; let vector: Vec<_> = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); - let mut normal = Leaf { + let mut normal: Leaf<'static, Self> = Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: UnalignedVector::from_vec(vector), }; - Self::normalize(&mut normal); normal.header.bias = normal .vector diff --git a/src/distance/mod.rs b/src/distance/mod.rs index a8d8df57..8871e0a4 100644 --- a/src/distance/mod.rs +++ b/src/distance/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::fmt; pub use angular::{Angular, NodeHeaderAngular}; +pub use binary_quantized_angular::{BinaryQuantizedAngular, NodeHeaderBinaryQuantizedAngular}; pub use binary_quantized_euclidean::{ BinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedEuclidean, }; @@ -22,12 +23,18 @@ use crate::unaligned_vector::{UnalignedVector, UnalignedVectorCodec}; use crate::NodeCodec; mod angular; +mod binary_quantized_angular; mod binary_quantized_euclidean; mod binary_quantized_manhattan; mod dot_product; mod euclidean; mod manhattan; +fn new_leaf(vec: Vec) -> Leaf<'static, D> { + let vector = UnalignedVector::from_vec(vec); + Leaf { header: D::new_header(&vector), vector } +} + /// A trait used by arroy to compute the distances, /// compute the split planes, and normalize user vectors. #[allow(missing_docs)] @@ -137,8 +144,7 @@ fn two_means( const ITERATION_STEPS: usize = 200; let [leaf_p, leaf_q] = leafs.choose_two(rng)?.unwrap(); - let mut leaf_p = leaf_p.into_owned(); - let mut leaf_q = leaf_q.into_owned(); + let (mut leaf_p, mut leaf_q) = (leaf_p.into_owned(), leaf_q.into_owned()); if cosine { D::normalize(&mut leaf_p); @@ -171,3 +177,62 @@ fn two_means( Ok([leaf_p, leaf_q]) } + +pub fn two_means_binary_quantized( + rng: &mut R, + leafs: &ImmutableSubsetLeafs, + cosine: bool, +) -> heed::Result<[Leaf<'static, D>; 2]> { + // This algorithm is a huge heuristic. Empirically it works really well, but I + // can't motivate it well. The basic idea is to keep two centroids and assign + // points to either one of them. We weight each centroid by the number of points + // assigned to it, so to balance it. + + const ITERATION_STEPS: usize = 200; + + let [leaf_p, leaf_q] = leafs.choose_two(rng)?.unwrap(); + let mut leaf_p: Leaf<'static, NonBqDist> = new_leaf(leaf_p.vector.iter().collect()); + let mut leaf_q: Leaf<'static, NonBqDist> = new_leaf(leaf_q.vector.iter().collect()); + + if cosine { + NonBqDist::normalize(&mut leaf_p); + NonBqDist::normalize(&mut leaf_q); + } + + NonBqDist::init(&mut leaf_p); + NonBqDist::init(&mut leaf_q); + + let mut ic = 1.0; + let mut jc = 1.0; + for _ in 0..ITERATION_STEPS { + let node_k = leafs.choose(rng)?.unwrap(); + let node_k: Leaf<'static, NonBqDist> = new_leaf(node_k.vector.iter().collect()); + let di = ic * NonBqDist::non_built_distance(&leaf_p, &node_k); + let dj = jc * NonBqDist::non_built_distance(&leaf_q, &node_k); + let norm = if cosine { NonBqDist::norm(&node_k) } else { 1.0 }; + if norm.is_nan() || norm <= 0.0 { + continue; + } + if di < dj { + // update_mean(&mut leaf_p, node_k.vector.iter(), norm, ic); + Distance::update_mean(&mut leaf_p, &node_k, norm, ic); + Distance::init(&mut leaf_p); + ic += 1.0; + } else if dj < di { + // update_mean(&mut leaf_q, node_k.vector.iter(), norm, jc); + Distance::update_mean(&mut leaf_q, &node_k, norm, jc); + Distance::init(&mut leaf_q); + jc += 1.0; + } + } + + let leaf_p = new_leaf(leaf_p.vector.iter().collect()); + let leaf_q = new_leaf(leaf_q.vector.iter().collect()); + Ok([leaf_p, leaf_q]) +} + +fn update_mean(mean: &mut Vec, new_node: impl Iterator, norm: f32, c: f32) { + let vec: Vec<_> = + mean.iter().zip(new_node).map(|(x, n)| (x * c + n / norm) / (c + 1.0)).collect(); + *mean = vec; +} diff --git a/src/lib.rs b/src/lib.rs index c9a1bb49..b4271f68 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -105,8 +105,9 @@ pub mod internals { use rand::Rng; pub use crate::distance::{ - NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedManhattan, - NodeHeaderDotProduct, NodeHeaderEuclidean, NodeHeaderManhattan, + NodeHeaderAngular, NodeHeaderBinaryQuantizedAngular, NodeHeaderBinaryQuantizedEuclidean, + NodeHeaderBinaryQuantizedManhattan, NodeHeaderDotProduct, NodeHeaderEuclidean, + NodeHeaderManhattan, }; pub use crate::key::KeyCodec; pub use crate::node::{Leaf, NodeCodec}; @@ -136,8 +137,8 @@ pub mod internals { /// The set of distances implementing the [`Distance`] and supported by arroy. pub mod distances { pub use crate::distance::{ - Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, - Manhattan, + Angular, BinaryQuantizedAngular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, + DotProduct, Euclidean, Manhattan, }; } diff --git a/src/unaligned_vector/binary_quantized.rs b/src/unaligned_vector/binary_quantized.rs index b99c9998..a8eb607d 100644 --- a/src/unaligned_vector/binary_quantized.rs +++ b/src/unaligned_vector/binary_quantized.rs @@ -77,7 +77,12 @@ impl Iterator for BinaryQuantizedIterator<'_> { self.current_element >>= 1; self.current_iteration += 1; - Some(bit as f32) + if bit == 0 { + Some(-1.0) + } else { + Some(1.0) + } + // Some(bit as f32) } fn size_hint(&self) -> (usize, Option) {