
Commit

compute two_means on non binary quantized distances
irevoire committed Aug 5, 2024
1 parent 0e8fba2 commit 9f66448
Showing 7 changed files with 194 additions and 48 deletions.
63 changes: 35 additions & 28 deletions examples/relevancy.rs
@@ -3,7 +3,8 @@ use std::fmt;
use rand::seq::SliceRandom;

use arroy::distances::{
Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, Manhattan,
Angular, BinaryQuantizedAngular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan,
DotProduct, Euclidean, Manhattan,
};
use arroy::internals::{self, Leaf, NodeCodec, UnalignedVector};
use arroy::{Database, Distance, ItemId, Result, Writer};
@@ -28,28 +29,32 @@ fn main() {
println!();

for (distance_name, func) in &[
(
BinaryQuantizedAngular::name(),
&measure_distance::<BinaryQuantizedAngular, Angular> as &dyn Fn(usize, usize) -> f32,
),
(Angular::name(), &measure_distance::<Angular, Angular> as &dyn Fn(usize, usize) -> f32),
(
Euclidean::name(),
&measure_distance::<Euclidean, Euclidean> as &dyn Fn(usize, usize) -> f32,
BinaryQuantizedManhattan::name(),
&measure_distance::<BinaryQuantizedManhattan, Manhattan>
as &dyn Fn(usize, usize) -> f32,
),
(
Manhattan::name(),
&measure_distance::<Manhattan, Manhattan> as &dyn Fn(usize, usize) -> f32,
),
(
DotProduct::name(),
&measure_distance::<DotProduct, DotProduct> as &dyn Fn(usize, usize) -> f32,
),
(
BinaryQuantizedEuclidean::name(),
&measure_distance::<BinaryQuantizedEuclidean, Euclidean>
as &dyn Fn(usize, usize) -> f32,
),
(
BinaryQuantizedManhattan::name(),
&measure_distance::<BinaryQuantizedManhattan, Manhattan>
as &dyn Fn(usize, usize) -> f32,
Euclidean::name(),
&measure_distance::<Euclidean, Euclidean> as &dyn Fn(usize, usize) -> f32,
),
(
DotProduct::name(),
&measure_distance::<DotProduct, DotProduct> as &dyn Fn(usize, usize) -> f32,
),
] {
let now = std::time::Instant::now();
@@ -110,29 +115,31 @@ fn measure_distance<ArroyDistance: Distance, PerfectDistance: Distance>(

let reader = arroy::Reader::open(&wtxn, 0, database).unwrap();

let querying = points.choose(&mut rng).unwrap();

let relevant = partial_sort_by::<PerfectDistance>(
points.iter().map(|(i, v)| (*i, v.as_slice())),
&querying.1,
number_fetched,
);

let mut arroy = reader
.nns_by_item(&wtxn, querying.0, number_fetched * OVERSAMPLING, None, None)
.unwrap()
.unwrap();
arroy.truncate(number_fetched);

let mut correctly_retrieved = 0;
for ret in arroy {
if relevant.iter().any(|(id, _, _)| *id == ret.0) {
correctly_retrieved += 1;
for _ in 0..100 {
let querying = points.choose(&mut rng).unwrap();

let relevant = partial_sort_by::<PerfectDistance>(
points.iter().map(|(i, v)| (*i, v.as_slice())),
&querying.1,
number_fetched,
);

let mut arroy = reader
.nns_by_item(&wtxn, querying.0, number_fetched * OVERSAMPLING, None, None)
.unwrap()
.unwrap();
arroy.truncate(number_fetched);

for ret in arroy {
if relevant.iter().any(|(id, _, _)| *id == ret.0) {
correctly_retrieved += 1;
}
}
}

// println!("recall@{number_fetched}: {}", correctly_retrieved as f32 / relevant.len() as f32);
correctly_retrieved as f32 / relevant.len() as f32
correctly_retrieved as f32 / (number_fetched as f32 * 100.0)
}

fn partial_sort_by<'a, D: Distance>(
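The example now averages recall over 100 random queries instead of measuring a single one: each query contributes at most `number_fetched` hits, so the final score divides the total hit count by `number_fetched * 100`. A minimal standalone sketch of that formula (illustrative only; `recall` and `hits_per_query` are made-up names, not part of the diff):

fn recall(hits_per_query: &[usize], number_fetched: usize) -> f32 {
    // Total hits over all queries, divided by the best possible hit count.
    let hits: usize = hits_per_query.iter().sum();
    hits as f32 / (number_fetched as f32 * hits_per_query.len() as f32)
}

fn main() {
    // 100 queries asking for 20 neighbours each, 15 of them in the exact top-20 every time.
    let hits = vec![15_usize; 100];
    assert!((recall(&hits, 20) - 0.75).abs() < f32::EPSILON);
}
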
46 changes: 40 additions & 6 deletions src/distance/binary_quantized_euclidean.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use bytemuck::{Pod, Zeroable};
use rand::Rng;

use super::two_means;
use super::{two_means_binary_quantized as two_means, Euclidean};
use crate::distance::Distance;
use crate::node::Leaf;
use crate::parallel::ImmutableSubsetLeafs;
@@ -46,8 +46,7 @@ impl Distance for BinaryQuantizedEuclidean {
}

fn norm_no_header(v: &UnalignedVector<Self::VectorCodec>) -> f32 {
let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::<u32>() as f32;
ones.sqrt()
dot_product(v, v).sqrt()
}

fn init(_node: &mut Leaf<Self>) {}
@@ -56,7 +55,7 @@ impl Distance for BinaryQuantizedEuclidean {
children: &'a ImmutableSubsetLeafs<Self>,
rng: &mut R,
) -> heed::Result<Cow<'a, UnalignedVector<Self::VectorCodec>>> {
let [node_p, node_q] = two_means(rng, children, false)?;
let [node_p, node_q] = two_means::<Self, Euclidean, R>(rng, children, false)?;
let vector: Vec<f32> =
node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect();
let mut normal = Leaf {
@@ -80,10 +79,35 @@ impl Distance for BinaryQuantizedEuclidean {
}
}

fn bits(mut word: u8) -> [f32; 8] {
let mut ret = [0.0; 8];
for i in 0..8 {
let bit = word & 1;
word >>= 1;
if bit == 0 {
ret[i] = -1.0;
} else {
ret[i] = 1.0;
}
}

ret
}

fn dot_product(u: &UnalignedVector<BinaryQuantized>, v: &UnalignedVector<BinaryQuantized>) -> f32 {
// /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s.
// This may or may not impact relevancy since the 1s will be added to every vector.
u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::<u32>() as f32
// u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::<u32>() as f32

u.as_bytes()
.iter()
.zip(v.as_bytes())
.flat_map(|(u, v)| {
let u = bits(*u);
let v = bits(*v);
u.into_iter().zip(v).map(|(u, v)| u * v)
})
.sum::<f32>()
}

fn squared_euclidean_distance(
@@ -92,5 +116,15 @@ fn squared_euclidean_distance(
) -> f32 {
// /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s.
// This may or may not impact relevancy since the 1s will be added to every vector.
u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::<u32>() as f32
// u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::<u32>() as f32

u.as_bytes()
.iter()
.zip(v.as_bytes())
.flat_map(|(u, v)| {
let u = bits(*u);
let v = bits(*v);
u.into_iter().zip(v).map(|(u, v)| (u - v) * (u - v))
})
.sum::<f32>()
}
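
The new `bits` helper decodes one byte of a binary-quantized vector into eight ±1.0 floats, least-significant bit first, and both distance functions above now operate on that dequantized form instead of raw popcounts. A small standalone check of the decoding convention (a sketch mirroring the helper in this file, not an additional API):

fn bits(mut word: u8) -> [f32; 8] {
    let mut ret = [0.0; 8];
    for slot in ret.iter_mut() {
        // A 0 bit decodes to -1.0 and a 1 bit to +1.0, starting from the lowest bit.
        *slot = if word & 1 == 0 { -1.0 } else { 1.0 };
        word >>= 1;
    }
    ret
}

fn main() {
    // 0b0000_0101 has its first and third bits set, counting from the LSB.
    assert_eq!(
        bits(0b0000_0101),
        [1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
    );
}
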
45 changes: 40 additions & 5 deletions src/distance/binary_quantized_manhattan.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use bytemuck::{Pod, Zeroable};
use rand::Rng;

use super::two_means;
use super::{two_means_binary_quantized as two_means, Manhattan};
use crate::distance::Distance;
use crate::node::Leaf;
use crate::parallel::ImmutableSubsetLeafs;
@@ -45,7 +45,7 @@ impl Distance for BinaryQuantizedManhattan {
}

fn norm_no_header(v: &UnalignedVector<Self::VectorCodec>) -> f32 {
let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::<u32>() as f32;
let ones = v.as_bytes().iter().flat_map(|b| bits(*b)).sum::<f32>();
ones.sqrt()
}

@@ -55,7 +55,7 @@ impl Distance for BinaryQuantizedManhattan {
children: &'a ImmutableSubsetLeafs<Self>,
rng: &mut R,
) -> heed::Result<Cow<'a, UnalignedVector<Self::VectorCodec>>> {
let [node_p, node_q] = two_means(rng, children, false)?;
let [node_p, node_q] = two_means::<Self, Manhattan, R>(rng, children, false)?;
let vector: Vec<f32> =
node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect();
let mut normal = Leaf {
@@ -79,15 +79,50 @@ impl Distance for BinaryQuantizedManhattan {
}
}

fn bits(mut word: u8) -> [f32; 8] {
let mut ret = [0.0; 8];
for i in 0..8 {
let bit = word & 1;
word >>= 1;
if bit == 0 {
ret[i] = -1.0;
} else {
ret[i] = 1.0;
}
}

ret
}

fn dot_product(u: &UnalignedVector<BinaryQuantized>, v: &UnalignedVector<BinaryQuantized>) -> f32 {
// /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s.
// This may or may not impact relevancy since the 1s will be added to every vector.
u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::<u32>() as f32
// u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u | v).count_ones()).sum::<u32>() as f32

u.as_bytes()
.iter()
.zip(v.as_bytes())
.flat_map(|(u, v)| {
let u = bits(*u);
let v = bits(*v);
u.into_iter().zip(v).map(|(u, v)| u * v)
})
.sum::<f32>()
}

fn manhattan_distance(
u: &UnalignedVector<BinaryQuantized>,
v: &UnalignedVector<BinaryQuantized>,
) -> f32 {
u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::<u32>() as f32
// u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::<u32>() as f32

u.as_bytes()
.iter()
.zip(v.as_bytes())
.flat_map(|(u, v)| {
let u = bits(*u);
let v = bits(*v);
u.into_iter().zip(v).map(|(u, v)| (u - v).abs())
})
.sum::<f32>()
}
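
On ±1 values the float-based formulas are scaled versions of the old popcount ones (up to the trailing padding bits mentioned in the comments): with h differing positions out of n, the dot product is n - 2h, the squared Euclidean distance is 4h, and the Manhattan distance is 2h. A quick standalone check of those identities (illustrative only, not part of the commit):

fn main() {
    let u = [1.0_f32, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0];
    let v = [1.0_f32, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0];
    // h = 3 positions differ out of n = 8.
    let h = u.iter().zip(&v).filter(|(a, b)| a != b).count() as f32;

    let dot: f32 = u.iter().zip(&v).map(|(a, b)| a * b).sum();
    let squared_euclidean: f32 = u.iter().zip(&v).map(|(a, b)| (a - b) * (a - b)).sum();
    let manhattan: f32 = u.iter().zip(&v).map(|(a, b)| (a - b).abs()).sum();

    assert_eq!(dot, u.len() as f32 - 2.0 * h); // 8 - 6 = 2
    assert_eq!(squared_euclidean, 4.0 * h);    // 12
    assert_eq!(manhattan, 2.0 * h);            // 6
}
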
3 changes: 1 addition & 2 deletions src/distance/euclidean.rs
@@ -54,11 +54,10 @@ impl Distance for Euclidean {
let [node_p, node_q] = two_means(rng, children, false)?;
let vector: Vec<_> =
node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect();
let mut normal = Leaf {
let mut normal: Leaf<'static, Self> = Leaf {
header: NodeHeaderEuclidean { bias: 0.0 },
vector: UnalignedVector::from_vec(vector),
};
Self::normalize(&mut normal);

normal.header.bias = normal
.vector
69 changes: 67 additions & 2 deletions src/distance/mod.rs
@@ -2,6 +2,7 @@ use std::borrow::Cow;
use std::fmt;

pub use angular::{Angular, NodeHeaderAngular};
pub use binary_quantized_angular::{BinaryQuantizedAngular, NodeHeaderBinaryQuantizedAngular};

Check failure on line 5 in src/distance/mod.rs (GitHub Actions: lint; test on ubuntu-latest stable and macos-latest-xlarge stable/beta): unresolved imports `binary_quantized_angular::BinaryQuantizedAngular`, `binary_quantized_angular::NodeHeaderBinaryQuantizedAngular`
pub use binary_quantized_euclidean::{
BinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedEuclidean,
};
@@ -22,12 +23,18 @@ use crate::unaligned_vector::{UnalignedVector, UnalignedVectorCodec};
use crate::NodeCodec;

mod angular;
mod binary_quantized_angular;

Check failure on line 26 in src/distance/mod.rs (GitHub Actions: lint; test on ubuntu-latest stable and macos-latest-xlarge stable/beta): file not found for module `binary_quantized_angular`
mod binary_quantized_euclidean;
mod binary_quantized_manhattan;
mod dot_product;
mod euclidean;
mod manhattan;

fn new_leaf<D: Distance>(vec: Vec<f32>) -> Leaf<'static, D> {
let vector = UnalignedVector::from_vec(vec);
Leaf { header: D::new_header(&vector), vector }
}

/// A trait used by arroy to compute the distances,
/// compute the split planes, and normalize user vectors.
#[allow(missing_docs)]
@@ -137,8 +144,7 @@ fn two_means<D: Distance, R: Rng>(
const ITERATION_STEPS: usize = 200;

let [leaf_p, leaf_q] = leafs.choose_two(rng)?.unwrap();
let mut leaf_p = leaf_p.into_owned();
let mut leaf_q = leaf_q.into_owned();
let (mut leaf_p, mut leaf_q) = (leaf_p.into_owned(), leaf_q.into_owned());

if cosine {
D::normalize(&mut leaf_p);
@@ -171,3 +177,62 @@

Ok([leaf_p, leaf_q])
}

pub fn two_means_binary_quantized<D: Distance, NonBqDist: Distance, R: Rng>(
rng: &mut R,
leafs: &ImmutableSubsetLeafs<D>,
cosine: bool,
) -> heed::Result<[Leaf<'static, D>; 2]> {
// This algorithm is a huge heuristic. Empirically it works really well, but I
// can't motivate it well. The basic idea is to keep two centroids and assign
// points to either one of them. We weight each centroid by the number of points
// assigned to it, so to balance it.

const ITERATION_STEPS: usize = 200;

let [leaf_p, leaf_q] = leafs.choose_two(rng)?.unwrap();
let mut leaf_p: Leaf<'static, NonBqDist> = new_leaf(leaf_p.vector.iter().collect());
let mut leaf_q: Leaf<'static, NonBqDist> = new_leaf(leaf_q.vector.iter().collect());

if cosine {
NonBqDist::normalize(&mut leaf_p);
NonBqDist::normalize(&mut leaf_q);
}

NonBqDist::init(&mut leaf_p);
NonBqDist::init(&mut leaf_q);

let mut ic = 1.0;
let mut jc = 1.0;
for _ in 0..ITERATION_STEPS {
let node_k = leafs.choose(rng)?.unwrap();
let node_k: Leaf<'static, NonBqDist> = new_leaf(node_k.vector.iter().collect());
let di = ic * NonBqDist::non_built_distance(&leaf_p, &node_k);
let dj = jc * NonBqDist::non_built_distance(&leaf_q, &node_k);
let norm = if cosine { NonBqDist::norm(&node_k) } else { 1.0 };
if norm.is_nan() || norm <= 0.0 {
continue;
}
if di < dj {
// update_mean(&mut leaf_p, node_k.vector.iter(), norm, ic);
Distance::update_mean(&mut leaf_p, &node_k, norm, ic);
Distance::init(&mut leaf_p);
ic += 1.0;
} else if dj < di {
// update_mean(&mut leaf_q, node_k.vector.iter(), norm, jc);
Distance::update_mean(&mut leaf_q, &node_k, norm, jc);
Distance::init(&mut leaf_q);
jc += 1.0;
}
}

let leaf_p = new_leaf(leaf_p.vector.iter().collect());
let leaf_q = new_leaf(leaf_q.vector.iter().collect());
Ok([leaf_p, leaf_q])
}

fn update_mean(mean: &mut Vec<f32>, new_node: impl Iterator<Item = f32>, norm: f32, c: f32) {
let vec: Vec<_> =
mean.iter().zip(new_node).map(|(x, n)| (x * c + n / norm) / (c + 1.0)).collect();
*mean = vec;
}
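
`two_means_binary_quantized` runs the split heuristic in the non-quantized distance space: the two chosen leaves are decoded into plain float vectors, the centroids are updated there with `NonBqDist` (so they can hold fractional values between iterations), and only the final centroids are re-encoded as binary-quantized leaves via `new_leaf`. A conceptual sketch of that round trip using made-up helpers (`dequantize`/`quantize` are not arroy APIs, and the ±1/positive-threshold convention here is an assumption matching the `bits` helpers above):

fn dequantize(bytes: &[u8]) -> Vec<f32> {
    // Each stored bit becomes a float: 0 -> -1.0, 1 -> +1.0 (LSB first).
    bytes
        .iter()
        .flat_map(|&b| (0..8).map(move |i| if (b >> i) & 1 == 0 { -1.0 } else { 1.0 }))
        .collect()
}

fn quantize(floats: &[f32]) -> Vec<u8> {
    // Positive components map back to a 1 bit, everything else to a 0 bit.
    floats
        .chunks(8)
        .map(|chunk| {
            chunk
                .iter()
                .enumerate()
                .fold(0u8, |acc, (i, &x)| if x > 0.0 { acc | (1u8 << i) } else { acc })
        })
        .collect()
}

fn main() {
    let quantized = vec![0b1010_1010u8];
    let mut centroid = dequantize(&quantized); // ±1.0 floats
    // ... two-means style updates happen here, in float space ...
    centroid[0] = 0.3; // fractional values are fine while iterating
    let requantized = quantize(&centroid); // encode once, at the very end
    assert_eq!(requantized, vec![0b1010_1011]);
}
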
9 changes: 5 additions & 4 deletions src/lib.rs
@@ -105,8 +105,9 @@ pub mod internals {
use rand::Rng;

pub use crate::distance::{
NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedManhattan,
NodeHeaderDotProduct, NodeHeaderEuclidean, NodeHeaderManhattan,
NodeHeaderAngular, NodeHeaderBinaryQuantizedAngular, NodeHeaderBinaryQuantizedEuclidean,
NodeHeaderBinaryQuantizedManhattan, NodeHeaderDotProduct, NodeHeaderEuclidean,
NodeHeaderManhattan,
};
pub use crate::key::KeyCodec;
pub use crate::node::{Leaf, NodeCodec};
@@ -136,8 +137,8 @@ pub mod internals {
/// The set of distances implementing the [`Distance`] and supported by arroy.
pub mod distances {
pub use crate::distance::{
Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean,
Manhattan,
Angular, BinaryQuantizedAngular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan,
DotProduct, Euclidean, Manhattan,
};
}

