Skip to content

Commit

Permalink
fix the normalized distance for the binary quantized euclidean distance
Browse files Browse the repository at this point in the history
  • Loading branch information
irevoire committed Jul 10, 2024
1 parent 7907da9 commit 47fd2bd
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/distance/angular.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ impl Distance for Angular {
}
}

fn normalized_distance(d: f32) -> f32 {
fn normalized_distance(d: f32, _dimensions: usize) -> f32 {
d
}

Expand Down
19 changes: 17 additions & 2 deletions src/distance/binary_quantized_euclidean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,17 @@ impl Distance for BinaryQuantizedEuclidean {
}

fn built_distance(p: &Leaf<Self>, q: &Leaf<Self>) -> f32 {
dot_product(&p.vector, &q.vector)
squared_euclidean_distance(&p.vector, &q.vector)
}

/// Scales a raw distance by the number of dimensions.
///
/// Returns `d` divided by `dimensions` (converted to `f32`).
fn normalized_distance(d: f32, dimensions: usize) -> f32 {
    let dimension_count = dimensions as f32;
    d / dimension_count
}

fn norm_no_header(v: &UnalignedVector<Self::VectorCodec>) -> f32 {
dot_product(v, v).sqrt()
let ones = v.as_bytes().iter().map(|b| b.count_ones()).sum::<u32>() as f32;
ones.sqrt()
}

fn init(_node: &mut Leaf<Self>) {}
Expand Down Expand Up @@ -75,6 +81,15 @@ impl Distance for BinaryQuantizedEuclidean {
}

/// Adds up, over every pair of corresponding bytes of `u` and `v`, the number of bits set in
/// their bitwise OR, and returns that total as an `f32`.
fn dot_product(u: &UnalignedVector<BinaryQuantized>, v: &UnalignedVector<BinaryQuantized>) -> f32 {
    // /!\ Every byte of both vectors is visited. If the number of dimensions is not a multiple
    // of the storage `Word` size, the trailing bytes presumably hold padding bits, and those
    // are counted like any other bit.
    // NOTE(review): confirm what value the padding bits have and whether they affect relevancy;
    // any contribution would be added to every vector alike.
    let mut ones: u32 = 0;
    for (lhs, rhs) in u.as_bytes().iter().zip(v.as_bytes()) {
        ones += (lhs | rhs).count_ones();
    }
    ones as f32
}

fn squared_euclidean_distance(
u: &UnalignedVector<BinaryQuantized>,
v: &UnalignedVector<BinaryQuantized>,
) -> f32 {
// /!\ If the number of dimensions is not a multiple of the `Word` size, we'll xor 0 bits at the end, which will generate a lot of 1s.
// This may or may not impact relevancy since the 1s will be added to every vector.
u.as_bytes().iter().zip(v.as_bytes()).map(|(u, v)| (u ^ v).count_ones()).sum::<u32>() as f32
Expand Down
2 changes: 1 addition & 1 deletion src/distance/dot_product.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ impl Distance for DotProduct {
dot_product(v, v).sqrt()
}

fn normalized_distance(d: f32) -> f32 {
fn normalized_distance(d: f32, _dimension: usize) -> f32 {
-d
}

Expand Down
2 changes: 1 addition & 1 deletion src/distance/manhattan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ impl Distance for Manhattan {
p.vector.iter().zip(q.vector.iter()).map(|(p, q)| (p - q).abs()).sum()
}

fn normalized_distance(d: f32) -> f32 {
fn normalized_distance(d: f32, _dimension: usize) -> f32 {
d.max(0.0)
}

Expand Down
2 changes: 1 addition & 1 deletion src/distance/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static {
}

/// Normalizes the distance returned by the distance method.
fn normalized_distance(d: f32) -> f32 {
fn normalized_distance(d: f32, _dimensions: usize) -> f32 {
d.sqrt()
}

Expand Down
2 changes: 1 addition & 1 deletion src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ impl<'t, D: Distance> Reader<'t, D> {
if output.len() == capacity {
break;
}
output.push((item, D::normalized_distance(dist)));
output.push((item, D::normalized_distance(dist, self.dimensions)));
}

Ok(output)
Expand Down

0 comments on commit 47fd2bd

Please sign in to comment.