Skip to content

Commit

Permalink
add an implementation for the binary quantized manhattan distance
Browse files Browse the repository at this point in the history
  • Loading branch information
irevoire committed Jul 10, 2024
1 parent 4c853b8 commit 7f004e8
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 7 deletions.
14 changes: 10 additions & 4 deletions examples/relevancy.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use rand::seq::SliceRandom;

use arroy::distances::{Angular, BinaryQuantizedEuclidean, DotProduct, Euclidean, Manhattan};
use arroy::distances::{
Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean, Manhattan,
};
use arroy::internals::{self, Leaf, NodeCodec, UnalignedVector};
use arroy::{Database, Distance, ItemId, Result, Writer};
use heed::{EnvOpenOptions, RwTxn};
Expand All @@ -9,12 +11,12 @@ use rand::{Rng, SeedableRng};

const TWENTY_HUNDRED_MIB: usize = 2 * 1024 * 1024 * 1024;

const NUMBER_VECTORS: usize = 4_000;
const NUMBER_VECTORS: usize = 10_000;
// The openAI dimensions
const VECTOR_DIMENSIONS: usize = 256;
// const VECTOR_DIMENSIONS: usize = 256;
// const VECTOR_DIMENSIONS: usize = 512;
// const VECTOR_DIMENSIONS: usize = 1024;
// const VECTOR_DIMENSIONS: usize = 1536;
const VECTOR_DIMENSIONS: usize = 1536;
// const VECTOR_DIMENSIONS: usize = 3072;

fn main() {
Expand All @@ -27,6 +29,10 @@ fn main() {
BinaryQuantizedEuclidean::name(),
&measure_distance::<BinaryQuantizedEuclidean, Euclidean> as &dyn Fn(usize),
),
(
BinaryQuantizedManhattan::name(),
&measure_distance::<BinaryQuantizedManhattan, Manhattan> as &dyn Fn(usize),
),
] {
println!("{distance_name}");
for number_fetched in [1, 10, 50, 100] {
Expand Down
93 changes: 93 additions & 0 deletions src/distance/binary_quantized_manhattan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
use std::borrow::Cow;

use bytemuck::{Pod, Zeroable};
use rand::Rng;

use super::two_means;
use crate::distance::Distance;
use crate::node::Leaf;
use crate::parallel::ImmutableSubsetLeafs;
use crate::unaligned_vector::{self, BinaryQuantized, UnalignedVector};

/// Manhattan (taxicab) distance computed over binary quantized vectors.
///
/// A taxicab geometry or a Manhattan geometry is a geometry whose usual distance function
/// or metric of Euclidean geometry is replaced by a new metric in which the distance between
/// two points is the sum of the absolute differences of their Cartesian coordinates.
///
/// The `Distance` implementation of this type uses the `BinaryQuantized` vector codec and
/// computes the distance between two leaves by counting the bits that differ between their
/// byte representations.
///
/// This enum has no variants: it cannot be instantiated and only serves as a type-level marker.
#[derive(Debug, Clone)]
pub enum BinaryQuantizedManhattan {}

/// The header of BinaryQuantizedManhattan leaf nodes.
#[repr(C)]
#[derive(Pod, Zeroable, Debug, Clone, Copy)]
pub struct NodeHeaderBinaryQuantizedManhattan {
/// An extra constant term to determine the offset of the plane.
/// It is added to the dot product when computing the margin of a leaf.
bias: f32,
}

impl Distance for BinaryQuantizedManhattan {
    type Header = NodeHeaderBinaryQuantizedManhattan;
    type VectorCodec = unaligned_vector::BinaryQuantized;

    /// Returns the human-readable name of this distance.
    fn name() -> &'static str {
        "binary quantized manhattan"
    }

    /// Creates the header of a leaf. The vector is not inspected: the bias always starts at zero.
    fn new_header(_vector: &UnalignedVector<Self::VectorCodec>) -> Self::Header {
        let bias = 0.0;
        NodeHeaderBinaryQuantizedManhattan { bias }
    }

    /// Distance between two leaves, delegated to `manhattan_distance` on their vectors.
    fn built_distance(p: &Leaf<Self>, q: &Leaf<Self>) -> f32 {
        manhattan_distance(&p.vector, &q.vector)
    }

    /// Normalizes the distance returned by the distance method.
    ///
    /// Negative inputs are clamped to zero, then the value is divided by the number of dimensions.
    fn normalized_distance(d: f32, dimensions: usize) -> f32 {
        let clamped = d.max(0.0);
        clamped / dimensions as f32
    }

    /// Norm of a vector: the square root of the number of bits set in its byte representation.
    fn norm_no_header(v: &UnalignedVector<Self::VectorCodec>) -> f32 {
        let set_bits: u32 = v.as_bytes().iter().map(|byte| byte.count_ones()).sum();
        (set_bits as f32).sqrt()
    }

    /// Nothing needs to be initialized on a leaf for this distance.
    fn init(_node: &mut Leaf<Self>) {}

    /// Builds the vector used to split `children`.
    ///
    /// The component-wise difference of the two nodes returned by `two_means` is wrapped
    /// in a temporary leaf (zero bias), passed through `Self::normalize` and returned as an
    /// owned vector.
    fn create_split<'a, R: Rng>(
        children: &'a ImmutableSubsetLeafs<Self>,
        rng: &mut R,
    ) -> heed::Result<Cow<'a, UnalignedVector<Self::VectorCodec>>> {
        let [node_p, node_q] = two_means(rng, children, false)?;

        let difference: Vec<f32> = node_p
            .vector
            .iter()
            .zip(node_q.vector.iter())
            .map(|(lhs, rhs)| lhs - rhs)
            .collect();

        let mut normal = Leaf {
            header: NodeHeaderBinaryQuantizedManhattan { bias: 0.0 },
            vector: UnalignedVector::from_slice(&difference),
        };
        Self::normalize(&mut normal);

        let split = normal.vector.into_owned();
        Ok(Cow::Owned(split))
    }

    /// Margin between two leaves: the bias stored in `p`'s header plus the dot product
    /// of both vectors.
    fn margin(p: &Leaf<Self>, q: &Leaf<Self>) -> f32 {
        let projection = dot_product(&p.vector, &q.vector);
        p.header.bias + projection
    }

    /// Margin between two raw vectors: the dot product alone, without any bias.
    fn margin_no_header(
        p: &UnalignedVector<Self::VectorCodec>,
        q: &UnalignedVector<Self::VectorCodec>,
    ) -> f32 {
        dot_product(p, q)
    }
}

/// Pseudo dot product of two binary quantized vectors.
///
/// Each pair of bytes is combined with a bitwise OR and the set bits of the results are
/// counted; the total is returned as an `f32`.
fn dot_product(u: &UnalignedVector<BinaryQuantized>, v: &UnalignedVector<BinaryQuantized>) -> f32 {
    // NOTE(review): the previous comment on this function described an xor, but the code
    // combines the bytes with a bitwise OR. Confirm which operator is intended.
    // /!\ If the number of dimensions is not a multiple of the `Word` size, the trailing padding
    //     bits also take part in the count. This may or may not impact relevancy since the same
    //     padding is presumably present in every vector (to be confirmed against the codec).
    let lhs = u.as_bytes();
    let rhs = v.as_bytes();
    let set_bits: u32 = lhs.iter().zip(rhs).map(|(a, b)| (a | b).count_ones()).sum();
    set_bits as f32
}

/// Manhattan distance between two binary quantized vectors.
///
/// Each pair of bytes is XORed so that only differing bits remain set; the number of
/// differing bits over the whole byte representation is returned as an `f32`.
fn manhattan_distance(
    u: &UnalignedVector<BinaryQuantized>,
    v: &UnalignedVector<BinaryQuantized>,
) -> f32 {
    let lhs = u.as_bytes();
    let rhs = v.as_bytes();
    let differing_bits: u32 = lhs.iter().zip(rhs).map(|(a, b)| (a ^ b).count_ones()).sum();
    differing_bits as f32
}
4 changes: 4 additions & 0 deletions src/distance/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ pub use angular::{Angular, NodeHeaderAngular};
pub use binary_quantized_euclidean::{
BinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedEuclidean,
};
pub use binary_quantized_manhattan::{
BinaryQuantizedManhattan, NodeHeaderBinaryQuantizedManhattan,
};
use bytemuck::{Pod, Zeroable};
pub use dot_product::{DotProduct, NodeHeaderDotProduct};
pub use euclidean::{Euclidean, NodeHeaderEuclidean};
Expand All @@ -20,6 +23,7 @@ use crate::NodeCodec;

mod angular;
mod binary_quantized_euclidean;
mod binary_quantized_manhattan;
mod dot_product;
mod euclidean;
mod manhattan;
Expand Down
7 changes: 4 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ pub mod internals {
use rand::Rng;

pub use crate::distance::{
NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderDotProduct,
NodeHeaderEuclidean, NodeHeaderManhattan,
NodeHeaderAngular, NodeHeaderBinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedManhattan,
NodeHeaderDotProduct, NodeHeaderEuclidean, NodeHeaderManhattan,
};
pub use crate::key::KeyCodec;
pub use crate::node::{Leaf, NodeCodec};
Expand Down Expand Up @@ -136,7 +136,8 @@ pub mod internals {
/// The set of distances implementing the [`Distance`] and supported by arroy.
pub mod distances {
pub use crate::distance::{
Angular, BinaryQuantizedEuclidean, DotProduct, Euclidean, Manhattan,
Angular, BinaryQuantizedEuclidean, BinaryQuantizedManhattan, DotProduct, Euclidean,
Manhattan,
};
}

Expand Down

0 comments on commit 7f004e8

Please sign in to comment.