Skip to content

Commit

Permalink
re-implements display for all kind of nodes taking the distance into …
Browse files Browse the repository at this point in the history
…account
  • Loading branch information
irevoire committed Jul 4, 2024
1 parent 692f1ef commit 27ed4d4
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 55 deletions.
21 changes: 20 additions & 1 deletion src/distance/binary_quantized_euclidean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use rand::Rng;

use super::two_means;
use crate::distance::Distance;
use crate::node::{Leaf, UnalignedVector};
use crate::node::{Leaf, SizeMismatch, UnalignedVector};
use crate::parallel::ImmutableSubsetLeafs;
use crate::spaces::simple::dot_product;

Expand All @@ -31,6 +31,25 @@ impl Distance for BinaryQuantizedEuclidean {
"binary quantized euclidean"
}

/// Builds the binary quantized representation of an owned vector of `f32`.
///
/// Delegates to `UnalignedVector::binary_quantized_vectors_from_slice` and
/// returns the owned, `'static` result it produces.
fn craft_owned_unaligned_vector_from_f32(vector: Vec<f32>) -> Cow<'static, UnalignedVector> {
// Quantizing allocates a new buffer anyway, so we call the version that takes a reference.
UnalignedVector::binary_quantized_vectors_from_slice(&vector)
}

/// Builds the binary quantized representation of a borrowed slice of `f32`
/// by delegating to `UnalignedVector::binary_quantized_vectors_from_slice`.
fn craft_unaligned_vector_from_f32(vector: &[f32]) -> Cow<UnalignedVector> {
UnalignedVector::binary_quantized_vectors_from_slice(vector)
}

/// Views raw bytes as a binary quantized vector, borrowing them.
///
/// Returns `SizeMismatch` when `UnalignedVector::quantized_vectors_from_bytes`
/// rejects the length of `vector`.
fn craft_unaligned_vector_from_bytes(
vector: &[u8],
) -> Result<Cow<UnalignedVector>, SizeMismatch> {
UnalignedVector::quantized_vectors_from_bytes(vector).map(Cow::Borrowed)
}

/// Decodes a binary quantized vector back into a `Vec<f32>` by collecting
/// the values yielded by `UnalignedVector::iter_binary_quantized`.
fn read_unaligned_vector(vector: &UnalignedVector) -> Vec<f32> {
vector.iter_binary_quantized().collect()
}

/// Creates the header for a new node.
///
/// The vector is ignored: the header always starts with a `bias` of `0.0`.
fn new_header(_vector: &UnalignedVector) -> Self::Header {
NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }
}
Expand Down
3 changes: 3 additions & 0 deletions src/distance/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ use std::borrow::Cow;
use std::fmt;

pub use angular::{Angular, NodeHeaderAngular};
pub use binary_quantized_euclidean::{
BinaryQuantizedEuclidean, NodeHeaderBinaryQuantizedEuclidean,

Check warning on line 6 in src/distance/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, stable)

unused imports: `BinaryQuantizedEuclidean`, `NodeHeaderBinaryQuantizedEuclidean`

Check warning on line 6 in src/distance/mod.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, stable)

unused imports: `BinaryQuantizedEuclidean`, `NodeHeaderBinaryQuantizedEuclidean`

Check warning on line 6 in src/distance/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, beta)

unused imports: `BinaryQuantizedEuclidean` and `NodeHeaderBinaryQuantizedEuclidean`

Check warning on line 6 in src/distance/mod.rs

View workflow job for this annotation

GitHub Actions / lint

unused imports: `BinaryQuantizedEuclidean`, `NodeHeaderBinaryQuantizedEuclidean`
};
use bytemuck::{Pod, Zeroable};
pub use dot_product::{DotProduct, NodeHeaderDotProduct};
pub use euclidean::{Euclidean, NodeHeaderEuclidean};
Expand Down
132 changes: 86 additions & 46 deletions src/node.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::borrow::{Borrow, Cow};
use std::fmt;
use std::marker::PhantomData;
use std::mem::{size_of, transmute};

use bytemuck::{bytes_of, cast_slice, pod_collect_to_vec, pod_read_unaligned};
Expand All @@ -10,13 +11,26 @@ use roaring::RoaringBitmap;
use crate::distance::Distance;
use crate::{ItemId, NodeId};

#[derive(Debug, Clone)]
#[derive(Clone)]
/// One of the three kinds of nodes handled by this module.
///
/// The distance `D` is carried by the type because the leaf stores a
/// distance-specific header (`D::Header`).
pub enum Node<'a, D: Distance> {
/// A [`Leaf`]: a distance header together with a vector.
Leaf(Leaf<'a, D>),
/// A [`Descendants`] node.
Descendants(Descendants<'a>),
/// A [`SplitPlaneNormal`] node: two node ids (`left`, `right`) and a normal vector.
SplitPlaneNormal(SplitPlaneNormal<'a>),
}

/// Manual `Debug` implementation: the split plane normal can only be decoded
/// with the help of the distance `D`, so it is printed through
/// `DisplaySplitPlaneNormal` instead of a derived implementation.
impl<D: Distance> fmt::Debug for Node<'_, D> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Node::Leaf(leaf) => {
                let mut tuple = f.debug_tuple("Leaf");
                tuple.field(leaf);
                tuple.finish()
            }
            Node::Descendants(desc) => {
                let mut tuple = f.debug_tuple("Descendants");
                tuple.field(desc);
                tuple.finish()
            }
            Node::SplitPlaneNormal(split) => {
                // Attach the distance type so that the normal can be read back as floats.
                let readable = DisplaySplitPlaneNormal::<D>(split, PhantomData);
                let mut tuple = f.debug_tuple("SplitPlaneNormal");
                tuple.field(&readable);
                tuple.finish()
            }
        }
    }
}

const LEAF_TAG: u8 = 0;
const DESCENDANTS_TAG: u8 = 1;
const SPLIT_PLANE_NORMAL_TAG: u8 = 2;
Expand All @@ -31,16 +45,35 @@ impl<'a, D: Distance> Node<'a, D> {
}
}

/// Helper wrapper giving a list of floats a compact `Debug` output.
/// It backs the `Debug` implementations of the `Leaf` and the `SplitPlaneNormal`.
struct DisplayVec(Vec<f32>);

impl fmt::Debug for DisplayVec {
    /// Prints the floats as a debug list, each one with four digits of precision.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut entries = f.debug_list();
        for value in &self.0 {
            entries.entry(&format_args!("{:.4?}", value));
        }
        entries.finish()
    }
}

/// A leaf node which corresponds to the vector provided
/// by the user and the distance header.
#[derive(Debug, Clone)]
#[derive(Clone)]
pub struct Leaf<'a, D: Distance> {
/// The header of this leaf.
pub header: D::Header,
/// The vector of this leaf.
pub vector: Cow<'a, UnalignedVector>,
}

impl<D: Distance> fmt::Debug for Leaf<'_, D> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let vec = DisplayVec(D::read_unaligned_vector(&self.vector));
f.debug_struct("Leaf").field("header", &self.header).field("vector", &vec).finish()
}
}

impl<D: Distance> Leaf<'_, D> {
/// Converts the leaf into an owned version of itself by cloning
/// the internal vector. Doing so will make it mutable.
Expand Down Expand Up @@ -83,16 +116,6 @@ impl UnalignedVector {
}
}

/// Reinterprets a slice of bytes as a quantized `UnalignedVector` without copying.
///
/// Returns `SizeMismatch` when the length of `bytes` is not a multiple of
/// `size_of::<QuantizedWord>()`.
pub(crate) fn quantized_vectors_from_bytes(bytes: &[u8]) -> Result<&Self, SizeMismatch> {
if bytes.len() % size_of::<QuantizedWord>() == 0 {
// safety: NOTE(review) the original comment names `UnalignedF32Slice`, but the target
// type here is `Self`; the transmute presumably relies on `Self` being a transparent
// wrapper over `[u8]` — confirm against the type declaration.
Ok(unsafe { transmute(bytes) })
} else {
Err(SizeMismatch)
}
}

/// Creates an unaligned slice of f32 wrapper from a slice of f32.
/// The slice is already known to be of the right length.
pub(crate) fn f32_vectors_from_f32_slice(slice: &[f32]) -> &Self {
Expand All @@ -106,16 +129,27 @@ impl UnalignedVector {
Cow::Owned(bytes)
}

/// Creates an unaligned slice of f32 wrapper from a slice of f32.
/// The slice is already known to be of the right length.
pub(crate) fn binary_quantized_vectors_from_slice(slice: &[f32]) -> Cow<Self> {
let mut output: Vec<u8> = vec![0; slice.len() / QUANTIZED_WORD_SIZE];
for chunk in slice.chunks_exact(QUANTIZED_WORD_SIZE) {
/// Creates a binary quantized wrapper from a slice of bytes, without copying.
///
/// Returns `SizeMismatch` when the length of `bytes` is not a multiple of
/// `size_of::<QuantizedWord>()`.
pub(crate) fn quantized_vectors_from_bytes(bytes: &[u8]) -> Result<&Self, SizeMismatch> {
if bytes.len() % size_of::<QuantizedWord>() == 0 {
// safety: NOTE(review) the original comment names `UnalignedF32Slice`, but the target
// type here is `Self`; the transmute presumably relies on `Self` being a transparent
// wrapper over `[u8]` — confirm against the type declaration.
Ok(unsafe { transmute(bytes) })
} else {
Err(SizeMismatch)
}
}

/// Creates a binary quantized unaligned slice of bytes from a slice of f32.
/// Will allocate.
pub(crate) fn binary_quantized_vectors_from_slice(slice: &[f32]) -> Cow<'static, Self> {
let mut output: Vec<u8> = Vec::with_capacity(slice.len() / QUANTIZED_WORD_SIZE);
for chunk in slice.chunks(QUANTIZED_WORD_SIZE) {
let mut word: QuantizedWord = 0;
for bit in chunk {
let bit = bit.is_sign_positive();
todo!()
for bit in chunk.iter().rev() {
word <<= 1;
word += bit.is_sign_positive() as QuantizedWord;
}
output.extend_from_slice(&word.to_ne_bytes());
}

Cow::Owned(output)
Expand All @@ -142,14 +176,23 @@ impl UnalignedVector {
self.0.chunks_exact(size_of::<f32>()).map(NativeEndian::read_f32)
}

/// Returns an iterator of f32 that are read from the slice.
/// Returns an iterator of f32 that are read from the binary quantized slice.
/// The f32 are copied in memory and are therefore, aligned.
pub(crate) fn map_f32(&mut self, f: impl Fn(f32) -> f32) {
self.0.chunks_exact_mut(size_of::<f32>()).for_each(|chunk| {
let mut scalar = NativeEndian::read_f32(chunk);
scalar = f(scalar);
NativeEndian::write_f32(chunk, scalar);
})
/// Returns an iterator over the `f32` values decoded from the binary quantized slice.
///
/// The bytes are read word by word (`size_of::<QuantizedWord>()` bytes each, in
/// native endianness). Every word yields `QUANTIZED_WORD_SIZE` floats, starting
/// from its least significant bit: `1.0` for a set bit and `0.0` otherwise.
pub(crate) fn iter_binary_quantized(&self) -> impl Iterator<Item = f32> + '_ {
    self.0
        .chunks_exact(size_of::<QuantizedWord>())
        // `chunks_exact` guarantees the chunk length, so the conversion cannot fail.
        .map(|bytes| QuantizedWord::from_ne_bytes(bytes.try_into().unwrap()))
        .flat_map(|mut word| {
            // Decode lazily, one bit per step, instead of allocating a `Vec` per word.
            (0..QUANTIZED_WORD_SIZE).map(move |_| {
                let bit = word & 1;
                word >>= 1;
                if bit == 1 {
                    1.0
                } else {
                    0.0
                }
            })
        })
}

/// Returns the raw pointer to the start of this slice.
Expand All @@ -172,24 +215,7 @@ impl ToOwned for UnalignedVector {

/// Lets a `Vec<u8>` be borrowed as an `UnalignedVector`.
impl Borrow<UnalignedVector> for Vec<u8> {
/// Views the bytes of the `Vec` through `UnalignedVector::from_bytes_unchecked`.
fn borrow(&self) -> &UnalignedVector {
UnalignedVector::from_bytes_unchecked(&self)
}
}

impl fmt::Debug for UnalignedVector {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
struct SmallF32(f32);
impl fmt::Debug for SmallF32 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_fmt(format_args!("{:.4?}", self.0))
}
}

let mut list = f.debug_list();
self.iter_f32().for_each(|float| {
list.entry(&SmallF32(float));
});
list.finish()
UnalignedVector::from_bytes_unchecked(self)
}
}

Expand Down Expand Up @@ -244,13 +270,27 @@ impl fmt::Debug for ItemIds<'_> {
}
}

#[derive(Debug, Clone)]
#[derive(Clone)]
/// A split node made of two node ids and a normal vector.
///
/// The normal is stored as raw bytes; reading it back requires a distance
/// (see `DisplaySplitPlaneNormal`).
pub struct SplitPlaneNormal<'a> {
/// Id of the left node. NOTE(review): presumably the child on one side of the plane — confirm.
pub left: NodeId,
/// Id of the right node. NOTE(review): presumably the child on the other side of the plane — confirm.
pub right: NodeId,
/// The normal of the split plane, in the representation of the distance in use.
pub normal: Cow<'a, UnalignedVector>,
}

/// Wraps a `SplitPlaneNormal` with its distance type to display it.
/// The distance is required to be able to read the normal.
pub struct DisplaySplitPlaneNormal<'a, D: Distance>(&'a SplitPlaneNormal<'a>, PhantomData<D>);
impl<D: Distance> fmt::Debug for DisplaySplitPlaneNormal<'_, D> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let normal = DisplayVec(D::read_unaligned_vector(&self.0.normal));
f.debug_struct("SplitPlaneNormal")
.field("left", &self.0.left)
.field("right", &self.0.right)
.field("normal", &normal)
.finish()
}
}

/// The codec used internally to encode and decode nodes.
///
/// NOTE(review): `D` is presumably the distance type of the encoded nodes
/// (the writer uses `NodeCodec<ND>` when storing leaves) — confirm.
pub struct NodeCodec<D>(D);

Expand Down
8 changes: 6 additions & 2 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,12 @@ impl<'t, D: Distance> Reader<'t, D> {

/// Returns the vector of the given `item` if it was previously added.
///
/// Returns `Ok(None)` when no leaf exists for `item`. The decoded vector is
/// cut down to `self.dimensions()` elements before being returned.
pub fn item_vector(&self, rtxn: &'t RoTxn, item: ItemId) -> Result<Option<Vec<f32>>> {
    Ok(item_leaf(self.database, self.index, rtxn, item)?.map(|leaf| {
        let mut vec = D::read_unaligned_vector(&leaf.vector);
        // Depending on the distance we may have up to 63 additional elements in the vec.
        // `truncate` is a no-op when the vec is already short enough, whereas
        // `drain(dimensions..)` would panic if the vec were shorter than the dimensions.
        vec.truncate(self.dimensions());
        vec
    }))
}

/// Returns `true` if the index is empty.
Expand Down
1 change: 1 addition & 0 deletions src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use tempfile::TempDir;
use crate::roaring::RoaringBitmapCodec;
use crate::{Database, Distance, MetadataCodec, NodeCodec, NodeMode, Reader};

mod binary_quantized;

Check failure on line 12 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, stable)

file not found for module `binary_quantized`

Check failure on line 12 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, stable)

file not found for module `binary_quantized`

Check failure on line 12 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, beta)

file not found for module `binary_quantized`
mod node;

Check failure on line 13 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, stable)

file not found for module `node`

Check failure on line 13 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, stable)

file not found for module `node`

Check failure on line 13 in src/tests/mod.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, beta)

file not found for module `node`
mod reader;
mod writer;
Expand Down
14 changes: 8 additions & 6 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,9 @@ impl<D: Distance> Writer<D> {
while let Some((item_id, node)) = cursor.next().transpose()? {
match node {
Node::Leaf(Leaf { header: _, vector }) => {
let new_leaf = Node::Leaf(Leaf {
header: ND::new_header(&vector),
vector: Cow::Owned(vector.into_owned()),
});
let vector = D::read_unaligned_vector(&vector);
let vector = ND::craft_owned_unaligned_vector_from_f32(vector);
let new_leaf = Node::Leaf(Leaf { header: ND::new_header(&vector), vector });
unsafe {
// safety: We do not keep a reference to the current value, we own it.
cursor.put_current_with_options::<NodeCodec<ND>>(
Expand Down Expand Up @@ -87,8 +86,11 @@ impl<D: Distance> Writer<D> {

/// Returns an `Option`al vector previously stored in this database.
///
/// Returns `Ok(None)` when no leaf exists for `item`. The decoded vector is
/// cut down to `self.dimensions` elements before being returned.
pub fn item_vector(&self, rtxn: &RoTxn, item: ItemId) -> Result<Option<Vec<f32>>> {
    Ok(item_leaf(self.database, self.index, rtxn, item)?.map(|leaf| {
        let mut vec = D::read_unaligned_vector(&leaf.vector);
        // Some distances decode more elements than the stored dimensions, so the
        // extra ones are dropped. `truncate` is a no-op when the vec is already short
        // enough, whereas `drain(dimensions..)` would panic on a shorter vec.
        vec.truncate(self.dimensions);
        vec
    }))
}

/// Returns `true` if the index is empty.
Expand Down

0 comments on commit 27ed4d4

Please sign in to comment.