diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..3873e329
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+Cargo.lock
+/target
+/assets/test.tree
diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 00000000..250124b7
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1,5 @@
+unstable_features = true
+
+use_small_heuristics = "max"
+imports_granularity = "Module"
+group_imports = "StdExternalCrate"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 00000000..bcd3f6a3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "arroy"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+annoy-rs = { path = "../RuAnnoy" }
+bytemuck = { version = "1.14.0", features = ["min_const_generics", "derive", "extern_crate_alloc"] }
+byteorder = "1.5.0"
+ordered-float = "4.1.1"
diff --git a/src/distance.rs b/src/distance.rs
new file mode 100644
index 00000000..78bee07b
--- /dev/null
+++ b/src/distance.rs
@@ -0,0 +1,35 @@
+pub fn dot_product_no_simd(u: &[f32], v: &[f32]) -> f32 {
+    u.iter().zip(v.iter()).map(|(x, y)| x * y).sum()
+}
+
+pub fn minkowski_margin(u: &[f32], v: &[f32], bias: f32) -> f32 {
+    bias + dot_product_no_simd(u, v)
+}
+
+pub fn cosine_distance_no_simd(u: &[f32], v: &[f32]) -> f32 {
+    // want to calculate (a/|a| - b/|b|)^2
+    // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b|
+    // = 2 - 2cos
+    let mut pp: f32 = 0.0;
+    let mut qq: f32 = 0.0;
+    let mut pq: f32 = 0.0;
+    for (_u, _v) in u.iter().zip(v.iter()) {
+        pp += _u * _u;
+        qq += _v * _v;
+        pq += _u * _v;
+    }
+    let ppqq = pp * qq;
+    if ppqq > 0.0 { // strictly positive: `is_sign_positive()` is true for 0.0 and would divide by sqrt(0)
+        2.0 - 2.0 * pq / ppqq.sqrt()
+    } else {
+        2.0
+    }
+}
+
+pub fn manhattan_distance_no_simd(u: &[f32], v: &[f32]) -> f32 {
+    u.iter().zip(v.iter()).map(|(x, y)| (x - y).abs()).sum()
+}
+
+pub fn euclidean_distance_no_simd(u: &[f32], v: &[f32]) -> f32 {
+    u.iter().zip(v.iter()).map(|(x, y)| (x - y).powi(2)).sum()
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 00000000..1b0a00b2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,287 @@
+use core::fmt;
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+use std::iter;
+use std::mem::size_of;
+
+use bytemuck::allocation::pod_collect_to_vec;
+use byteorder::{ByteOrder, NativeEndian};
+use distance::{euclidean_distance_no_simd, manhattan_distance_no_simd};
+use node::{NodeHeaderAngular, NodeHeaderDot, NodeHeaderMinkowski};
+use ordered_float::OrderedFloat;
+
+use crate::distance::{cosine_distance_no_simd, dot_product_no_simd, minkowski_margin};
+use crate::node::Node;
+use crate::priority_queue::BinaryHeapItem;
+
+mod distance;
+mod node;
+mod priority_queue;
+
+pub struct Arroy<'a> {
+    pub dimension: usize,
+    pub distance_type: DistanceType,
+    pub node_size: usize,
+    pub size: usize,
+    pub(crate) max_descendants: i32,
+    pub(crate) offset_before_children: usize,
+    pub(crate) node_header_size: usize,
+    pub(crate) storage: &'a [u8],
+    pub(crate) roots: Vec<usize>,
+}
+
+impl<'a> Arroy<'a> {
+    pub fn new(storage: &'a [u8], dimension: usize, distance_type: DistanceType) -> Arroy {
+        let (offset_before_children, node_header_size, max_descendants) = match distance_type {
+            DistanceType::Angular => (4, NodeHeaderAngular::header_size(), dimension + 2),
+            DistanceType::Euclidean | DistanceType::Manhattan => {
+                (8, NodeHeaderMinkowski::header_size(), dimension + 2)
+            }
+            // DistanceType::Hamming => (4, 12),
+            DistanceType::Dot => (4, NodeHeaderDot::header_size(), dimension + 3),
+        };
+
+        let index_size = storage.len();
+        let node_size = node_header_size + (size_of::<f32>() * dimension);
+
+        let mut roots = Vec::new();
+        let mut m = None;
+        let mut i = index_size - node_size;
+        loop {
+            let n_descendants = NativeEndian::read_i32(&storage[i..]);
+            if m.map_or(true, |m| n_descendants == m) {
+                roots.push(i / node_size);
+                m = Some(n_descendants);
+            } else {
+                break;
+            }
+            match i.checked_sub(node_size) {
+                Some(new) => i = new,
+                None => break,
+            }
+        }
+
+        // hacky fix: since the last root precedes the copy of all roots, delete it
+        if roots.len() > 1 {
+            if let Some((first, last)) = roots.first().zip(roots.last()) {
+                let first_descendant =
+                    get_nth_descendant_id(storage, first * node_size, offset_before_children, 0);
+                let last_descendant =
+                    get_nth_descendant_id(storage, last * node_size, offset_before_children, 0);
+                if first_descendant == last_descendant {
+                    roots.pop();
+                }
+            }
+        }
+
+        Arroy {
+            dimension,
+            distance_type,
+            offset_before_children,
+            node_header_size,
+            max_descendants: max_descendants as i32,
+            node_size,
+            storage,
+            roots,
+            size: m.unwrap() as usize,
+        }
+    }
+
+    pub fn item_vector(&self, item_index: usize) -> Option<Vec<f32>> {
+        if item_index < self.size {
+            let node_offset = item_index * self.node_size;
+            Some(self.node_slice_with_offset(node_offset))
+        } else {
+            None
+        }
+    }
+
+    pub fn nns_by_vector(
+        &self,
+        query_vector: &[f32],
+        n_results: usize,
+        search_k: Option<usize>,
+        // should_include_distance: bool,
+    ) -> Vec<(usize, f32)> {
+        assert_eq!(
+            query_vector.len(),
+            self.dimension,
+            "invalid vector dimensions, provided {} but expected {}",
+            query_vector.len(),
+            self.dimension
+        );
+
+        let result_capacity = n_results.min(self.size).max(1);
+        let search_k = search_k.unwrap_or(result_capacity * self.roots.len());
+
+        let mut pq = BinaryHeap::with_capacity(result_capacity);
+        for &item in &self.roots {
+            pq.push(BinaryHeapItem { item: item as i32, ord: OrderedFloat(f32::MAX) });
+        }
+
+        let mut nearest_neighbors = Vec::with_capacity(search_k);
+        while !pq.is_empty() && nearest_neighbors.len() < search_k {
+            if let Some(BinaryHeapItem { item: top_node_id_i32, ord: top_node_margin }) = pq.pop() {
+                let top_node_id = top_node_id_i32 as usize;
+                let top_node = Node::new_with_id(
+                    top_node_id,
+                    self.node_size,
+                    self.distance_type,
+                    self.storage,
+                );
+                let top_node_header = top_node.header;
+                let top_node_offset = top_node.offset;
+                let n_descendants = top_node_header.get_n_descendant();
+                if n_descendants == 1 && top_node_id < self.size {
+                    nearest_neighbors.push(top_node_id_i32);
+                } else if n_descendants <= self.max_descendants {
+                    let children_ids = self.descendant_ids(top_node_offset, n_descendants as usize);
+                    nearest_neighbors.extend(children_ids);
+                } else {
+                    let v = self.node_slice_with_offset(top_node_offset);
+                    let margin = self.margin(&v, query_vector, top_node_offset);
+                    let [child_0, child_1] = top_node_header.get_children_id_slice();
+                    // NOTE: Hamming has different logic to calculate margin
+                    pq.push(BinaryHeapItem {
+                        item: child_1,
+                        ord: OrderedFloat(top_node_margin.0.min(margin)),
+                    });
+                    pq.push(BinaryHeapItem {
+                        item: child_0,
+                        ord: OrderedFloat(top_node_margin.0.min(-margin)),
+                    });
+                }
+            }
+        }
+        nearest_neighbors.sort_unstable();
+        let mut sorted_nns = BinaryHeap::with_capacity(nearest_neighbors.len());
+        let mut nn_id_last = -1;
+        for nn_id in nearest_neighbors {
+            if nn_id == nn_id_last {
+                continue;
+            }
+            nn_id_last = nn_id;
+            let node =
+                Node::new_with_id(nn_id as usize, self.node_size, self.distance_type, self.storage);
+            let n_descendants = node.header.get_n_descendant();
+            if n_descendants != 1 {
+                continue;
+            }
+
+            let s = self.node_slice_with_offset(nn_id as usize * self.node_size);
+            sorted_nns.push(Reverse(BinaryHeapItem {
+                item: nn_id,
+                ord: OrderedFloat(self.distance_no_norm(&s, query_vector)),
+            }));
+        }
+
+        let final_result_capacity = n_results.min(sorted_nns.len());
+        let mut output = Vec::with_capacity(final_result_capacity);
+        for _ in 0..final_result_capacity {
+            match sorted_nns.pop() {
+                Some(Reverse(BinaryHeapItem { item, ord: OrderedFloat(dist) })) => {
+                    output.push((item as usize, self.normalized_distance(dist)));
+                }
+                None => break,
+            }
+        }
+
+        output
+    }
+
+    pub fn nns_by_item(
+        &self,
+        item: usize,
+        n_results: usize,
+        search_k: Option<usize>,
+    ) -> Option<Vec<(usize, f32)>> {
+        let query_vector = self.item_vector(item)?;
+        Some(self.nns_by_vector(&query_vector, n_results, search_k))
+    }
+
+    fn descendant_ids(&self, node_offset: usize, n: usize) -> impl Iterator<Item = i32> + 'a {
+        let offset = node_offset + self.offset_before_children;
+        let mut remaining = &self.storage[offset..offset + n * size_of::<i32>()];
+        iter::from_fn(move || {
+            if remaining.is_empty() {
+                None
+            } else {
+                let number = NativeEndian::read_i32(remaining);
+                remaining = &remaining[size_of::<i32>()..];
+                Some(number)
+            }
+        })
+    }
+
+    fn node_slice_with_offset(&self, node_offset: usize) -> Vec<f32> {
+        let offset = node_offset + self.node_header_size;
+        let size = self.dimension * size_of::<f32>();
+        pod_collect_to_vec(&self.storage[offset..offset + size])
+    }
+
+    fn margin(&self, v1: &[f32], v2: &[f32], node_offset: usize) -> f32 {
+        match self.distance_type {
+            DistanceType::Angular => dot_product_no_simd(v1, v2),
+            DistanceType::Euclidean | DistanceType::Manhattan => {
+                let bias = NativeEndian::read_f32(&self.storage[node_offset + 4..]);
+                minkowski_margin(v1, v2, bias)
+            }
+            DistanceType::Dot => {
+                let dot = NativeEndian::read_f32(&self.storage[node_offset + 12..]).powi(2);
+                dot_product_no_simd(v1, v2) + dot
+            }
+        }
+    }
+
+    fn distance_no_norm(&self, v1: &[f32], v2: &[f32]) -> f32 {
+        match self.distance_type {
+            DistanceType::Angular => cosine_distance_no_simd(v1, v2),
+            DistanceType::Euclidean => euclidean_distance_no_simd(v1, v2),
+            DistanceType::Manhattan => manhattan_distance_no_simd(v1, v2),
+            DistanceType::Dot => -dot_product_no_simd(v1, v2),
+        }
+    }
+
+    fn normalized_distance(&self, distance: f32) -> f32 {
+        match self.distance_type {
+            DistanceType::Angular | DistanceType::Euclidean => distance.sqrt(),
+            DistanceType::Dot => -distance,
+            DistanceType::Manhattan => distance,
+        }
+    }
+}
+
+impl fmt::Debug for Arroy<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Arroy")
+            .field("dimension", &self.dimension)
+            .field("distance_type", &self.distance_type)
+            .field("node_size", &self.node_size)
+            .field("size", &self.size)
+            .field("max_descendants", &self.max_descendants)
+            .field("offset_before_children", &self.offset_before_children)
+            .field("node_header_size", &self.node_header_size)
+            .field("roots", &self.roots)
+            .finish()
+    }
+}
+
+pub(crate) fn get_nth_descendant_id(
+    storage: &[u8],
+    node_offset: usize,
+    offset_before_children: usize,
+    n: usize,
+) -> usize {
+    let child_offset = node_offset + offset_before_children + n * size_of::<i32>();
+    NativeEndian::read_i32(&storage[child_offset..]) as usize
+}
+
+#[derive(PartialEq, Eq, Debug, Copy, Clone)]
+#[repr(u8)]
+pub enum DistanceType {
+    Angular = 0,
+    Euclidean = 1,
+    Manhattan = 2,
+    // Hamming = 3,
+    Dot = 4,
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 00000000..10c6d8bf
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,30 @@
+use annoy_rs::*;
+use arroy::{Arroy, DistanceType};
+
+fn main() {
+    let dimensions = 40;
+    let distance_type = DistanceType::Angular;
+    let tree = std::fs::read("test.tree").unwrap();
+    let arroy = Arroy::new(&tree[..], dimensions, distance_type);
+
+    // dbg!(&arroy);
+    let v = arroy.item_vector(0).unwrap();
+    let results = arroy.nns_by_item(0, 3, None).unwrap();
+    // println!("{v:?}");
+
+    let index = AnnoyIndex::load(40, "test.tree", IndexType::Angular).unwrap();
+    // dbg!(&index);
+    let v0 = index.get_item_vector(0);
+    let results0 = index.get_nearest_to_item(0, 3, -1, true);
+    // println!("{v0:?}");
+
+    assert_eq!(v, v0);
+
+    assert_eq!(results[0].0, results0.id_list[0] as usize);
+    assert_eq!(results[1].0, results0.id_list[1] as usize);
+    assert_eq!(results[2].0, results0.id_list[2] as usize);
+
+    assert_eq!(results[0].1, results0.distance_list[0]);
+    assert_eq!(results[1].1, results0.distance_list[1]);
+    assert_eq!(results[2].1, results0.distance_list[2]);
+}
diff --git a/src/node.rs b/src/node.rs
new file mode 100644
index 00000000..6cea6755
--- /dev/null
+++ b/src/node.rs
@@ -0,0 +1,121 @@
+use std::mem;
+
+use crate::DistanceType;
+use bytemuck::{Pod, Zeroable};
+
+pub(crate) struct Node {
+    pub offset: usize,
+    pub header: NodeHeader,
+}
+
+impl Node {
+    pub fn new_with_id(
+        id: usize,
+        node_size: usize,
+        index_type: DistanceType,
+        storage: &[u8],
+    ) -> Node {
+        let offset = id * node_size;
+        Node {
+            offset,
+            header: NodeHeader::new(offset, index_type, storage),
+        }
+    }
+}
+
+#[repr(C)]
+pub(crate) enum NodeHeader {
+    Angular(NodeHeaderAngular),
+    Minkowski(NodeHeaderMinkowski),
+    Dot(NodeHeaderDot),
+}
+
+impl NodeHeader {
+    pub fn new(offset: usize, distance_type: DistanceType, storage: &[u8]) -> NodeHeader {
+        match distance_type {
+            DistanceType::Angular => NodeHeader::Angular(NodeHeaderAngular::read(storage, offset)),
+            DistanceType::Euclidean | DistanceType::Manhattan => {
+                NodeHeader::Minkowski(NodeHeaderMinkowski::read(storage, offset))
+            }
+            DistanceType::Dot => NodeHeader::Dot(NodeHeaderDot::read(storage, offset)),
+        }
+    }
+
+    pub fn get_n_descendant(&self) -> i32 {
+        match self {
+            NodeHeader::Angular(h) => h.n_descendants,
+            NodeHeader::Minkowski(h) => h.n_descendants,
+            NodeHeader::Dot(h) => h.n_descendants,
+        }
+    }
+
+    pub fn get_children_id_slice(&self) -> [i32; 2] {
+        match self {
+            NodeHeader::Angular(h) => h.children,
+            NodeHeader::Minkowski(h) => h.children,
+            NodeHeader::Dot(h) => h.children,
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Pod, Zeroable, Debug, Clone, Copy)]
+pub struct NodeHeaderAngular {
+    n_descendants: i32,
+    children: [i32; 2],
+}
+
+#[repr(C)]
+#[derive(Pod, Zeroable, Debug, Clone, Copy)]
+pub struct NodeHeaderMinkowski {
+    n_descendants: i32,
+    bias: f32,
+    children: [i32; 2],
+}
+
+#[repr(C)]
+#[derive(Pod, Zeroable, Debug, Clone, Copy)]
+pub struct NodeHeaderDot {
+    n_descendants: i32,
+    children: [i32; 2],
+    dot_factor: f32,
+}
+
+impl NodeHeaderAngular {
+    fn read(storage: &[u8], offset: usize) -> NodeHeaderAngular {
+        let array: [u8; mem::size_of::<NodeHeaderAngular>()] = storage
+            [offset..offset + mem::size_of::<NodeHeaderAngular>()]
+            .try_into()
+            .unwrap();
+        bytemuck::cast(array)
+    }
+
+    pub const fn header_size() -> usize {
+        mem::size_of::<NodeHeaderAngular>()
+    }
+}
+
+impl NodeHeaderMinkowski {
+    fn read(storage: &[u8], offset: usize) -> NodeHeaderMinkowski {
+        let array: [u8; mem::size_of::<NodeHeaderMinkowski>()] = storage
+            [offset..offset + mem::size_of::<NodeHeaderMinkowski>()]
+            .try_into()
+            .unwrap();
+        bytemuck::cast(array)
+    }
+
+    pub const fn header_size() -> usize {
+        mem::size_of::<NodeHeaderMinkowski>()
+    }
+}
+
+impl NodeHeaderDot {
+    fn read(storage: &[u8], offset: usize) -> NodeHeaderDot {
+        let array: [u8; mem::size_of::<NodeHeaderDot>()] = storage
+            [offset..offset + mem::size_of::<NodeHeaderDot>()]
+            .try_into()
+            .unwrap();
+        bytemuck::cast(array)
+    }
+
+    pub const fn header_size() -> usize {
+        mem::size_of::<NodeHeaderDot>()
+    }
+}
diff --git a/src/priority_queue.rs b/src/priority_queue.rs
new file mode 100644
index 00000000..3f6437ae
--- /dev/null
+++ b/src/priority_queue.rs
@@ -0,0 +1,264 @@
+use std::cmp::Ordering;
+use std::fmt::Debug;
+
+#[derive(Debug, Clone, Eq)]
+pub struct BinaryHeapItem<I, O>
+where
+    I: Eq,
+    O: PartialEq + Eq + PartialOrd,
+{
+    pub item: I,
+    pub ord: O,
+}
+
+impl<I, O> PartialEq for BinaryHeapItem<I, O>
+where
+    I: Eq,
+    O: PartialEq + Eq + PartialOrd,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.ord == other.ord
+    }
+}
+
+impl<I, O> Ord for BinaryHeapItem<I, O>
+where
+    I: Eq,
+    O: PartialEq + Eq + PartialOrd,
+{
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.ord.partial_cmp(&other.ord).unwrap_or(Ordering::Equal) // compare `ord` fields directly; delegating to Self::partial_cmp would recurse infinitely
+    }
+}
+
+impl<I, O> PartialOrd for BinaryHeapItem<I, O>
+where
+    I: Eq,
+    O: PartialEq + Eq + PartialOrd,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::cmp::Ordering;
+    use std::fmt::Debug;
+
+    pub struct PriorityQueue<K, P>
+    where
+        P: PartialOrd,
+    {
+        ord: Ordering,
+        keys: Vec<K>,
+        priorities: Vec<P>,
+    }
+
+    impl<K, P> PriorityQueue<K, P>
+    where
+        P: PartialOrd + Debug,
+    {
+        pub fn with_capacity(capacity: usize, reverse: bool) -> PriorityQueue<K, P> {
+            PriorityQueue {
+                keys: Vec::with_capacity(capacity),
+                priorities: Vec::with_capacity(capacity),
+                ord: match reverse {
+                    true => Ordering::Greater,
+                    _ => Ordering::Less,
+                },
+            }
+        }
+
+        pub fn push(&mut self, key: K, priority: P) {
+            self.keys.push(key);
+            self.priorities.push(priority);
+            let pos = self.priorities.len() - 1;
+            if pos > 0 && !self.max_heap_up_adjust(pos) {
+                self.keys.pop();
+                self.priorities.pop();
+            }
+        }
+
+        fn max_heap_up_adjust(&mut self, position: usize) -> bool {
+            let mut pos = position;
+            let priorities = self.priorities.as_mut_slice();
+            let keys = self.keys.as_mut_slice();
+            while pos > 0 {
+                let p_pos = (pos - 1) / 2;
+                match priorities[p_pos].partial_cmp(&priorities[pos]) {
+                    None => return false,
+                    Some(ord) if ord == self.ord => {
+                        priorities.swap(pos, p_pos);
+                        keys.swap(pos, p_pos);
+                        pos = p_pos;
+                    }
+                    _ => return true,
+                }
+            }
+            true
+        }
+
+        fn max_heap_down_adjust(&mut self, position: usize) -> bool {
+            let len = self.len();
+            let mut pos = position;
+            let priorities = self.priorities.as_mut_slice();
+            let keys = self.keys.as_mut_slice();
+            loop {
+                let lc_pos = pos * 2 + 1;
+                let rc_pos = pos * 2 + 2;
+                let mut largest_pos = pos;
+                if lc_pos < len {
+                    largest_pos = match priorities[largest_pos].partial_cmp(&priorities[lc_pos]) {
+                        None => return false,
+                        Some(ord) if ord == self.ord => lc_pos,
+                        _ => largest_pos,
+                    };
+                }
+                if rc_pos < len {
+                    largest_pos = match priorities[largest_pos].partial_cmp(&priorities[rc_pos]) {
+                        None => return false,
+                        Some(ord) if ord == self.ord => rc_pos,
+                        _ => largest_pos,
+                    };
+                }
+                if largest_pos != pos {
+                    priorities.swap(largest_pos, pos);
+                    keys.swap(largest_pos, pos);
+                    pos = largest_pos;
+                } else {
+                    break;
+                }
+            }
+            true
+        }
+
+        // fn max_heap_build(&mut self) {
+        //     let len = self.len();
+        //     let priorities = self.priorities.as_mut_slice();
+        //     let keys = self.keys.as_mut_slice();
+        //     for pos in (0..len / 2).rev() {
+        //         let lc_pos = pos * 2 + 1;
+        //         let rc_pos = pos * 2 + 2;
+        //         let mut largest_pos = pos;
+        //         if lc_pos < len {
+        //             largest_pos = match priorities[largest_pos].partial_cmp(&priorities[lc_pos]) {
+        //                 None => return,
+        //                 Some(Ordering::Less) => lc_pos,
+        //                 _ => largest_pos,
+        //             };
+        //         }
+        //         if rc_pos < len {
+        //             largest_pos = match priorities[largest_pos].partial_cmp(&priorities[rc_pos]) {
+        //                 None => return,
+        //                 Some(Ordering::Less) => rc_pos,
+        //                 _ => largest_pos,
+        //             };
+        //         }
+        //         if largest_pos != pos {
+        //             priorities.swap(largest_pos, pos);
+        //             keys.swap(largest_pos, pos);
+        //         }
+        //     }
+        // }
+
+        pub fn pop(&mut self) -> Option<(K, P)> {
+            let len = self.len();
+            if len > 0 {
+                let k = self.keys.swap_remove(0);
+                let p = self.priorities.swap_remove(0);
+                if len > 2 {
+                    // self.max_heap_build();
+                    self.max_heap_down_adjust(0);
+                }
+                return Some((k, p));
+            }
+            None
+        }
+
+        pub fn len(&self) -> usize {
+            self.keys.len()
+        }
+    }
+
+    #[test]
+    fn test_pq_1() {
+        test_qp_inner(vec![5, 7, 9, 2, 4, 1].as_mut_slice());
+    }
+
+    #[test]
+    fn test_pq_2() {
+        test_qp_inner(vec![1, 2, 3, 4, 5, 6, 7, 8, 9].as_mut_slice());
+    }
+
+    #[test]
+    fn test_pq_3() {
+        test_qp_inner(vec![9, 8, 7, 6, 5, 4, 3, 2, 1].as_mut_slice());
+    }
+
+    fn test_qp_inner<T: PartialOrd + Copy + Debug>(s: &mut [T]) {
+        let mut pq = PriorityQueue::with_capacity(s.len(), false);
+        for &i in s.iter() {
+            pq.push(i, i);
+        }
+        assert_eq!(pq.len(), s.len());
+
+        let mut sorted = Vec::with_capacity(s.len());
+        while pq.len() > 0 {
+            if let Some((k, _p)) = pq.pop() {
+                sorted.push(k);
+            }
+        }
+
+        s.sort_by(|a, b| b.partial_cmp(a).unwrap());
+        assert_eq!(s, sorted.as_slice());
+    }
+}
+
+#[cfg(test)]
+#[cfg(nightly)]
+mod bench {
+    use std::collections::BinaryHeap;
+
+    use super::tests::*;
+    use super::*;
+    extern crate test;
+    use test::Bencher;
+
+    #[bench]
+    fn bench_pq(bencher: &mut Bencher) {
+        let input = get_input();
+        bencher.iter(|| {
+            let mut pq = PriorityQueue::with_capacity(input.len(), false);
+            for &i in input.iter() {
+                pq.push(i, i);
+            }
+            while pq.len() > 0 {
+                pq.pop();
+            }
+        });
+    }
+
+    #[bench]
+    fn bench_binary_heap(bencher: &mut Bencher) {
+        let input = get_input();
+        bencher.iter(|| {
+            let mut pq = BinaryHeap::with_capacity(input.len());
+            for &i in input.iter() {
+                pq.push(BinaryHeapItem { item: i, ord: i });
+            }
+            while pq.len() > 0 {
+                pq.pop();
+            }
+        });
+    }
+
+    fn get_input() -> Vec<i32> {
+        let capacity = 1024;
+        let mut vec = Vec::with_capacity(capacity);
+        for i in 0..capacity {
+            vec.push(i as i32);
+        }
+        vec
+    }
+}