From c087a68c4f42d13451bff011e0efd2c195070b38 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 18:10:11 +0200 Subject: [PATCH 1/8] Provide a first builder for the build options --- examples/build-tree-no-commit.rs | 8 ++- examples/compare_with_hnsw.rs | 4 +- examples/import-vectors.rs | 4 +- src/lib.rs | 6 +- src/tests/binary_quantized.rs | 4 +- src/tests/reader.rs | 20 +++--- src/tests/writer.rs | 104 +++++++++++++-------------- src/writer.rs | 117 ++++++++++++++++++++++++++----- 8 files changed, 176 insertions(+), 91 deletions(-) diff --git a/examples/build-tree-no-commit.rs b/examples/build-tree-no-commit.rs index 740f8c8c..7a28b791 100644 --- a/examples/build-tree-no-commit.rs +++ b/examples/build-tree-no-commit.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use std::time::Instant; use arroy::distances::DotProduct; -use arroy::{Database, Reader, Stats, TreeStats, Writer}; +use arroy::{BuildOption, Database, Reader, Stats, TreeStats, Writer}; use clap::Parser; use heed::{EnvFlags, EnvOpenOptions}; use rand::rngs::StdRng; @@ -64,7 +64,11 @@ fn main() -> Result<(), heed::BoxedError> { let now = Instant::now(); println!("Building the arroy internal trees..."); let mut rng = StdRng::seed_from_u64(seed); - writer.build(&mut wtxn, &mut rng, n_trees).unwrap(); + let mut options = BuildOption::new(); + if let Some(n_trees) = n_trees { + options.with_n_trees(n_trees); + } + writer.build(&mut wtxn, &mut rng, &options).unwrap(); println!("Took {:.2?} to build", now.elapsed()); let reader = Reader::open(&wtxn, 0, database)?; diff --git a/examples/compare_with_hnsw.rs b/examples/compare_with_hnsw.rs index b97ccbe8..7eb619ad 100644 --- a/examples/compare_with_hnsw.rs +++ b/examples/compare_with_hnsw.rs @@ -3,7 +3,7 @@ use std::time::Instant; use arroy::distances::Euclidean; use arroy::internals::{Leaf, UnalignedVector}; -use arroy::{Database, Distance, ItemId, Reader, Result, Writer}; +use arroy::{BuildOption, Database, Distance, ItemId, Reader, Result, Writer}; use 
heed::{EnvOpenOptions, RwTxn}; use instant_distance::{Builder, HnswMap, MapItem}; use rand::rngs::StdRng; @@ -79,7 +79,7 @@ fn load_into_arroy( for (i, Point(vector)) in points.iter().enumerate() { writer.add_item(&mut wtxn, i.try_into().unwrap(), &vector[..])?; } - writer.build(&mut wtxn, rng, None)?; + writer.build(&mut wtxn, rng, &BuildOption::default())?; wtxn.commit()?; Ok(()) diff --git a/examples/import-vectors.rs b/examples/import-vectors.rs index a5b861d5..516f18a7 100644 --- a/examples/import-vectors.rs +++ b/examples/import-vectors.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; use std::time::Instant; use arroy::distances::DotProduct; -use arroy::{Database, Writer}; +use arroy::{BuildOption, Database, Writer}; use clap::Parser; use heed::{EnvFlags, EnvOpenOptions}; use rand::rngs::StdRng; @@ -102,7 +102,7 @@ fn main() -> Result<(), heed::BoxedError> { println!("Building the arroy internal trees..."); let now = Instant::now(); - writer.build(&mut wtxn, &mut rng, n_trees).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_maybe_n_trees(n_trees)).unwrap(); wtxn.commit().unwrap(); println!("Took {:.2?} to build", now.elapsed()); diff --git a/src/lib.rs b/src/lib.rs index 2d4f89da..55e31a87 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ //! use std::num::NonZeroUsize; //! //! use arroy::distances::Euclidean; -//! use arroy::{Database as ArroyDatabase, Writer, Reader}; +//! use arroy::{Database as ArroyDatabase, Writer, Reader, BuildOption}; //! use rand::rngs::StdRng; //! use rand::{Rng, SeedableRng}; //! @@ -40,7 +40,7 @@ //! //! // You can specify the number of trees to use or specify None. //! let mut rng = StdRng::seed_from_u64(42); -//! writer.build(&mut wtxn, &mut rng, None)?; +//! writer.build(&mut wtxn, &mut rng, &BuildOption::new())?; //! //! // By committing, other readers can query the database in parallel. //! 
wtxn.commit()?; @@ -98,7 +98,7 @@ use node::{Node, NodeCodec}; use node_id::{NodeId, NodeMode}; pub use reader::Reader; pub use stats::{Stats, TreeStats}; -pub use writer::Writer; +pub use writer::{BuildOption, Writer}; /// The set of types used by the [`Distance`] trait. pub mod internals { diff --git a/src/tests/binary_quantized.rs b/src/tests/binary_quantized.rs index 8eb475f1..8e98e2a7 100644 --- a/src/tests/binary_quantized.rs +++ b/src/tests/binary_quantized.rs @@ -1,7 +1,7 @@ use crate::{ distance::BinaryQuantizedEuclidean, tests::{create_database, rng}, - Writer, + BuildOption, Writer, }; #[test] @@ -41,7 +41,7 @@ fn write_and_retrieve_binary_quantized_vector() { ] "###); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" diff --git a/src/tests/reader.rs b/src/tests/reader.rs index e0d7d94a..6beed985 100644 --- a/src/tests/reader.rs +++ b/src/tests/reader.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::*; use crate::distance::Angular; use crate::distances::{Euclidean, Manhattan}; -use crate::{ItemId, Reader, Writer}; +use crate::{BuildOption, ItemId, Reader, Writer}; pub struct NnsRes(pub Option>); @@ -44,7 +44,7 @@ fn open_db_with_wrong_dimension() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -60,7 +60,7 @@ fn open_db_with_wrong_distance() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); let rtxn = 
handle.env.read_txn().unwrap(); @@ -82,7 +82,7 @@ fn search_in_db_with_a_single_vector() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.00397, 0.553, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), None).unwrap(); + writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -105,7 +105,7 @@ fn two_dimension_on_a_line() { writer.add_item(&mut wtxn, i, &[i as f32, 0.0]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(50)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -153,7 +153,7 @@ fn two_dimension_on_a_column() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(50)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -178,7 +178,7 @@ fn get_item_ids() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(50)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); let reader = Reader::::open(&wtxn, 0, handle.database).unwrap(); let ret = reader.item_ids(); @@ -201,7 +201,7 @@ fn filtering() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(50)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -230,7 +230,7 @@ fn search_in_empty_database() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng(), None).unwrap(); + writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); let rtxn = 
handle.env.read_txn().unwrap(); @@ -262,7 +262,7 @@ fn try_reading_in_a_non_built_database() { // we build the database once to get valid metadata let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng(), None).unwrap(); + writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); // We don't build the database; this leaves the database in a corrupted state diff --git a/src/tests/writer.rs b/src/tests/writer.rs index 8811585e..25350665 100644 --- a/src/tests/writer.rs +++ b/src/tests/writer.rs @@ -4,7 +4,7 @@ use rand::Rng; use super::{create_database, rng}; use crate::distance::{Angular, BinaryQuantizedAngular, DotProduct, Euclidean}; -use crate::{Database, Reader, Writer}; +use crate::{BuildOption, Database, Reader, Writer}; #[test] fn clear_small_database() { @@ -20,11 +20,11 @@ fn clear_small_database() { let zero_writer = Writer::new(database, 0, 3); zero_writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); zero_writer.clear(&mut wtxn).unwrap(); - zero_writer.build(&mut wtxn, &mut rng(), None).unwrap(); + zero_writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); let one_writer = Writer::new(database, 1, 3); one_writer.add_item(&mut wtxn, 0, &[1.0, 2.0, 3.0]).unwrap(); - one_writer.build(&mut wtxn, &mut rng(), None).unwrap(); + one_writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = env.write_txn().unwrap(); @@ -43,7 +43,7 @@ fn use_u32_max_minus_one_for_a_vec() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, u32::MAX - 1, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -62,7 +62,7 @@ fn 
use_u32_max_for_a_vec() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, u32::MAX, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -81,7 +81,7 @@ fn write_one_vector() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), None).unwrap(); + writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -100,7 +100,7 @@ fn write_one_vector_in_one_tree() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -119,7 +119,7 @@ fn write_one_vector_in_multiple_trees() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(10)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(10)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -142,7 +142,7 @@ fn write_vectors_until_there_is_a_descendants() { writer.add_item(&mut wtxn, id, &[i, i, i]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -167,7 +167,7 @@ fn write_vectors_until_there_is_a_split() { writer.add_item(&mut wtxn, id, &[i, i, i]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), 
BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -194,7 +194,7 @@ fn write_and_update_lot_of_random_points() { writer.add_item(&mut wtxn, id, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(10)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(10)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle); @@ -204,7 +204,7 @@ fn write_and_update_lot_of_random_points() { let vector: [f32; 30] = std::array::from_fn(|_| rng.gen()); writer.add_item(&mut wtxn, id, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(10)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(10)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle); @@ -218,7 +218,7 @@ fn write_multiple_indexes() { for i in 0..5 { let writer = Writer::new(handle.database, i, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); } wtxn.commit().unwrap(); @@ -268,7 +268,7 @@ fn write_random_vectors_to_random_indexes() { let vector: [f32; 10] = std::array::from_fn(|_| rng.gen()); writer.add_item(&mut wtxn, i, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); } wtxn.commit().unwrap(); } @@ -283,7 +283,7 @@ fn overwrite_one_item_incremental() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -309,7 +309,7 @@ fn overwrite_one_item_incremental() { writer.add_item(&mut wtxn, 3, &[6., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, 
BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -338,7 +338,7 @@ fn delete_one_item_in_a_one_item_db() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -354,7 +354,7 @@ fn delete_one_item_in_a_one_item_db() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -378,7 +378,7 @@ fn delete_document_in_an_empty_index_74() { let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); @@ -398,8 +398,8 @@ fn delete_document_in_an_empty_index_74() { let writer2 = Writer::new(handle.database, 1, 2); writer2.del_item(&mut wtxn, 0).unwrap(); - writer1.build(&mut wtxn, &mut rng, None).unwrap(); - writer2.build(&mut wtxn, &mut rng, None).unwrap(); + writer1.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer2.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); let reader = Reader::open(&wtxn, 1, handle.database).unwrap(); let ret = reader.nns_by_vector(&wtxn, &[0., 0.], 10, None, None, None).unwrap(); @@ -432,7 +432,7 @@ fn delete_one_item_in_a_descendant() { // first, insert a bunch of items writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); 
wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -449,7 +449,7 @@ fn delete_one_item_in_a_descendant() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -472,7 +472,7 @@ fn delete_one_leaf_in_a_split() { writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); writer.add_item(&mut wtxn, 2, &[2., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -491,7 +491,7 @@ fn delete_one_leaf_in_a_split() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); // after deleting the leaf, the split node should be replaced by a descendant @@ -514,7 +514,7 @@ fn delete_one_item_in_a_single_document_database() { // first, insert a bunch of elements writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -530,7 +530,7 @@ fn delete_one_item_in_a_single_document_database() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -551,7 +551,7 @@ fn delete_one_item() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); 
wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -576,7 +576,7 @@ fn delete_one_item() { writer.del_item(&mut wtxn, 3).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -600,7 +600,7 @@ fn delete_one_item() { writer.del_item(&mut wtxn, 1).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -623,7 +623,7 @@ fn add_one_item_incrementally_in_an_empty_db() { let mut rng = rng(); let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -635,7 +635,7 @@ fn add_one_item_incrementally_in_an_empty_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -654,7 +654,7 @@ fn add_one_item_incrementally_in_a_one_item_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -668,7 +668,7 @@ fn add_one_item_incrementally_in_a_one_item_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = 
Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -689,7 +689,7 @@ fn add_one_item_incrementally_to_create_a_split_node() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -704,7 +704,7 @@ fn add_one_item_incrementally_to_create_a_split_node() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 2, &[2., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -729,7 +729,7 @@ fn add_one_item_incrementally() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -754,7 +754,7 @@ fn add_one_item_incrementally() { writer.add_item(&mut wtxn, 25, &[25., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -781,7 +781,7 @@ fn add_one_item_incrementally() { writer.add_item(&mut wtxn, 8, &[8., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, 
BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -817,7 +817,7 @@ fn delete_extraneous_tree() { writer.add_item(&mut wtxn, i, &[i as f32, 0., 0., 0.]).unwrap(); } // 5 nodes of 4 dimensions should create 3 trees by default. - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -839,7 +839,7 @@ fn delete_extraneous_tree() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, Some(2)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(2)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -859,7 +859,7 @@ fn delete_extraneous_tree() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -886,7 +886,7 @@ fn reuse_node_id() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -911,7 +911,7 @@ fn reuse_node_id() { // if we delete the 1 it should free the node id 0 writer.del_item(&mut wtxn, 1).unwrap(); - writer.build(&mut wtxn, &mut rng, Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -934,7 +934,7 @@ fn reuse_node_id() { // if we re-insert the 1 the node id 0 should be re-used writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, 
Some(1)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -958,7 +958,7 @@ fn reuse_node_id() { let writer = Writer::new(handle.database, 0, 2); // if we now build a new tree, the id 1 should be re-used - writer.build(&mut wtxn, &mut rng, Some(2)).unwrap(); + writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(2)).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -997,7 +997,7 @@ fn need_build() { writer.need_build(&wtxn).unwrap(), "because metadata are missing and an item has been updated" ); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); @@ -1013,17 +1013,17 @@ fn prepare_changing_distance() { writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); let writer = Writer::new(handle.database, 1, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); let writer = Writer::new(handle.database, 2, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = handle.env.write_txn().unwrap(); @@ -1032,7 +1032,7 @@ fn prepare_changing_distance() { let writer = 
writer.prepare_changing_distance::(&mut wtxn).unwrap(); assert!(writer.need_build(&wtxn).unwrap(), "after changing the distance"); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); wtxn.commit().unwrap(); // TODO: this should not works, see https://github.com/meilisearch/arroy/issues/92 @@ -1040,5 +1040,5 @@ fn prepare_changing_distance() { let writer = Writer::new(handle.database, 1, 2); writer.del_item(&mut wtxn, 0).unwrap(); assert!(writer.need_build(&wtxn).unwrap(), "because an item has been updated"); - writer.build(&mut wtxn, &mut rng, None).unwrap(); + writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); } diff --git a/src/writer.rs b/src/writer.rs index 1bcedeaa..5475cc16 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -27,6 +27,76 @@ use crate::{ PrefixCodec, Result, }; +/// The options available when building the arroy database. +#[derive(Default, Clone)] +pub struct BuildOption { + /// The number of trees to build. If `None`, arroy will determine the best amount to build for your number of vectors itself. + pub n_trees: Option<usize>, + /// Configure the maximum number of items stored in a descendant node. + /// This is only applied to the newly created or updated tree node. + /// If the value is modified while working on an already existing database, + /// the nodes that don't need to be updated won't be recreated. + pub split_after: Option<usize>, +} + +impl BuildOption { + /// Create a new `BuildOption`. + /// + /// # Example + /// + /// ``` + /// use arroy::BuildOption; + /// BuildOption::new(); + /// ``` + pub fn new() -> Self { + Self::default() + } + + /// The number of trees to build. If not set, arroy will determine the best amount to build for your number of vectors itself. + /// See also [`Self::with_maybe_n_trees`]. 
+ /// + /// # Example + /// + /// ``` + /// use arroy::BuildOption; + /// BuildOption::new().with_n_trees(10); + /// ``` + pub fn with_n_trees(&mut self, n_trees: usize) -> &mut Self { + self.n_trees = Some(n_trees); + self + } + + /// The number of trees to build. If `None`, arroy will determine the best amount to build for your number of vectors itself. + /// See also [`Self::with_n_trees`]. + /// + /// # Example + /// + /// ``` + /// use arroy::BuildOption; + /// BuildOption::new().with_maybe_n_trees(Some(10)); + /// ``` + pub fn with_maybe_n_trees(&mut self, n_trees: Option<usize>) -> &mut Self { + self.n_trees = n_trees; + self + } + + /// Configure the maximum number of items stored in a descendant node. + /// This is only applied to the newly created or updated tree node. + /// If the value is modified while working on an already existing database, + /// the nodes that don't need to be updated won't be recreated. + /// + /// # Example + /// + /// ``` + /// use arroy::BuildOption; + /// BuildOption::new().with_split_after(1000); + /// ``` + pub fn with_split_after(&mut self, split_after: usize) -> &mut Self { + self.split_after = Some(split_after); + self + } +} + /// A writer to store new items, remove existing ones, /// and build the search tree to query the nearest /// neighbors to items or vectors. @@ -252,8 +322,9 @@ impl Writer { // we simplify the max descendants (_K) thing by considering // that we can fit as much descendants as the number of dimensions - fn fit_in_descendant(&self, n: u64) -> bool { - n <= self.dimensions as u64 + fn fit_in_descendant(&self, opt: &BuildOption, n: u64) -> bool { + let max_in_descendant = opt.split_after.unwrap_or(self.dimensions) as u64; + n <= max_in_descendant } /// Generates a forest of `n_trees` trees. 
@@ -268,7 +339,7 @@ impl Writer { self, wtxn: &mut RwTxn, rng: &mut R, - n_trees: Option, + options: &BuildOption, ) -> Result<()> { log::debug!("started preprocessing the items..."); @@ -283,7 +354,7 @@ impl Writer { let item_indices = self.item_indices(wtxn)?; let n_items = item_indices.len(); - if self.fit_in_descendant(item_indices.len()) { + if self.fit_in_descendant(options, item_indices.len()) { log::debug!("We can fit every elements in a single descendant node, we can skip all the build process"); // No item left in the index, we can clear every tree @@ -363,7 +434,7 @@ impl Writer { metadata.roots.len() ); let (new_roots, mut tmp_nodes_reader) = - self.update_trees(rng, metadata, &to_insert, to_delete, &frozzen_reader)?; + self.update_trees(options, rng, metadata, &to_insert, to_delete, &frozzen_reader)?; nodes_to_write.append(&mut tmp_nodes_reader); roots = new_roots; } @@ -371,17 +442,18 @@ impl Writer { log::debug!("started building trees for {} items...", n_items); log::debug!( "running {} parallel tree building...", - n_trees.map_or_else(|| "an unknown number of".to_string(), |n| n.to_string()) + options.n_trees.map_or_else(|| "an unknown number of".to_string(), |n| n.to_string()) ); // Once we updated the current trees we also need to create the new missing trees // So we can run the normal path of building trees from scratch. 
- let n_trees_to_build = n_trees + let n_trees_to_build = options + .n_trees .zip(metadata) .map(|(n_trees, metadata)| n_trees.saturating_sub(metadata.roots.len())) - .or(n_trees); + .or(options.n_trees); let (mut thread_roots, mut tmp_nodes) = - self.build_trees(rng, n_trees_to_build, &item_indices, &frozzen_reader)?; + self.build_trees(options, rng, n_trees_to_build, &item_indices, &frozzen_reader)?; nodes_to_write.append(&mut tmp_nodes); log::debug!("started updating the tree nodes of {} trees...", tmp_nodes.len()); @@ -410,7 +482,7 @@ impl Writer { self.delete_extra_trees( wtxn, &mut roots, - n_trees, + options.n_trees, concurrent_node_ids.used(), n_items, )?; @@ -440,6 +512,7 @@ impl Writer { fn update_trees( &self, + opt: &BuildOption, rng: &mut R, metadata: &Metadata, to_insert: &RoaringBitmap, @@ -459,6 +532,7 @@ impl Writer { }; let root_node = NodeId::tree(root); let (node_id, _items) = self.update_nodes_in_file( + opt, frozen_reader, &mut rng, root_node, @@ -478,8 +552,10 @@ impl Writer { /// Run in O(n) on the total number of nodes. Return a tuple containing the /// node ID you should use instead of the current_node and the number of /// items in the subtree. 
+ #[allow(clippy::too_many_arguments)] fn update_nodes_in_file( &self, + opt: &BuildOption, frozen_reader: &FrozzenReader, rng: &mut R, current_node: NodeId, @@ -501,7 +577,7 @@ impl Writer { } else { Ok((NodeId::item(item_id), new_items)) } - } else if self.fit_in_descendant(new_items.len()) { + } else if self.fit_in_descendant(opt, new_items.len()) { let node_id = frozen_reader.concurrent_node_ids.next()?; let node_id = NodeId::tree(node_id); tmp_nodes.put( @@ -513,7 +589,7 @@ impl Writer { Ok((node_id, new_items)) } else { let new_id = - self.make_tree_in_file(frozen_reader, rng, &new_items, tmp_nodes)?; + self.make_tree_in_file(opt, frozen_reader, rng, &new_items, tmp_nodes)?; return Ok((new_id, new_items)); } @@ -532,10 +608,11 @@ impl Writer { if descendants.as_ref() == &new_descendants { // if nothing changed, do nothing Ok((current_node, descendants.into_owned())) - } else if !self.fit_in_descendant(new_descendants.len()) { + } else if !self.fit_in_descendant(opt, new_descendants.len()) { // if it doesn't fit in one descendent we need to craft a new whole subtree tmp_nodes.remove(current_node.item); let new_id = self.make_tree_in_file( + opt, frozen_reader, rng, &new_descendants, @@ -576,6 +653,7 @@ impl Writer { } let (new_left, left_items) = self.update_nodes_in_file( + opt, frozen_reader, rng, left, @@ -584,6 +662,7 @@ impl Writer { tmp_nodes, )?; let (new_right, right_items) = self.update_nodes_in_file( + opt, frozen_reader, rng, right, @@ -594,7 +673,7 @@ impl Writer { let total_items = left_items | right_items; - if self.fit_in_descendant(total_items.len()) { + if self.fit_in_descendant(opt, total_items.len()) { // Since we're shrinking we KNOW that new_left and new_right are descendants // thus we can delete them directly knowing there is no sub-tree to look at. 
if new_left.mode == NodeMode::Tree { @@ -639,6 +718,7 @@ impl Writer { fn build_trees( &self, + opt: &BuildOption, rng: &mut R, n_trees: Option, item_indices: &RoaringBitmap, @@ -663,7 +743,7 @@ impl Writer { None => TmpNodes::new()?, }; let root_id = - self.make_tree_in_file(frozen_reader, &mut rng, item_indices, &mut tmp_nodes)?; + self.make_tree_in_file(opt, frozen_reader, &mut rng, item_indices, &mut tmp_nodes)?; assert!( root_id.mode != NodeMode::Item, "make_tree_in_file returned an item even though there was more than a single element" @@ -680,6 +760,7 @@ impl Writer { /// and root nodes in files that will be stored in the database later. fn make_tree_in_file( &self, + opt: &BuildOption, reader: &FrozzenReader, rng: &mut R, item_indices: &RoaringBitmap, @@ -689,7 +770,7 @@ impl Writer { return Ok(NodeId::item(item_indices.min().unwrap())); } - if self.fit_in_descendant(item_indices.len()) { + if self.fit_in_descendant(opt, item_indices.len()) { let item_id = reader.concurrent_node_ids.next()?; let item = Node::Descendants(Descendants { descendants: Cow::Borrowed(item_indices) }); tmp_nodes.put(item_id, &item)?; @@ -742,8 +823,8 @@ impl Writer { let normal = SplitPlaneNormal { normal, - left: self.make_tree_in_file(reader, rng, &children_left, tmp_nodes)?, - right: self.make_tree_in_file(reader, rng, &children_right, tmp_nodes)?, + left: self.make_tree_in_file(opt, reader, rng, &children_left, tmp_nodes)?, + right: self.make_tree_in_file(opt, reader, rng, &children_right, tmp_nodes)?, }; let new_node_id = reader.concurrent_node_ids.next()?; From 141eaf021241a8a8b6cb53a2b46adaa807993ccf Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 25 Sep 2024 14:34:03 +0200 Subject: [PATCH 2/8] add documentation on all the new items --- examples/compare_with_hnsw.rs | 10 +- examples/search_movies.rs | 4 +- src/lib.rs | 14 +-- src/reader.rs | 167 +++++++++++++++++++++++----------- src/tests/reader.rs | 23 +++-- src/tests/writer.rs | 4 +- 6 files changed, 140 insertions(+), 82 
deletions(-) diff --git a/examples/compare_with_hnsw.rs b/examples/compare_with_hnsw.rs index 7eb619ad..e4525258 100644 --- a/examples/compare_with_hnsw.rs +++ b/examples/compare_with_hnsw.rs @@ -39,14 +39,16 @@ fn main() -> Result<()> { let rtxn = env.read_txn()?; let reader = Reader::::open(&rtxn, 0, database)?; + let mut query = reader.nns(NUMBER_FETCHED); + // By making it precise we are near the HNSW but // we take a lot more time to search than the HNSW. let is_precise = true; - let search_k = - if is_precise { NonZeroUsize::new(NUMBER_FETCHED * reader.n_trees() * 20) } else { None }; + if is_precise { + query.search_k(NonZeroUsize::new(NUMBER_FETCHED * reader.n_trees() * 20).unwrap()); + } - let arroy_results = - reader.nns_by_item(&rtxn, 0, NUMBER_FETCHED, search_k, None, None)?.unwrap(); + let arroy_results = query.by_item(&rtxn, 0)?.unwrap(); eprintln!("took {:.02?} to find into arroy", before.elapsed()); let first = Point(reader.item_vector(&rtxn, 0)?.unwrap()); diff --git a/examples/search_movies.rs b/examples/search_movies.rs index cc817836..e42a87d0 100644 --- a/examples/search_movies.rs +++ b/examples/search_movies.rs @@ -27,7 +27,7 @@ fn main() { #[rustfmt::skip] let query: Vec = vec![-0.016822422, -0.021621706, 0.00019239318, 0.054372, 0.034900583, -0.011089119, 0.042128928, 0.02026509, 0.0019038923, -0.0014809829, -0.033832666, -0.029640282, -0.054234847, 0.018665258, -0.055895746, 0.0753006, 0.0061977296, 0.032228395, 0.022795584, -0.058140032, 0.026109613, -0.029421866, 0.04866331, 0.020661665, 0.017602501, 0.020653117, 0.046483666, 0.042843442, -0.045545023, -0.0017392042, 0.012052985, -0.0058927303, 0.032480225, 0.009872672, 0.024758337, -0.013354463, -0.044432696, -0.03226193, -0.059227727, 0.0078192735, 0.013650394, 0.0031477972, 0.005877705, 0.0068786396, 0.002517114, -0.011458909, 0.008640344, 0.044904687, -0.0047290456, -0.012748338, -0.048921518, 0.07827129, 0.005205742, -0.021857478, -0.02370976, 0.041743826, -0.016076453, -0.011403813, 
-0.025544455, -0.0046601044, -0.021723151, 0.007303265, -0.0136509575, 0.0073000537, -0.005085544, 0.04384241, -0.018964743, 0.03818674, -0.09198379, -0.032043297, -0.0067259674, 0.019887544, 0.005341308, 0.0050093965, 0.054900724, -0.020799952, 0.020495495, 0.01472667, 0.019677797, 0.037550557, -0.010920308, 0.03371257, 0.0020930816, 0.03709999, -0.036922902, -0.049608115, 0.0154750785, 0.007696657, -0.058294553, 0.045302838, -0.023393214, -0.060448237, -0.005798211, 0.053323198, 0.04070376, -0.0028753958, 0.051668108, -0.0069777397, 0.033418525, 0.016234992, -0.033323497, -0.0074829464, -0.008664235, -0.05547656, -0.11400871, -0.03518515, -0.0056998464, 0.01812429, -0.031799175, -0.0073341345, -0.06147767, -0.003742939, -0.004249079, -0.013904026, -0.00065635156, 0.09179383, 0.004267396, 0.00015509031, -0.049766053, 0.029881846, 0.10485467, -0.03120661, 0.014043553, 0.08671136, 0.059179407, 0.029454986, -0.0122302845, 0.06451508, 0.021481989, -0.06484224, 0.018707344, 0.022344032, -0.004790084, -0.04705671, 0.016396629, -0.03346155, 0.0064264126, -0.0053360737, 0.06672058, -0.0078784805, -0.016174054, 0.026566355, -0.046398066, 0.0025418145, 0.046905387, -0.020884424, -0.051193744, -0.031737294, -0.009927951, 0.023741305, -0.058117628, 0.051733956, -0.025581324, -0.030992776, 0.008804903, 0.04388304, 0.013213721, 0.004467152, -0.04988626, 0.0069321035, 0.039403677, 0.019677948, -0.066907056, 0.018572355, 0.013511877, -0.010518738, 0.010099771, -0.003633823, -0.0631501, -0.025649378, -0.043461364, 0.0016490245, 0.064196914, 0.033599235, -0.013222726, 0.015318823, 0.0771801, -0.0070276, -0.031138066, 0.0055310773, -0.09972089, 0.05066132, 0.047467627, -0.03498512, -0.03416252, -0.018362196, 0.040274452, -0.031371195, 0.030042851, 0.016328678, -0.05765591, -0.048823263, 0.054553114, -0.02033182, 0.046627544, 0.016558101, -0.0033715998, 0.0006232865, -0.0065704435, 0.008104579, 0.016307961, -0.041840676, 0.048135996, -0.018808063, -0.036892023, -0.0450471, 
0.02718623, -0.036660295, -0.022694368, 0.005702901, -0.022678563, 0.0013453028, 0.07429447, -0.034700394, -0.032727163, 0.00596015, 0.034842487, -0.027818438, -0.00051779655, -0.014468772, 0.033954486, 0.04148899, -0.0829876, -0.015300944, 0.015376903, 0.09567573, 0.036652327, -0.049033575, -0.04484115, 0.041701544, -0.057027884, 0.0069984253, -0.0053272387, 0.025826871, 0.002177651, -0.030157669, 0.007895542, -0.014717798, 0.054724272, -0.05034077, -0.016694192, 0.038352106, -0.060709346, 0.08236629, -0.0096279215, 0.014632059, 0.025158316, -0.0009260515, -0.043707818, -0.01941624, -0.0118600605, -0.035666965, 0.037794825, 0.014687504, 0.038666032, -0.075831376, -0.038647566, -0.048394937, 0.031239703, 0.029136332, -0.00076040986, -0.015906896, 0.03718925, -0.0140040675, -0.037951406, -0.041062936, -0.06529122, 0.011906159, -0.011368897, 0.0060307034, 0.03684682, 0.031995844, -0.033985753, -0.018714348, -0.012443444, -0.007389346, 0.03257332, -0.04580996, -0.026579294, -0.024290696, -0.025647637, 0.022456668, -0.02420987, -0.065709755, -0.02623659, -0.028259972, 0.019707581, -0.022819564, -0.0409341, 0.026851093, 0.031858675, 0.048687093, -0.013439109, 0.011736404, 0.016420575, 0.03451187, -0.0059358296, 0.015338021, 0.04402986, 0.033739056, 0.033959225, 0.0068245684, -0.0143376645, -0.0007635987, -0.01949658, 0.016379116, 0.018640755, -0.06126936, -0.22691156, 0.015514225, -0.0010716971, 0.0044359663, 0.03258783, -0.0018310734, 0.010761778, -0.033404265, 0.005418415, 0.028870588, -0.0366465, 0.025508897, -0.003327967, -0.025249101, 0.041501254, -0.06906739, -0.03184493, -0.041302733, -0.03037772, 0.015740091, 0.008446552, -0.0459613, -0.022405358, -0.0036640046, 0.017842831, 0.003960712, -0.025942408, -0.038227286, -0.045894515, -0.01752483, 0.017444108, -0.051017836, 0.029609472, 0.008688325, 0.020816054, 0.008120903, 0.03892946, -0.033378396, 0.02176841, 0.027685048, -0.012064678, -0.079198286, -0.04271553, 0.005021753, 0.066962436, -0.03443632, -0.004004281, 
-0.050009515, -0.01630804, 0.06379373, 0.055116866, 0.027930314, 0.043325268, 0.02733439, -0.015951807, 0.059688378, -0.0075212875, 0.03786285, -0.04638327, -0.043671872, 0.043587692, 0.011264745, -0.059823193, 0.008415408, -0.040225852, -0.05263509, -0.0038932117, -0.047234535, 0.05749084, 0.029582193, -0.012869698, 0.027698075, -0.014221754, -0.05440618, 0.007839065, -0.028753158, -0.029088387, -0.00039888048, 0.012631819, 0.0038486738, -0.059913363, -0.0034661351, 0.011339918, 0.005836589, -0.018044928, -0.035229705, 0.0015524679, -0.035521194, -0.028409205, 0.0004174717, 0.060292065, -0.009710763, -0.04719587, 0.034226153, 0.04258676, 0.03754591, 0.056335006, -0.012293127, 0.03885916, -0.011872468, 0.026709288, -0.030494772, -0.0027441443, 0.01256448, 0.0070703924, 0.011282641, -0.03820788, -0.029001744, 0.0024300558, -0.0032799696, 0.037857816, 0.001686728, 0.056249045, -0.01862739, 0.04376537, -0.0019654054, 0.050269835, 0.035223164, 0.0059567657, 0.013870472, -0.001804614, 0.027300585, -0.03382222, -0.041098855, -0.060636565, 0.0047175046, 0.029142305, 0.06523361, 0.028681634, -0.023454288, -0.018000197, -0.030090509, -0.0046562785, -0.04519735, 0.047884777, -0.00059952086, -0.03280122, -0.08012555, 0.008639195, 0.01629006, 0.032155965, 0.034481294, 0.021274198, 0.010470909, 0.022913199, -0.035904404, 0.041294016, -0.00987633, -0.021613108, 0.012205929, 0.005322071, 0.025864823, 0.08942025, -0.08067831, -0.014871667, -0.034839284, 0.028048998, -0.0063091223, 0.037978478, -0.055790387, 0.0045954804, -0.042958327, 0.02137769, -0.008589233, -0.00062141696, 0.052822173, 0.034125473, -0.015106767, 0.0030919765, -0.0072712647, 0.0056287237, 0.019516133, -0.031278323, 0.025771588, 0.01701546, 0.019516064, 0.016180338, 0.01349268, 0.011978184, 0.011838524, -0.0050102035, -0.06970658, 0.022854539, -0.004192521, 0.0577575, -0.003954721, -0.054374386, -0.027609108, 0.0134023735, 0.010305641, -0.011130317, 0.052328475, 0.014928648, -0.013976018, -0.07100651, 
-0.06789901, -0.031873316, -0.011598853, 0.029284442, -0.04940027, 0.0100974385, -0.02187546, -0.062819175, 0.0069366414, 0.052176703, -0.06834835, 0.013463273, -0.0013379813, 0.005786334, 0.017775143, -0.01291353, -0.016923305, -0.049682386, -0.034103107, 0.010883184, -0.055132758, 0.025268175, -0.025599582, 0.015927013, -0.03237898, -0.027073668, -0.034358867, -0.027672807, 0.022677394, -0.03531693, 0.010573503, 0.00032215187, 0.0066956943, -0.051510572, -0.029456092, 0.05758612, -0.038166363, -0.00999853, 0.05758596, -0.006796505, 0.028503977, -0.024184246, 0.054051045, 0.0040905816, 0.099899694, 0.06076009, 0.011753628, -0.03253187, -0.0035343366, 0.02351163, 0.03206495, 0.004892613, -0.04530409, -0.0056237346, -0.006101407, 0.019704496, -0.010228795, 0.027814431, 0.020409154, 0.033115197, -0.07446951, -0.042142425, 0.03928483, -0.022784598, -0.003539396, -0.0074683367, 0.043651864, 0.007761874, 0.022063423, 0.05344986, 0.05065469, 0.029476669, -0.028968832, 0.023550583, -0.022291148, 0.055309687, -0.053843252, 0.020895477, -0.0148687605, 0.012166838, 0.0033556349, -0.07022937, -0.059401378, 0.013194393, -0.0419862, -0.0070434613, 0.030479655, -0.053955454, -0.031870224, 0.034511264, -0.047943473, 0.0069080396, 0.026099209, -0.012516935, 0.0003174421, -0.006716995, 0.07027558, 0.038463045, -0.016081728, 0.05018074, -0.062176052, 0.08961092, 0.03679902, 0.011107996, -0.0032339245, -0.0118898135, 0.013669906, 0.056221563, -0.049234938, 0.003090264, 0.01062722, -0.008937757, -0.08464787, -0.032616463, 0.055935893, 0.006192905, -0.014768529, 0.04930304, 0.053852808, -0.036349185, -0.037947245, 0.0076732435, -0.040889677, 0.022189876, 0.015142795, 0.005928425, -0.009679575, 0.039194115, 0.0041091475, 0.035120673, 0.016776932, -0.04100678, 0.041131947, 0.040904496, 0.047341976, 0.029321635, 0.030489001, -0.0135518275, 0.038717188, 0.0017859036, -0.044703316, -0.007397534, 0.029149175, -0.00021891313, 0.019795585, -0.054424375, 0.010228703, -0.0057461066, 0.05096695, 
0.05683213, -0.018136851, -0.0030009004, -0.033427265, 0.010878728, 0.050759643, 0.040795874, 0.019920254, -0.026135486, -0.07642272, 0.035290312, 0.004655317, 0.0043676766, -0.010411962, -0.0076723946, 0.015248613, 0.008905208, -0.0002423048, 0.03892336, -0.025703456, -0.021123456, -0.00066909986, 0.04459856, 0.052217484, -0.017885901, -0.015303531, 0.0057848957, -0.036129624, -0.0736907, 0.035401847, -0.025658514, -0.0082354145, -0.0012491915, -0.040769547, -0.039205503, 0.05530217, -0.014954734, 0.0056790086, -0.04454665, -0.028425619, -0.034654, -0.057087515, -0.0224583, -0.005496095, -0.009889468, -0.05025576, -0.009459795, -0.00871503, 0.021968294, 0.0074964114, -0.032455806, -0.005696087, 0.005180231, 0.056079067, -0.03189999, 0.045113377, 0.061360348, 0.01839327, -0.053088665, 0.04942768, 0.014662789, 0.06641078, -0.008998172, -0.009717696, -0.079248, 0.047506567, 0.04778238, 0.025009798, -0.03899872, 0.009850679, -0.04399064, -0.053494785, 0.055456433, 0.026770461, -0.011158729, -0.073486604, -0.04088162, -0.023263954, -0.022185653, 0.03401001, -0.034742568, 0.043794204, 0.004035502, 0.011585448, -0.009235968, 0.031503983, 0.016500674, -0.012498497, -0.05733327, 0.0024852154, -0.02377962, -0.072548844, -0.008489325, 0.01825339, 0.032909963, -0.023669574, 0.0022601841, -0.008336443, 0.0041536367, 0.007989558, -0.035507284, -0.03951105, 0.0069870483, 0.04283141, -0.05102877, -0.025309727, 0.052937508, -0.014378752, -0.012047669, -0.024964543, -0.00071902486, 0.009493713, 0.024152702, 0.022622166, 0.06481285, 0.0022744837]; let now = Instant::now(); - let _ret = reader.nns_by_vector(&rtxn, &query, 20, None, None, None).unwrap(); + let _ret = reader.nns(20).by_vector(&rtxn, &query).unwrap(); println!("Louis's query took {:?}", now.elapsed()); let mut durations = Vec::new(); @@ -37,7 +37,7 @@ fn main() { println!("Starts querying all documents ..."); for (id, _) in vectors.into_iter() { let now = Instant::now(); - reader.nns_by_item(&rtxn, id, 20, None, None, 
None).unwrap().unwrap(); + let _ret = reader.nns(20).by_item(&rtxn, id).unwrap().unwrap(); // reader.nns_by_item(&rtxn, id, 20, None, Some(&filter)).unwrap().unwrap(); durations.push(now.elapsed()); } diff --git a/src/lib.rs b/src/lib.rs index 55e31a87..26df5b53 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,18 +49,18 @@ //! let reader = Reader::::open(&rtxn, index, db)?; //! let n_results = 20; //! +//! let mut query = reader.nns(n_results); +//! //! // You can increase the quality of the results by forcing arroy to search into more nodes. //! // This multiplier is arbitrary but basically the higher, the better the results, the slower the query. //! let is_precise = true; -//! let search_k = if is_precise { -//! NonZeroUsize::new(n_results * reader.n_trees() * 15) -//! } else { -//! None -//! }; +//! if is_precise { +//! query.search_k(NonZeroUsize::new(n_results * reader.n_trees() * 15).unwrap()); +//! } //! //! // Similar searching can be achieved by requesting the nearest neighbors of a given item. //! let item_id = 0; -//! let arroy_results = reader.nns_by_item(&rtxn, item_id, n_results, search_k, None, None)?.unwrap(); +//! let arroy_results = query.by_item(&rtxn, item_id)?.unwrap(); //! # Ok(()) } //! ``` @@ -96,7 +96,7 @@ use key::{Key, Prefix, PrefixCodec}; use metadata::{Metadata, MetadataCodec}; use node::{Node, NodeCodec}; use node_id::{NodeId, NodeMode}; -pub use reader::Reader; +pub use reader::{QueryBuilder, Reader}; pub use stats::{Stats, TreeStats}; pub use writer::{BuildOption, Writer}; diff --git a/src/reader.rs b/src/reader.rs index 658146d7..d79ad90e 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -19,6 +19,107 @@ use crate::{ TreeStats, }; +/// Options used to make a query against an arroy [`Reader`]. 
+pub struct QueryBuilder<'a, D: Distance> { + reader: &'a Reader<'a, D>, + count: usize, + search_k: Option, + oversampling: Option, + candidates: Option<&'a RoaringBitmap>, +} + +impl<'a, D: Distance> QueryBuilder<'a, D> { + /// Returns the closests items from `item`. + /// + /// See also [`Self::by_vector`]. + /// + /// # Examples + /// + /// ```no_run + /// # use arroy::{Reader, distances::Euclidean}; + /// # let (reader, rtxn): (Reader, heed::RoTxn) = todo!(); + /// reader.nns(20).by_item(&rtxn, 5); + /// ``` + pub fn by_item(&self, rtxn: &RoTxn, item: ItemId) -> Result>> { + match item_leaf(self.reader.database, self.reader.index, rtxn, item)? { + Some(leaf) => self.reader.nns_by_leaf(rtxn, &leaf, self).map(Some), + None => Ok(None), + } + } + + /// Returns the closest items from the provided `vector`. + /// + /// See also [`Self::by_item`]. + /// + /// # Examples + /// + /// ```no_run + /// # use arroy::{Reader, distances::Euclidean}; + /// # let (reader, rtxn): (Reader, heed::RoTxn) = todo!(); + /// reader.nns(20).by_vector(&rtxn, &[1.25854, -0.75598, 0.58524]); + /// ``` + pub fn by_vector(&self, rtxn: &RoTxn, vector: &'a [f32]) -> Result> { + if vector.len() != self.reader.dimensions() { + return Err(Error::InvalidVecDimension { + expected: self.reader.dimensions(), + received: vector.len(), + }); + } + + let vector = UnalignedVector::from_slice(vector); + let leaf = Leaf { header: D::new_header(&vector), vector }; + self.reader.nns_by_leaf(rtxn, &leaf, self) + } + + /// During the query, arroy will inspect up to `search_k` nodes which defaults + /// to `n_trees * count` if not provided. `search_k` gives you a run-time + /// tradeoff between better accuracy and speed. 
+ /// + /// # Examples + /// + /// ```no_run + /// # use arroy::{Reader, distances::Euclidean}; + /// # let (reader, rtxn): (Reader, heed::RoTxn) = todo!(); + /// use std::num::NonZeroUsize; + /// reader.nns(20).search_k(NonZeroUsize::new(1000).unwrap()).by_item(&rtxn, 3); + /// ``` + pub fn search_k(&mut self, search_k: NonZeroUsize) -> &mut Self { + self.search_k = Some(search_k); + self + } + + /// Oversampling will multiply [`search_k`] by the specified number. + /// That's useful when you don't want to compute `search_k` yourself. + /// + /// # Examples + /// + /// ```no_run + /// # use arroy::{Reader, distances::Euclidean}; + /// # let (reader, rtxn): (Reader, heed::RoTxn) = todo!(); + /// use std::num::NonZeroUsize; + /// reader.nns(20).oversampling(NonZeroUsize::new(6).unwrap()).by_item(&rtxn, 5); + /// ``` + pub fn oversampling(&mut self, oversampling: NonZeroUsize) -> &mut Self { + self.oversampling = Some(oversampling); + self + } + + /// Specify a subset of candidates to inspect. Filters out everything else. + /// + /// # Examples + /// + /// ```no_run + /// # use arroy::{Reader, distances::Euclidean}; + /// # let (reader, rtxn): (Reader, heed::RoTxn) = todo!(); + /// let candidates = roaring::RoaringBitmap::from_iter([1, 3, 4, 5, 6, 7, 8, 9, 15, 16]); + /// reader.nns(20).candidates(&candidates).by_item(&rtxn, 6); + /// ``` + pub fn candidates(&mut self, candidates: &'a RoaringBitmap) -> &mut Self { + self.candidates = Some(candidates); + self + } +} + /// A reader over the arroy trees and user items. #[derive(Debug)] pub struct Reader<'t, D: Distance> { @@ -163,62 +264,17 @@ impl<'t, D: Distance> Reader<'t, D> { }) } - /// Returns the `count` closests items from `item`. - /// - /// During the query it will inspect up to `search_k` nodes which defaults - /// to `n_trees * count` if not provided. `search_k` gives you a run-time - /// tradeoff between better accuracy and speed. 
- /// - /// The candidates parameter corresponds to the subset of item ids arroy will return. - pub fn nns_by_item( - &self, - rtxn: &'t RoTxn, - item: ItemId, - count: usize, - search_k: Option, - oversampling: Option, - candidates: Option<&RoaringBitmap>, - ) -> Result>> { - match item_leaf(self.database, self.index, rtxn, item)? { - Some(leaf) => { - self.nns_by_leaf(rtxn, &leaf, count, search_k, oversampling, candidates).map(Some) - } - None => Ok(None), - } - } - - /// Returns the `count` closest items from the provided `vector`. - /// - /// See [`Reader::nns_by_item`] for more details. - pub fn nns_by_vector( - &self, - rtxn: &'t RoTxn, - vector: &[f32], - count: usize, - search_k: Option, - oversampling: Option, - candidates: Option<&RoaringBitmap>, - ) -> Result> { - if vector.len() != self.dimensions { - return Err(Error::InvalidVecDimension { - expected: self.dimensions(), - received: vector.len(), - }); - } - - let vector = UnalignedVector::from_slice(vector); - let leaf = Leaf { header: D::new_header(&vector), vector }; - self.nns_by_leaf(rtxn, &leaf, count, search_k, oversampling, candidates) + /// Return a [`QueryBuilder`] that lets you configure and execute a search request. + /// You must provide the number of items you want to receive. + pub fn nns(&self, count: usize) -> QueryBuilder { + QueryBuilder { reader: self, count, search_k: None, oversampling: None, candidates: None } } fn nns_by_leaf( &self, rtxn: &'t RoTxn, query_leaf: &Leaf, - count: usize, - search_k: Option, - oversampling: Option, - candidates: Option<&RoaringBitmap>, + opt: &QueryBuilder, ) -> Result> { if self.items.is_empty() { return Ok(Vec::new()); @@ -227,8 +283,9 @@ impl<'t, D: Distance> Reader<'t, D> { // The number of root nodes + log2 of the total number of vectors. 
let mut queue = BinaryHeap::with_capacity(self.roots.len() + self.items.len().ilog2() as usize); - let search_k = search_k.map_or(count * self.roots.len(), NonZeroUsize::get); - let search_k = oversampling + let search_k = opt.search_k.map_or(opt.count * self.roots.len(), NonZeroUsize::get); + let search_k = opt + .oversampling .map_or(search_k.saturating_mul(D::DEFAULT_OVERSAMPLING), |oversampling| { search_k.saturating_mul(oversampling.get()) }); @@ -246,12 +303,12 @@ impl<'t, D: Distance> Reader<'t, D> { let key = Key::new(self.index, item); match self.database.get(rtxn, &key)?.ok_or(Error::missing_key(key))? { Node::Leaf(_) => { - if candidates.map_or(true, |c| c.contains(item.item)) { + if opt.candidates.map_or(true, |c| c.contains(item.item)) { nns.push(item.unwrap_item()); } } Node::Descendants(Descendants { descendants }) => { - if let Some(candidates) = candidates { + if let Some(candidates) = opt.candidates { nns.extend((descendants.into_owned() & candidates).iter()); } else { nns.extend(descendants.iter()); @@ -282,7 +339,7 @@ impl<'t, D: Distance> Reader<'t, D> { } let mut sorted_nns = BinaryHeap::from(nns_distances); - let capacity = count.min(sorted_nns.len()); + let capacity = opt.count.min(sorted_nns.len()); let mut output = Vec::with_capacity(capacity); while let Some(Reverse((OrderedFloat(dist), item))) = sorted_nns.pop() { if output.len() == capacity { diff --git a/src/tests/reader.rs b/src/tests/reader.rs index 6beed985..4b306193 100644 --- a/src/tests/reader.rs +++ b/src/tests/reader.rs @@ -49,7 +49,7 @@ fn open_db_with_wrong_dimension() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::::open(&rtxn, 0, handle.database).unwrap(); - let ret = reader.nns_by_vector(&rtxn, &[1.0, 2.0, 3.0], 5, None, None, None).unwrap_err(); + let ret = reader.nns(5).by_vector(&rtxn, &[1.0, 2.0, 3.0]).unwrap_err(); insta::assert_snapshot!(ret, @"Invalid vector dimensions. 
Got 3 but expected 2"); } @@ -88,7 +88,7 @@ fn search_in_db_with_a_single_vector() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::::open(&rtxn, 0, handle.database).unwrap(); - let ret = reader.nns_by_item(&rtxn, 0, 1, None, None, None).unwrap(); + let ret = reader.nns(1).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(0): distance(0) "###); @@ -112,14 +112,15 @@ fn two_dimension_on_a_line() { let reader = Reader::::open(&rtxn, 0, handle.database).unwrap(); // if we can't look into enough nodes we find some random points - let ret = reader.nns_by_item(&rtxn, 0, 5, NonZeroUsize::new(1), None, None).unwrap(); + let ret = reader.nns(5).search_k(NonZeroUsize::new(1).unwrap()).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(48): distance(48) id(92): distance(92) "###); // if we can look into all the node there is no inifinite loop and it works - let ret = reader.nns_by_item(&rtxn, 0, 5, NonZeroUsize::new(usize::MAX), None, None).unwrap(); + let ret = + reader.nns(5).search_k(NonZeroUsize::new(usize::MAX).unwrap()).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(0): distance(0) id(1): distance(1) @@ -128,7 +129,7 @@ fn two_dimension_on_a_line() { id(4): distance(4) "###); - let ret = reader.nns_by_item(&rtxn, 0, 5, None, None, None).unwrap(); + let ret = reader.nns(5).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(1): distance(1) id(2): distance(2) @@ -158,7 +159,7 @@ fn two_dimension_on_a_column() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::::open(&rtxn, 0, handle.database).unwrap(); - let ret = reader.nns_by_item(&rtxn, 0, 5, None, None, None).unwrap(); + let ret = reader.nns(5).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(1): distance(1) @@ -207,16 +208,14 @@ fn filtering() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::::open(&rtxn, 0, handle.database).unwrap(); - let ret 
= - reader.nns_by_item(&rtxn, 0, 5, None, None, Some(&RoaringBitmap::from_iter(0..2))).unwrap(); + let ret = reader.nns(5).candidates(&RoaringBitmap::from_iter(0..2)).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(0): distance(0) id(1): distance(1) "###); - let ret = reader - .nns_by_item(&rtxn, 0, 5, None, None, Some(&RoaringBitmap::from_iter(98..1000))) - .unwrap(); + let ret = + reader.nns(5).candidates(&RoaringBitmap::from_iter(98..1000)).by_item(&rtxn, 0).unwrap(); insta::assert_snapshot!(NnsRes(ret), @r###" id(98): distance(98) id(99): distance(99) @@ -235,7 +234,7 @@ fn search_in_empty_database() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::open(&rtxn, 0, handle.database).unwrap(); - let ret = reader.nns_by_vector(&rtxn, &[0., 0.], 10, None, None, None).unwrap(); + let ret = reader.nns(10).by_vector(&rtxn, &[0., 0.]).unwrap(); insta::assert_debug_snapshot!(ret, @"[]"); } diff --git a/src/tests/writer.rs b/src/tests/writer.rs index 25350665..08f7648b 100644 --- a/src/tests/writer.rs +++ b/src/tests/writer.rs @@ -402,7 +402,7 @@ fn delete_document_in_an_empty_index_74() { writer2.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); let reader = Reader::open(&wtxn, 1, handle.database).unwrap(); - let ret = reader.nns_by_vector(&wtxn, &[0., 0.], 10, None, None, None).unwrap(); + let ret = reader.nns(10).by_vector(&wtxn, &[0., 0.]).unwrap(); insta::assert_debug_snapshot!(ret, @"[]"); wtxn.commit().unwrap(); @@ -418,7 +418,7 @@ fn delete_document_in_an_empty_index_74() { let rtxn = handle.env.read_txn().unwrap(); let reader = Reader::open(&rtxn, 1, handle.database).unwrap(); - let ret = reader.nns_by_vector(&rtxn, &[0., 0.], 10, None, None, None).unwrap(); + let ret = reader.nns(10).by_vector(&rtxn, &[0., 0.]).unwrap(); insta::assert_debug_snapshot!(ret, @"[]"); } From b685d9850db9e67b874dda774fa27f569b3056cc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 25 Sep 2024 17:19:55 +0200 Subject: [PATCH 3/8] update 
the writer builder --- examples/build-tree-no-commit.rs | 8 +-- examples/compare_with_hnsw.rs | 4 +- examples/import-vectors.rs | 8 ++- src/lib.rs | 6 +- src/tests/binary_quantized.rs | 4 +- src/tests/reader.rs | 20 +++--- src/tests/writer.rs | 104 ++++++++++++++--------------- src/writer.rs | 110 ++++++++++++++++--------------- 8 files changed, 135 insertions(+), 129 deletions(-) diff --git a/examples/build-tree-no-commit.rs b/examples/build-tree-no-commit.rs index 7a28b791..a90b03c2 100644 --- a/examples/build-tree-no-commit.rs +++ b/examples/build-tree-no-commit.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use std::time::Instant; use arroy::distances::DotProduct; -use arroy::{BuildOption, Database, Reader, Stats, TreeStats, Writer}; +use arroy::{Database, Reader, Stats, TreeStats, Writer}; use clap::Parser; use heed::{EnvFlags, EnvOpenOptions}; use rand::rngs::StdRng; @@ -64,11 +64,11 @@ fn main() -> Result<(), heed::BoxedError> { let now = Instant::now(); println!("Building the arroy internal trees..."); let mut rng = StdRng::seed_from_u64(seed); - let mut options = BuildOption::new(); + let mut builder = writer.builder(&mut rng); if let Some(n_trees) = n_trees { - options.with_n_trees(n_trees); + builder.n_trees(n_trees); } - writer.build(&mut wtxn, &mut rng, &options).unwrap(); + builder.build(&mut wtxn).unwrap(); println!("Took {:.2?} to build", now.elapsed()); let reader = Reader::open(&wtxn, 0, database)?; diff --git a/examples/compare_with_hnsw.rs b/examples/compare_with_hnsw.rs index e4525258..e35ac755 100644 --- a/examples/compare_with_hnsw.rs +++ b/examples/compare_with_hnsw.rs @@ -3,7 +3,7 @@ use std::time::Instant; use arroy::distances::Euclidean; use arroy::internals::{Leaf, UnalignedVector}; -use arroy::{BuildOption, Database, Distance, ItemId, Reader, Result, Writer}; +use arroy::{Database, Distance, ItemId, Reader, Result, Writer}; use heed::{EnvOpenOptions, RwTxn}; use instant_distance::{Builder, HnswMap, MapItem}; use rand::rngs::StdRng; @@ -81,7 
+81,7 @@ fn load_into_arroy( for (i, Point(vector)) in points.iter().enumerate() { writer.add_item(&mut wtxn, i.try_into().unwrap(), &vector[..])?; } - writer.build(&mut wtxn, rng, &BuildOption::default())?; + writer.builder(rng).build(&mut wtxn)?; wtxn.commit()?; Ok(()) diff --git a/examples/import-vectors.rs b/examples/import-vectors.rs index 516f18a7..6351c37d 100644 --- a/examples/import-vectors.rs +++ b/examples/import-vectors.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; use std::time::Instant; use arroy::distances::DotProduct; -use arroy::{BuildOption, Database, Writer}; +use arroy::{Database, Writer}; use clap::Parser; use heed::{EnvFlags, EnvOpenOptions}; use rand::rngs::StdRng; @@ -102,7 +102,11 @@ fn main() -> Result<(), heed::BoxedError> { println!("Building the arroy internal trees..."); let now = Instant::now(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_maybe_n_trees(n_trees)).unwrap(); + let mut builder = writer.builder(&mut rng); + if let Some(n_trees) = n_trees { + builder.n_trees(n_trees); + } + builder.build(&mut wtxn)?; wtxn.commit().unwrap(); println!("Took {:.2?} to build", now.elapsed()); diff --git a/src/lib.rs b/src/lib.rs index 26df5b53..ee5687d3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ //! use std::num::NonZeroUsize; //! //! use arroy::distances::Euclidean; -//! use arroy::{Database as ArroyDatabase, Writer, Reader, BuildOption}; +//! use arroy::{Database as ArroyDatabase, Writer, Reader}; //! use rand::rngs::StdRng; //! use rand::{Rng, SeedableRng}; //! @@ -40,7 +40,7 @@ //! //! // You can specify the number of trees to use or specify None. //! let mut rng = StdRng::seed_from_u64(42); -//! writer.build(&mut wtxn, &mut rng, &BuildOption::new())?; +//! writer.builder(&mut rng).build(&mut wtxn)?; //! //! // By committing, other readers can query the database in parallel. //! 
wtxn.commit()?; @@ -98,7 +98,7 @@ use node::{Node, NodeCodec}; use node_id::{NodeId, NodeMode}; pub use reader::{QueryBuilder, Reader}; pub use stats::{Stats, TreeStats}; -pub use writer::{BuildOption, Writer}; +pub use writer::{ArroyBuilder, Writer}; /// The set of types used by the [`Distance`] trait. pub mod internals { diff --git a/src/tests/binary_quantized.rs b/src/tests/binary_quantized.rs index 8e98e2a7..9e784f5d 100644 --- a/src/tests/binary_quantized.rs +++ b/src/tests/binary_quantized.rs @@ -1,7 +1,7 @@ use crate::{ distance::BinaryQuantizedEuclidean, tests::{create_database, rng}, - BuildOption, Writer, + Writer, }; #[test] @@ -41,7 +41,7 @@ fn write_and_retrieve_binary_quantized_vector() { ] "###); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" diff --git a/src/tests/reader.rs b/src/tests/reader.rs index 4b306193..959cd580 100644 --- a/src/tests/reader.rs +++ b/src/tests/reader.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::*; use crate::distance::Angular; use crate::distances::{Euclidean, Manhattan}; -use crate::{BuildOption, ItemId, Reader, Writer}; +use crate::{ItemId, Reader, Writer}; pub struct NnsRes(pub Option>); @@ -44,7 +44,7 @@ fn open_db_with_wrong_dimension() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -60,7 +60,7 @@ fn open_db_with_wrong_distance() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut 
wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -82,7 +82,7 @@ fn search_in_db_with_a_single_vector() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.00397, 0.553, 0.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + writer.builder(&mut rng()).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -105,7 +105,7 @@ fn two_dimension_on_a_line() { writer.add_item(&mut wtxn, i, &[i as f32, 0.0]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); + writer.builder(&mut rng()).n_trees(50).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -154,7 +154,7 @@ fn two_dimension_on_a_column() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); + writer.builder(&mut rng()).n_trees(50).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -179,7 +179,7 @@ fn get_item_ids() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); + writer.builder(&mut rng()).n_trees(50).build(&mut wtxn).unwrap(); let reader = Reader::::open(&wtxn, 0, handle.database).unwrap(); let ret = reader.item_ids(); @@ -202,7 +202,7 @@ fn filtering() { writer.add_item(&mut wtxn, i, &[0.0, i as f32]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(50)).unwrap(); + writer.builder(&mut rng()).n_trees(50).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -229,7 +229,7 @@ fn search_in_empty_database() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + 
writer.builder(&mut rng()).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let rtxn = handle.env.read_txn().unwrap(); @@ -261,7 +261,7 @@ fn try_reading_in_a_non_built_database() { // we build the database once to get valid metadata let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + writer.builder(&mut rng()).build(&mut wtxn).unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); // We don't build the database; this leaves the database in a corrupted state diff --git a/src/tests/writer.rs b/src/tests/writer.rs index 08f7648b..2dd069dc 100644 --- a/src/tests/writer.rs +++ b/src/tests/writer.rs @@ -4,7 +4,7 @@ use rand::Rng; use super::{create_database, rng}; use crate::distance::{Angular, BinaryQuantizedAngular, DotProduct, Euclidean}; -use crate::{BuildOption, Database, Reader, Writer}; +use crate::{Database, Reader, Writer}; #[test] fn clear_small_database() { @@ -20,11 +20,11 @@ fn clear_small_database() { let zero_writer = Writer::new(database, 0, 3); zero_writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); zero_writer.clear(&mut wtxn).unwrap(); - zero_writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + zero_writer.builder(&mut rng()).build(&mut wtxn).unwrap(); let one_writer = Writer::new(database, 1, 3); one_writer.add_item(&mut wtxn, 0, &[1.0, 2.0, 3.0]).unwrap(); - one_writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + one_writer.builder(&mut rng()).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let mut wtxn = env.write_txn().unwrap(); @@ -43,7 +43,7 @@ fn use_u32_max_minus_one_for_a_vec() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, u32::MAX - 1, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut 
wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -62,7 +62,7 @@ fn use_u32_max_for_a_vec() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, u32::MAX, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -81,7 +81,7 @@ fn write_one_vector() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), &BuildOption::new()).unwrap(); + writer.builder(&mut rng()).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -100,7 +100,7 @@ fn write_one_vector_in_one_tree() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -119,7 +119,7 @@ fn write_one_vector_in_multiple_trees() { let writer = Writer::new(handle.database, 0, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(10)).unwrap(); + writer.builder(&mut rng()).n_trees(10).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -142,7 +142,7 @@ fn write_vectors_until_there_is_a_descendants() { writer.add_item(&mut wtxn, id, &[i, i, i]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -167,7 +167,7 @@ fn write_vectors_until_there_is_a_split() { writer.add_item(&mut wtxn, id, &[i, i, 
i]).unwrap(); } - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -194,7 +194,7 @@ fn write_and_update_lot_of_random_points() { writer.add_item(&mut wtxn, id, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(10)).unwrap(); + writer.builder(&mut rng).n_trees(10).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle); @@ -204,7 +204,7 @@ fn write_and_update_lot_of_random_points() { let vector: [f32; 30] = std::array::from_fn(|_| rng.gen()); writer.add_item(&mut wtxn, id, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(10)).unwrap(); + writer.builder(&mut rng).n_trees(10).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle); @@ -218,7 +218,7 @@ fn write_multiple_indexes() { for i in 0..5 { let writer = Writer::new(handle.database, i, 3); writer.add_item(&mut wtxn, 0, &[0.0, 1.0, 2.0]).unwrap(); - writer.build(&mut wtxn, &mut rng(), BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng()).n_trees(1).build(&mut wtxn).unwrap(); } wtxn.commit().unwrap(); @@ -268,7 +268,7 @@ fn write_random_vectors_to_random_indexes() { let vector: [f32; 10] = std::array::from_fn(|_| rng.gen()); writer.add_item(&mut wtxn, i, &vector).unwrap(); } - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); } wtxn.commit().unwrap(); } @@ -283,7 +283,7 @@ fn overwrite_one_item_incremental() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -309,7 +309,7 @@ fn 
overwrite_one_item_incremental() { writer.add_item(&mut wtxn, 3, &[6., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -338,7 +338,7 @@ fn delete_one_item_in_a_one_item_db() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -354,7 +354,7 @@ fn delete_one_item_in_a_one_item_db() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -378,7 +378,7 @@ fn delete_document_in_an_empty_index_74() { let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); @@ -398,8 +398,8 @@ fn delete_document_in_an_empty_index_74() { let writer2 = Writer::new(handle.database, 1, 2); writer2.del_item(&mut wtxn, 0).unwrap(); - writer1.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); - writer2.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer1.builder(&mut rng).build(&mut wtxn).unwrap(); + writer2.builder(&mut rng).build(&mut wtxn).unwrap(); let reader = Reader::open(&wtxn, 1, handle.database).unwrap(); let ret = reader.nns(10).by_vector(&wtxn, &[0., 0.]).unwrap(); @@ -432,7 +432,7 @@ fn delete_one_item_in_a_descendant() { // first, insert a bunch of items writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); 
writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -449,7 +449,7 @@ fn delete_one_item_in_a_descendant() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -472,7 +472,7 @@ fn delete_one_leaf_in_a_split() { writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); writer.add_item(&mut wtxn, 2, &[2., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -491,7 +491,7 @@ fn delete_one_leaf_in_a_split() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); // after deleting the leaf, the split node should be replaced by a descendant @@ -514,7 +514,7 @@ fn delete_one_item_in_a_single_document_database() { // first, insert a bunch of elements writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -530,7 +530,7 @@ fn delete_one_item_in_a_single_document_database() { writer.del_item(&mut wtxn, 0).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -551,7 +551,7 @@ fn 
delete_one_item() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -576,7 +576,7 @@ fn delete_one_item() { writer.del_item(&mut wtxn, 3).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -600,7 +600,7 @@ fn delete_one_item() { writer.del_item(&mut wtxn, 1).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -623,7 +623,7 @@ fn add_one_item_incrementally_in_an_empty_db() { let mut rng = rng(); let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -635,7 +635,7 @@ fn add_one_item_incrementally_in_an_empty_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -654,7 +654,7 @@ fn add_one_item_incrementally_in_a_one_item_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, 
BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -668,7 +668,7 @@ fn add_one_item_incrementally_in_a_one_item_db() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -689,7 +689,7 @@ fn add_one_item_incrementally_to_create_a_split_node() { let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 0, &[0., 0.]).unwrap(); writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -704,7 +704,7 @@ fn add_one_item_incrementally_to_create_a_split_node() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.add_item(&mut wtxn, 2, &[2., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -729,7 +729,7 @@ fn add_one_item_incrementally() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -754,7 +754,7 @@ fn add_one_item_incrementally() { writer.add_item(&mut wtxn, 25, &[25., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, 
BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -781,7 +781,7 @@ fn add_one_item_incrementally() { writer.add_item(&mut wtxn, 8, &[8., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -817,7 +817,7 @@ fn delete_extraneous_tree() { writer.add_item(&mut wtxn, i, &[i as f32, 0., 0., 0.]).unwrap(); } // 5 nodes of 4 dimensions should create 3 trees by default. - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -839,7 +839,7 @@ fn delete_extraneous_tree() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(2)).unwrap(); + writer.builder(&mut rng).n_trees(2).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -859,7 +859,7 @@ fn delete_extraneous_tree() { let mut wtxn = handle.env.write_txn().unwrap(); let writer = Writer::new(handle.database, 0, 2); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -886,7 +886,7 @@ fn reuse_node_id() { for i in 0..6 { writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap(); } - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -911,7 +911,7 @@ fn reuse_node_id() { // if we delete the 1 it should free the node id 0 
writer.del_item(&mut wtxn, 1).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -934,7 +934,7 @@ fn reuse_node_id() { // if we re-insert the 1 the node id 0 should be re-used writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap(); - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(1)).unwrap(); + writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -958,7 +958,7 @@ fn reuse_node_id() { let writer = Writer::new(handle.database, 0, 2); // if we now build a new tree, the id 1 should be re-used - writer.build(&mut wtxn, &mut rng, BuildOption::new().with_n_trees(2)).unwrap(); + writer.builder(&mut rng).n_trees(2).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); insta::assert_snapshot!(handle, @r###" @@ -997,7 +997,7 @@ fn need_build() { writer.need_build(&wtxn).unwrap(), "because metadata are missing and an item has been updated" ); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); let writer = Writer::new(handle.database, 0, 2); writer.del_item(&mut wtxn, 0).unwrap(); @@ -1013,17 +1013,17 @@ fn prepare_changing_distance() { writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); let writer = Writer::new(handle.database, 1, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); let writer = 
Writer::new(handle.database, 2, 2); writer.add_item(&mut wtxn, 0, &[0.0, 0.0]).unwrap(); writer.add_item(&mut wtxn, 1, &[1.0, 1.0]).unwrap(); writer.add_item(&mut wtxn, 3, &[3.0, 3.0]).unwrap(); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); let mut wtxn = handle.env.write_txn().unwrap(); @@ -1032,7 +1032,7 @@ fn prepare_changing_distance() { let writer = writer.prepare_changing_distance::(&mut wtxn).unwrap(); assert!(writer.need_build(&wtxn).unwrap(), "after changing the distance"); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); wtxn.commit().unwrap(); // TODO: this should not works, see https://github.com/meilisearch/arroy/issues/92 @@ -1040,5 +1040,5 @@ fn prepare_changing_distance() { let writer = Writer::new(handle.database, 1, 2); writer.del_item(&mut wtxn, 0).unwrap(); assert!(writer.need_build(&wtxn).unwrap(), "because an item has been updated"); - writer.build(&mut wtxn, &mut rng, &BuildOption::new()).unwrap(); + writer.builder(&mut rng).build(&mut wtxn).unwrap(); } diff --git a/src/writer.rs b/src/writer.rs index 5475cc16..6130266c 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -28,72 +28,77 @@ use crate::{ }; /// The options available when building the arroy database. -#[derive(Default, Clone)] -pub struct BuildOption { - /// The number of trees to build. If `None` arroy will determine the best amount to build for your number of vectors itself. - pub n_trees: Option, - /// Configure the maximum number of items stored in a descendant node. - /// This is only applied to the newly created or updated tree node. - /// If the value is modified while working on an already existing database, - /// the nodes that don't need to be updated won't be recreated. 
- pub split_after: Option, +pub struct ArroyBuilder<'a, D: Distance, R: Rng + SeedableRng> { + writer: &'a Writer, + rng: &'a mut R, + inner: BuildOption, } -impl BuildOption { - /// Create a new `BuildOption`. - /// - /// # Example - /// - /// ``` - /// use arroy::BuildOption; - /// BuildOption::new(); - /// ``` - pub fn new() -> Self { - Self::default() - } +/// The options available when building the arroy database. +struct BuildOption { + n_trees: Option, + split_after: Option, +} +impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> { /// The number of trees to build. If not set arroy will determine the best amount to build for your number of vectors itself. - /// See also `[Self::with_maybe_n_trees]`. /// /// # Example /// + /// ```no_run + /// # use arroy::{Writer, distances::Euclidean}; + /// # let (writer, wtxn): (Writer, heed::RwTxn) = todo!(); + /// use rand::rngs::StdRng; + /// use rand::SeedableRng; + /// let mut rng = StdRng::seed_from_u64(13); + /// writer.builder(&mut rng).n_trees(10).build(&mut wtxn); /// ``` - /// use arroy::BuildOption; - /// BuildOption::new().with_n_trees(10); - /// ``` - pub fn with_n_trees(&mut self, n_trees: usize) -> &mut Self { - self.n_trees = Some(n_trees); + pub fn n_trees(&mut self, n_trees: usize) -> &mut Self { + self.inner.n_trees = Some(n_trees); self } - /// The number of trees to build. If `None` arroy will determine the best amount to build for your number of vectors itself. - /// See also `[Self::with_n_trees]`. + /// Configure the maximum number of items stored in a descendant node. + /// This is only applied to the newly created or updated tree node. + /// If the value is modified while working on an already existing database, + /// the nodes that don't need to be updated won't be recreated. 
/// /// # Example /// + /// ```no_run + /// # use arroy::{Writer, distances::Euclidean}; + /// # let (writer, wtxn): (Writer, heed::RwTxn) = todo!(); + /// use rand::rngs::StdRng; + /// use rand::SeedableRng; + /// let mut rng = StdRng::seed_from_u64(92); + /// writer.builder(&mut rng).split_after(1000).build(&mut wtxn); /// ``` - /// use arroy::BuildOption; - /// BuildOption::new().with_maybe_n_trees(Some(10)); - /// ``` - pub fn with_maybe_n_trees(&mut self, n_trees: Option) -> &mut Self { - self.n_trees = n_trees; + pub fn split_after(&mut self, split_after: usize) -> &mut Self { + self.inner.split_after = Some(split_after); self } - /// Configure the maximum number of items stored in a descendant node. - /// This is only applied to the newly created or updated tree node. - /// If the value is modified while working on an already existing database, - /// the nodes that don't need to be updated won't be recreated. + /// Generates a forest of `n_trees` trees. + /// + /// More trees give higher precision when querying at the cost of more disk usage. + /// After calling build, no more items can be added. + /// + /// This function is using rayon to spawn threads. It can be configured + /// by using the [`rayon::ThreadPoolBuilder`] and the + /// [`rayon::ThreadPool::install`] to use it. 
/// /// # Example /// + /// ```no_run + /// # use arroy::{Writer, distances::Euclidean}; + /// # let (writer, wtxn): (Writer, heed::RwTxn) = todo!(); + /// use rand::rngs::StdRng; + /// use rand::SeedableRng; + /// let mut rng = StdRng::seed_from_u64(92); + /// writer.builder(&mut rng).build(&mut wtxn); /// ``` - /// use arroy::BuildOption; - /// BuildOption::new().with_split_after(1000); - /// ``` - pub fn with_split_after(&mut self, split_after: usize) -> &mut Self { - self.split_after = Some(split_after); - self + pub fn build(&mut self, wtxn: &mut RwTxn) -> Result<()> { + self.writer.build(wtxn, self.rng, &self.inner) } } @@ -327,16 +332,13 @@ impl Writer { n <= max_in_descendant } - /// Generates a forest of `n_trees` trees. - /// - /// More trees give higher precision when querying at the cost of more disk usage. - /// After calling build, no more items can be added. - /// - /// This function is using rayon to spawn threads. It can be configured - /// by using the [`rayon::ThreadPoolBuilder`] and the - /// [`rayon::ThreadPool::install`] to use it. - pub fn build( - self, + /// Returns an [`ArroyBuilder`] to configure the available options to build the database. 
+ pub fn builder<'a, R: Rng + SeedableRng>(&'a self, rng: &'a mut R) -> ArroyBuilder<'a, D, R> { + ArroyBuilder { writer: self, rng, inner: BuildOption { n_trees: None, split_after: None } } + } + + fn build( + &self, wtxn: &mut RwTxn, rng: &mut R, options: &BuildOption, From 5cf3e2d7f8e477aed0dacbb871426cef2b3e5492 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 25 Sep 2024 17:24:34 +0200 Subject: [PATCH 4/8] fix the fuzzer --- examples/fuzz.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fuzz.rs b/examples/fuzz.rs index 4256a79f..105997d2 100644 --- a/examples/fuzz.rs +++ b/examples/fuzz.rs @@ -93,7 +93,7 @@ fn main() -> Result<()> { Operation::Delete(doc) => drop(writer.del_item(&mut wtxn, doc.id)?), } } - writer.build(&mut wtxn, &mut rng_arroy, None)?; + writer.builder(&mut rng_arroy).build(&mut wtxn)?; wtxn.commit()?; let rtxn = env.read_txn()?; let reader = Reader::::open(&rtxn, 0, database)?; From aa31841efc7e26731b0e27f1c6bf7e1724c96277 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 26 Sep 2024 10:07:51 +0200 Subject: [PATCH 5/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- src/writer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/writer.rs b/src/writer.rs index 6130266c..60d39ef0 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -41,7 +41,7 @@ struct BuildOption { } impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> { - /// The number of trees to build. If not set arroy will determine the best amount to build for your number of vectors itself. + /// The number of trees to build. If not set arroy will determine the best amount to build for your number of vectors by itself. /// /// # Example /// @@ -59,6 +59,7 @@ impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> { } /// Configure the maximum number of items stored in a descendant node. 
+ /// /// This is only applied to the newly created or updated tree node. /// If the value is modified while working on an already existing database, /// the nodes that don't need to be updated won't be recreated. @@ -85,7 +86,7 @@ impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> { /// /// This function is using rayon to spawn threads. It can be configured /// by using the [`rayon::ThreadPoolBuilder`] and the - /// [`rayon::ThreadPool::install`] to use it. + /// [`rayon::ThreadPool::install`]. /// /// # Example /// From 80206c18c53585bc704021373b3430bea6df7cb2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 26 Sep 2024 10:08:20 +0200 Subject: [PATCH 6/8] Update src/reader.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- src/reader.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/reader.rs b/src/reader.rs index d79ad90e..b4586d64 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -265,6 +265,7 @@ impl<'t, D: Distance> Reader<'t, D> { } /// Return a [`QueryBuilder`] that lets you configure and execute a search request. + /// /// You must provide the number of items you want to receive. pub fn nns(&self, count: usize) -> QueryBuilder { QueryBuilder { reader: self, count, search_k: None, oversampling: None, candidates: None } From 60d8b2d257afc6f2e442afb7068e0399c501e397 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 26 Sep 2024 10:09:15 +0200 Subject: [PATCH 7/8] remove outdated sentence --- src/writer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/writer.rs b/src/writer.rs index 60d39ef0..eaf64c7d 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -82,7 +82,6 @@ impl<'a, D: Distance, R: Rng + SeedableRng> ArroyBuilder<'a, D, R> { /// Generates a forest of `n_trees` trees. /// /// More trees give higher precision when querying at the cost of more disk usage. - /// After calling build, no more items can be added. 
/// /// This function is using rayon to spawn threads. It can be configured /// by using the [`rayon::ThreadPoolBuilder`] and the From be5aa28d3a7f0bb0995e9747a8eb8be4602342ed Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 26 Sep 2024 10:23:20 +0200 Subject: [PATCH 8/8] update the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7558e6f2..8d3f21a9 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![dependency status](https://deps.rs/repo/github/meilisearch/arroy/status.svg)](https://deps.rs/repo/github/meilisearch/arroy) [![Build](https://github.com/meilisearch/arroy/actions/workflows/rust.yml/badge.svg)](https://github.com/meilisearch/arroy/actions/workflows/rust.yml) -Arroy ([Approximate Rearest Reighbors][1] Oh Yeah) is a Rust library with the interface of the [Annoy Python library][2] to search for vectors in space that are close to a given query vector. It is based on LMDB, a memory-mapped key-value store, so many processes may share the same data and atomically modify the vectors. +Arroy ([Approximate Rearest Reighbors][1] Oh Yeah) is a Rust library with an interface close to that of the [Annoy Python library][2] to search for vectors in space that are near a targeted vector. It is based on LMDB, a memory-mapped key-value store, so many processes may share the same data and atomically modify the vectors. ## Background