Skip to content

Commit d72b469

Browse files
authored
Merge pull request #99 from meilisearch/store-the-updated-id-in-lmdb
Store the list of updated IDs directly in LMDB instead of a roaring bitmap
2 parents 24083df + d9a5694 commit d72b469

File tree

6 files changed

+76
-58
lines changed

6 files changed

+76
-58
lines changed

src/error.rs

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ impl Error {
7575
NodeMode::Item => "Item",
7676
NodeMode::Tree => "Tree",
7777
NodeMode::Metadata => "Metadata",
78+
NodeMode::Updated => "Updated",
7879
},
7980
item: key.node.item,
8081
}

src/key.rs

+8-3
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@ use heed::BoxedError;
77
use crate::{NodeId, NodeMode};
88

99
/// This whole structure must fit in a u64 so we can tell LMDB to optimize its storage.
10-
/// The `prefix` is specified by the user and is used to differentiate between multiple arroy indexes.
10+
/// The `index` is specified by the user and is used to differentiate between multiple arroy indexes.
1111
/// The `mode` indicates what we're looking at.
1212
/// The `item` points to a specific node.
1313
/// If the mode is:
1414
/// - `Item`: we're looking at a `Leaf` node.
1515
/// - `Tree`: we're looking at one of the internal nodes generated by arroy. Could be a descendants node or a split plane.
16+
/// - `Updated`: The list of items that have been updated since the last build of the database.
1617
/// - `Metadata`: There is only one item at `0` that contains the header required to read the index.
1718
#[derive(Debug, Copy, Clone)]
1819
pub struct Key {
@@ -32,8 +33,8 @@ impl Key {
3233
Self::new(index, NodeId::metadata())
3334
}
3435

35-
pub const fn updated(index: u16) -> Self {
36-
Self::new(index, NodeId::updated())
36+
pub const fn updated(index: u16, item: u32) -> Self {
37+
Self::new(index, NodeId::updated(item))
3738
}
3839

3940
pub const fn item(index: u16, item: u32) -> Self {
@@ -98,6 +99,10 @@ impl Prefix {
9899
pub const fn tree(index: u16) -> Self {
99100
Self { index, mode: Some(NodeMode::Tree) }
100101
}
102+
103+
pub const fn updated(index: u16) -> Self {
104+
Self { index, mode: Some(NodeMode::Updated) }
105+
}
101106
}
102107

103108
pub enum PrefixCodec {}

src/node_id.rs

+18-6
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,15 @@ use crate::ItemId;
99
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
1010
#[repr(u8)]
1111
pub enum NodeMode {
12+
/// Stores the metadata under the `ItemId` 0
1213
Metadata = 0,
13-
Tree = 1,
14-
Item = 2,
14+
/// Stores the list of all the `ItemId` that have been updated.
15+
/// We only store `Unit` values under the keys.
16+
Updated = 1,
17+
/// The tree nodes are stored under this id.
18+
Tree = 2,
19+
/// The original vectors are stored under this id in `Leaf` structures.
20+
Item = 3,
1521
}
1622

1723
impl TryFrom<u8> for NodeMode {
@@ -21,6 +27,7 @@ impl TryFrom<u8> for NodeMode {
2127
match v {
2228
v if v == NodeMode::Item as u8 => Ok(NodeMode::Item),
2329
v if v == NodeMode::Tree as u8 => Ok(NodeMode::Tree),
30+
v if v == NodeMode::Updated as u8 => Ok(NodeMode::Updated),
2431
v if v == NodeMode::Metadata as u8 => Ok(NodeMode::Metadata),
2532
v => Err(format!("Could not convert {v} as a `NodeMode`.")),
2633
}
@@ -47,8 +54,8 @@ impl NodeId {
4754
Self { mode: NodeMode::Metadata, item: 0 }
4855
}
4956

50-
pub const fn updated() -> Self {
51-
Self { mode: NodeMode::Metadata, item: 1 }
57+
pub const fn updated(item: u32) -> Self {
58+
Self { mode: NodeMode::Updated, item }
5259
}
5360

5461
pub const fn tree(item: u32) -> Self {
@@ -107,11 +114,16 @@ mod test {
107114
assert!(NodeId::tree(1) > NodeId::tree(0));
108115
assert!(NodeId::tree(0) < NodeId::tree(1));
109116

117+
assert!(NodeId::updated(0) == NodeId::updated(0));
118+
assert!(NodeId::updated(1) > NodeId::updated(0));
119+
assert!(NodeId::updated(0) < NodeId::updated(1));
120+
110121
// tree < item whatever is the value
111122
assert!(NodeId::tree(u32::MAX) < NodeId::item(0));
112123

113124
assert!(NodeId::metadata() == NodeId::metadata());
114-
assert!(NodeId::metadata() < NodeId::tree(u32::MAX));
115-
assert!(NodeId::metadata() < NodeId::item(u32::MAX));
125+
assert!(NodeId::metadata() < NodeId::tree(u32::MIN));
126+
assert!(NodeId::metadata() < NodeId::updated(u32::MIN));
127+
assert!(NodeId::metadata() < NodeId::item(u32::MIN));
116128
}
117129
}

src/reader.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::iter::repeat;
44
use std::marker;
55
use std::num::NonZeroUsize;
66

7-
use heed::types::{Bytes, DecodeIgnore};
7+
use heed::types::DecodeIgnore;
88
use heed::RoTxn;
99
use ordered_float::OrderedFloat;
1010
use roaring::RoaringBitmap;
@@ -146,7 +146,13 @@ impl<'t, D: Distance> Reader<'t, D> {
146146
received: D::name(),
147147
});
148148
}
149-
if database.remap_data_type::<Bytes>().get(rtxn, &Key::updated(index))?.is_some() {
149+
if database
150+
.remap_types::<PrefixCodec, DecodeIgnore>()
151+
.prefix_iter(rtxn, &Prefix::updated(index))?
152+
.remap_key_type::<KeyCodec>()
153+
.next()
154+
.is_some()
155+
{
150156
return Err(Error::NeedBuild(index));
151157
}
152158

src/tests/mod.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,7 @@ impl<D: Distance> fmt::Display for DatabaseHandle<D> {
7777
.unwrap();
7878
writeln!(f, "updated_item_ids: {updated_item_ids:?}")?;
7979
}
80-
NodeMode::Metadata => {
81-
panic!()
82-
}
80+
NodeMode::Updated | NodeMode::Metadata => panic!(),
8381
}
8482
}
8583

src/writer.rs

+40-44
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::borrow::Cow;
33
use std::mem;
44
use std::path::PathBuf;
55

6-
use heed::types::{Bytes, DecodeIgnore};
6+
use heed::types::{Bytes, DecodeIgnore, Unit};
77
use heed::{MdbError, PutFlags, RoTxn, RwTxn};
88
use rand::{Rng, SeedableRng};
99
use rayon::iter::repeatn;
@@ -20,7 +20,6 @@ use crate::parallel::{
2020
TmpNodesReader,
2121
};
2222
use crate::reader::item_leaf;
23-
use crate::roaring::RoaringBitmapCodec;
2423
use crate::unaligned_vector::UnalignedVector;
2524
use crate::{
2625
Database, Error, ItemId, Key, Metadata, MetadataCodec, Node, NodeCodec, NodeId, Prefix,
@@ -224,8 +223,10 @@ impl<D: Distance> Writer<D> {
224223
pub fn need_build(&self, rtxn: &RoTxn) -> Result<bool> {
225224
Ok(self
226225
.database
227-
.remap_data_type::<DecodeIgnore>()
228-
.get(rtxn, &Key::updated(self.index))?
226+
.remap_types::<PrefixCodec, DecodeIgnore>()
227+
.prefix_iter(rtxn, &Prefix::updated(self.index))?
228+
.remap_key_type::<KeyCodec>()
229+
.next()
229230
.is_some()
230231
|| self
231232
.database
@@ -266,17 +267,7 @@ impl<D: Distance> Writer<D> {
266267
let vector = UnalignedVector::from_slice(vector);
267268
let leaf = Leaf { header: D::new_header(&vector), vector };
268269
self.database.put(wtxn, &Key::item(self.index, item), &Node::Leaf(leaf))?;
269-
let mut updated = self
270-
.database
271-
.remap_data_type::<RoaringBitmapCodec>()
272-
.get(wtxn, &Key::updated(self.index))?
273-
.unwrap_or_default();
274-
updated.insert(item);
275-
self.database.remap_data_type::<RoaringBitmapCodec>().put(
276-
wtxn,
277-
&Key::updated(self.index),
278-
&updated,
279-
)?;
270+
self.database.remap_data_type::<Unit>().put(wtxn, &Key::updated(self.index, item), &())?;
280271

281272
Ok(())
282273
}
@@ -302,35 +293,19 @@ impl<D: Distance> Writer<D> {
302293
Err(heed::Error::Mdb(MdbError::KeyExist)) => return Err(Error::InvalidItemAppend),
303294
Err(e) => return Err(e.into()),
304295
}
305-
let mut updated = self
306-
.database
307-
.remap_data_type::<RoaringBitmapCodec>()
308-
.get(wtxn, &Key::updated(self.index))?
309-
.unwrap_or_default();
310-
// We cannot append here because we may have removed an item with a larger id before
311-
updated.insert(item);
312-
self.database.remap_data_type::<RoaringBitmapCodec>().put(
313-
wtxn,
314-
&Key::updated(self.index),
315-
&updated,
316-
)?;
296+
// We cannot append here because the items appear after the updated keys
297+
self.database.remap_data_type::<Unit>().put(wtxn, &Key::updated(self.index, item), &())?;
317298

318299
Ok(())
319300
}
320301

321302
/// Deletes an item stored in this database and returns `true` if it existed.
322303
pub fn del_item(&self, wtxn: &mut RwTxn, item: ItemId) -> Result<bool> {
323304
if self.database.delete(wtxn, &Key::item(self.index, item))? {
324-
let mut updated = self
325-
.database
326-
.remap_data_type::<RoaringBitmapCodec>()
327-
.get(wtxn, &Key::updated(self.index))?
328-
.unwrap_or_default();
329-
updated.insert(item);
330-
self.database.remap_data_type::<RoaringBitmapCodec>().put(
305+
self.database.remap_data_type::<Unit>().put(
331306
wtxn,
332-
&Key::updated(self.index),
333-
&updated,
307+
&Key::updated(self.index, item),
308+
&(),
334309
)?;
335310

336311
Ok(true)
@@ -430,7 +405,18 @@ impl<D: Distance> Writer<D> {
430405
}
431406

432407
log::debug!("reset the updated items...");
433-
self.database.delete(wtxn, &Key::updated(self.index))?;
408+
let mut updated_iter = self
409+
.database
410+
.remap_types::<PrefixCodec, DecodeIgnore>()
411+
.prefix_iter_mut(wtxn, &Prefix::updated(self.index))?
412+
.remap_key_type::<KeyCodec>();
413+
while updated_iter.next().transpose()?.is_some() {
414+
// Safe because we don't hold any reference to the database currently
415+
unsafe {
416+
updated_iter.del_current()?;
417+
}
418+
}
419+
drop(updated_iter);
434420

435421
log::debug!("write the metadata...");
436422
let metadata = Metadata {
@@ -448,11 +434,23 @@ impl<D: Distance> Writer<D> {
448434
return Ok(());
449435
}
450436

451-
let updated_items = self
437+
log::debug!("reset and retrieve the updated items...");
438+
let mut updated_items = RoaringBitmap::new();
439+
let mut updated_iter = self
452440
.database
453-
.remap_data_type::<RoaringBitmapCodec>()
454-
.get(wtxn, &Key::updated(self.index))?
455-
.unwrap_or_default();
441+
.remap_types::<PrefixCodec, DecodeIgnore>()
442+
.prefix_iter_mut(wtxn, &Prefix::updated(self.index))?
443+
.remap_key_type::<KeyCodec>();
444+
while let Some((key, _)) = updated_iter.next().transpose()? {
445+
let inserted = updated_items.push(key.node.item);
446+
debug_assert!(inserted, "The keys should be sorted by LMDB");
447+
// Safe because we don't hold any reference to the database currently
448+
unsafe {
449+
updated_iter.del_current()?;
450+
}
451+
}
452+
drop(updated_iter);
453+
456454
// while iterating on the nodes we want to delete all the modified element even if they are being inserted right after.
457455
let to_delete = &updated_items;
458456
let to_insert = &item_indices & &updated_items;
@@ -548,9 +546,6 @@ impl<D: Distance> Writer<D> {
548546
roots.append(&mut thread_roots);
549547
}
550548

551-
log::debug!("reset the updated items...");
552-
self.database.delete(wtxn, &Key::updated(self.index))?;
553-
554549
log::debug!("write the metadata...");
555550
let metadata = Metadata {
556551
dimensions: self.dimensions.try_into().unwrap(),
@@ -774,6 +769,7 @@ impl<D: Distance> Writer<D> {
774769
}
775770
}
776771
NodeMode::Metadata => unreachable!(),
772+
NodeMode::Updated => todo!(),
777773
}
778774
}
779775

0 commit comments

Comments
 (0)