Skip to content

Commit 2bfe350

Browse files
committed
feat: add tree::Editor
With it, it's easy to efficiently alter existing trees or build entirely new ones.
1 parent 71bf808 commit 2bfe350

File tree

11 files changed

+990
-277
lines changed

11 files changed

+990
-277
lines changed

Cargo.lock

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gix-object/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ gix-features = { version = "^0.38.2", path = "../gix-features", features = [
4141
"progress",
4242
] }
4343
gix-hash = { version = "^0.14.2", path = "../gix-hash" }
44+
gix-hashtable = { version = "^0.5.2", path = "../gix-hashtable" }
4445
gix-validate = { version = "^0.9.0", path = "../gix-validate" }
4546
gix-actor = { version = "^0.32.0", path = "../gix-actor" }
4647
gix-date = { version = "^0.9.0", path = "../gix-date" }
@@ -64,6 +65,8 @@ document-features = { version = "0.2.0", optional = true }
6465
criterion = "0.5.1"
6566
pretty_assertions = "1.0.0"
6667
gix-testtools = { path = "../tests/tools" }
68+
gix-odb = { path = "../gix-odb" }
69+
termtree = "0.5.1"
6770

6871
[package.metadata.docs.rs]
6972
all-features = true

gix-object/src/tree/editor.rs

+267
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
use crate::tree::EntryKind;
2+
use crate::{tree, Tree};
3+
use bstr::{BStr, BString, ByteSlice, ByteVec};
4+
use gix_hash::ObjectId;
5+
use gix_hashtable::hash_map::Entry;
6+
use std::cmp::Ordering;
7+
8+
/// The state needed to apply edits instantly to in-memory trees.
9+
///
10+
/// It's made so that each tree is looked at in the object database at most once, and held in memory for
11+
/// all edits until everything is flushed to write all changed trees.
12+
///
13+
/// The editor is optimized to edit existing trees, but can deal with building entirely new trees as well
14+
/// with some penalties.
15+
///
16+
/// ### Note
17+
///
18+
/// For reasons of efficiency, internally a SHA1 based hashmap is used to avoid having to store full paths
19+
/// to each edited tree. The chance of collision is low, but could be engineered to overwrite or write into
20+
/// an unintended tree.
21+
pub struct Editor<'a> {
22+
/// A way to lookup trees.
23+
find: &'a dyn crate::FindExt,
24+
/// All trees we currently hold in memory. Each of these may change while adding and removing entries.
25+
/// null-object-ids mark tree-entries whose value we don't know yet, they are placeholders that will be
26+
/// dropped when writing at the latest.
27+
trees: gix_hashtable::HashMap<ObjectId, Tree>,
28+
/// A buffer to build up paths when finding the tree to edit.
29+
path_buf: BString,
30+
/// Our buffer for storing tree-data in, right before decoding it.
31+
tree_buf: Vec<u8>,
32+
}
33+
34+
/// Lifecycle
35+
impl<'a> Editor<'a> {
36+
/// Create a new editor that uses `root` as base for all edits. Use `find` to lookup existing
37+
/// trees when edits are made. Each tree will only be looked-up once and then edited in place from
38+
/// that point on.
39+
pub fn new(root: Tree, find: &'a dyn crate::FindExt) -> Self {
40+
Editor {
41+
find,
42+
trees: gix_hashtable::HashMap::from_iter(Some((empty_path_hash(), root))),
43+
path_buf: Vec::with_capacity(256).into(),
44+
tree_buf: Vec::with_capacity(512),
45+
}
46+
}
47+
}
48+
49+
/// Operations
50+
impl<'a> Editor<'a> {
51+
/// Write the entire in-memory state of all changed trees (and only changed trees) to `out`.
52+
///
53+
/// The last call to `out` will be the changed root tree, whose object-id will also be returned.
54+
/// `out` is free to do any kind of additional validation, like to assure that all entries in the tree exist.
55+
/// We don't assure that as there is no validation that inserted entries are valid object ids.
56+
///
57+
/// Future calls to [`upsert`](Self::upsert) or similar will keep working on the last seen state of the
58+
/// just-written root-tree.
59+
/// If this is not desired, use [set_root()](Self::set_root()).
60+
pub fn write<E>(&mut self, mut out: impl FnMut(&Tree) -> Result<ObjectId, E>) -> Result<ObjectId, E> {
61+
assert_ne!(self.trees.len(), 0, "there is at least the root tree");
62+
63+
// back is for children, front is for parents.
64+
let mut parents = vec![(
65+
None::<usize>,
66+
BString::default(),
67+
self.trees
68+
.remove(&empty_path_hash())
69+
.expect("root tree is always present"),
70+
)];
71+
let mut children = Vec::new();
72+
while let Some((parent_idx, mut rela_path, mut tree)) = children.pop().or_else(|| parents.pop()) {
73+
let mut all_entries_unchanged_or_written = true;
74+
for entry in &tree.entries {
75+
if entry.mode.is_tree() {
76+
let prev_len = push_path_component(&mut rela_path, &entry.filename);
77+
if let Some(sub_tree) = self.trees.remove(&path_hash(&rela_path)) {
78+
all_entries_unchanged_or_written = false;
79+
let next_parent_idx = parents.len();
80+
children.push((Some(next_parent_idx), rela_path.clone(), sub_tree));
81+
}
82+
rela_path.truncate(prev_len);
83+
}
84+
}
85+
if all_entries_unchanged_or_written {
86+
tree.entries.retain(|e| !e.oid.is_null());
87+
if let Some((_, _, parent_to_adjust)) =
88+
parent_idx.map(|idx| parents.get_mut(idx).expect("always present, pointing towards zero"))
89+
{
90+
let name = filename(rela_path.as_bstr());
91+
let entry_idx = parent_to_adjust
92+
.entries
93+
.binary_search_by(|e| cmp_entry_with_name(e, name, true))
94+
.expect("the parent always knows us by name");
95+
if tree.entries.is_empty() {
96+
parent_to_adjust.entries.remove(entry_idx);
97+
} else {
98+
parent_to_adjust.entries[entry_idx].oid = out(&tree)?;
99+
}
100+
} else if parents.is_empty() {
101+
debug_assert!(children.is_empty(), "we consume children before parents");
102+
debug_assert!(rela_path.is_empty(), "this should always be the root tree");
103+
104+
// There may be left-over trees if they are replaced with blobs for example.
105+
let root_tree_id = out(&tree)?;
106+
self.trees.clear();
107+
self.trees.insert(empty_path_hash(), tree);
108+
return Ok(root_tree_id);
109+
} else if !tree.entries.is_empty() {
110+
out(&tree)?;
111+
}
112+
} else {
113+
parents.push((parent_idx, rela_path, tree));
114+
}
115+
}
116+
117+
unreachable!("we exit as soon as everything is consumed")
118+
}
119+
120+
/// Insert a new entry of `kind` with `id` at `rela_path`, an iterator over each path component in the tree,
121+
/// like `a/b/c`. Names are matched case-sensitively.
122+
///
123+
/// Existing leaf-entries will be overwritten unconditionally, and it is assumed that `id` is available in the object database
124+
/// or will be made available at a later point to assure the integrity of the produced tree.
125+
///
126+
/// Intermediate trees will be created if they don't exist in the object database, otherwise they will be loaded and entries
127+
/// will be inserted into them instead.
128+
///
129+
/// Note that `id` can be [null](ObjectId::null()) to create a placeholder. These will not be written, and paths leading
130+
/// through them will not be considered a problem.
131+
///
132+
/// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed
133+
/// in Git trees.
134+
pub fn upsert<I, C>(
135+
&mut self,
136+
rela_path: I,
137+
kind: EntryKind,
138+
id: ObjectId,
139+
) -> Result<&mut Self, crate::find::existing_object::Error>
140+
where
141+
I: IntoIterator<Item = C>,
142+
C: AsRef<BStr>,
143+
{
144+
let mut cursor = self.trees.get_mut(&empty_path_hash()).expect("root is always present");
145+
self.path_buf.clear();
146+
let mut rela_path = rela_path.into_iter().peekable();
147+
while let Some(name) = rela_path.next() {
148+
let name = name.as_ref();
149+
let is_last = rela_path.peek().is_none();
150+
let mut needs_sorting = false;
151+
let current_level_must_be_tree = !is_last || kind == EntryKind::Tree;
152+
let check_type_change = |entry: &tree::Entry| entry.mode.is_tree() != current_level_must_be_tree;
153+
let tree_to_lookup = match cursor
154+
.entries
155+
.binary_search_by(|e| cmp_entry_with_name(e, name, false))
156+
.or_else(|file_insertion_idx| {
157+
cursor
158+
.entries
159+
.binary_search_by(|e| cmp_entry_with_name(e, name, true))
160+
.map_err(|dir_insertion_index| {
161+
if current_level_must_be_tree {
162+
dir_insertion_index
163+
} else {
164+
file_insertion_idx
165+
}
166+
})
167+
}) {
168+
Ok(idx) => {
169+
let entry = &mut cursor.entries[idx];
170+
if is_last {
171+
// unconditionally overwrite what's there.
172+
entry.oid = id;
173+
needs_sorting = check_type_change(entry);
174+
entry.mode = kind.into();
175+
None
176+
} else if entry.mode.is_tree() {
177+
// Possibly lookup the existing tree on our way down the path.
178+
Some(entry.oid)
179+
} else {
180+
// it is no tree, but we are traversing a path, so turn it into one.
181+
entry.oid = id.kind().null();
182+
needs_sorting = check_type_change(entry);
183+
entry.mode = EntryKind::Tree.into();
184+
None
185+
}
186+
}
187+
Err(insertion_idx) => {
188+
cursor.entries.insert(
189+
insertion_idx,
190+
tree::Entry {
191+
filename: name.into(),
192+
mode: if is_last { kind.into() } else { EntryKind::Tree.into() },
193+
oid: if is_last { id } else { id.kind().null() },
194+
},
195+
);
196+
if is_last {
197+
break;
198+
}
199+
None
200+
}
201+
};
202+
if needs_sorting {
203+
cursor.entries.sort();
204+
}
205+
if is_last {
206+
break;
207+
}
208+
push_path_component(&mut self.path_buf, name);
209+
let path_id = path_hash(&self.path_buf);
210+
cursor = match self.trees.entry(path_id) {
211+
Entry::Occupied(e) => e.into_mut(),
212+
Entry::Vacant(e) => e.insert(
213+
if let Some(tree_id) = tree_to_lookup.filter(|tree_id| !tree_id.is_empty_tree()) {
214+
self.find.find_tree(&tree_id, &mut self.tree_buf)?.into()
215+
} else {
216+
Tree::default()
217+
},
218+
),
219+
};
220+
}
221+
Ok(self)
222+
}
223+
224+
/// Set the root tree of the modification to `root`, assuring it has a well-known state.
225+
///
226+
/// Note that this erases all previous edits.
227+
///
228+
/// This is useful if the same editor is re-used for various trees.
229+
pub fn set_root(&mut self, root: Tree) -> &mut Self {
230+
self.trees.clear();
231+
self.trees.insert(empty_path_hash(), root);
232+
self
233+
}
234+
}
235+
236+
/// Compare the entry `a` with `filename`, which is treated as the name of a tree if `is_tree` is `true`.
///
/// The bytes both names have in common are compared first. If these are equal, the next byte of each name
/// decides, with names of trees being compared as if they were followed by a `/`.
fn cmp_entry_with_name(a: &tree::Entry, filename: &BStr, is_tree: bool) -> Ordering {
    let shared_len = a.filename.len().min(filename.len());
    match a.filename[..shared_len].cmp(&filename[..shared_len]) {
        Ordering::Equal => {
            // A name that ends here yields `None`, which sorts before any byte, unless it's a tree.
            let lhs = a.filename.get(shared_len).or_else(|| a.mode.is_tree().then_some(&b'/'));
            let rhs = filename.get(shared_len).or_else(|| is_tree.then_some(&b'/'));
            lhs.cmp(&rhs)
        }
        unequal => unequal,
    }
}
244+
245+
/// Return everything after the last `/` in `path`, or all of `path` if it contains no `/`.
fn filename(path: &BStr) -> &BStr {
    match path.rfind_byte(b'/') {
        Some(slash_pos) => &path[slash_pos + 1..],
        None => path,
    }
}
248+
249+
fn empty_path_hash() -> ObjectId {
250+
gix_features::hash::hasher(gix_hash::Kind::Sha1).digest().into()
251+
}
252+
253+
fn path_hash(path: &[u8]) -> ObjectId {
254+
let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1);
255+
hasher.update(path);
256+
hasher.digest().into()
257+
}
258+
259+
fn push_path_component(base: &mut BString, component: &[u8]) -> usize {
260+
let prev_len = base.len();
261+
debug_assert!(base.last() != Some(&b'/'));
262+
if !base.is_empty() {
263+
base.push_byte(b'/');
264+
}
265+
base.push_str(component);
266+
prev_len
267+
}

gix-object/src/tree/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ use crate::{
55
tree,
66
};
77

8+
mod editor;
9+
pub use editor::Editor;
10+
811
mod ref_iter;
912
///
1013
pub mod write;

gix-object/src/tree/write.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ impl crate::WriteTo for Tree {
2727
/// Serialize this tree to `out` in the git internal format.
2828
fn write_to(&self, out: &mut dyn io::Write) -> io::Result<()> {
2929
debug_assert_eq!(
30+
&self.entries,
3031
&{
3132
let mut entries_sorted = self.entries.clone();
3233
entries_sorted.sort();
3334
entries_sorted
3435
},
35-
&self.entries,
3636
"entries for serialization must be sorted by filename"
3737
);
3838
let mut buf = Default::default();

0 commit comments

Comments
 (0)