Skip to content

Commit 1a69c40

Browse files
authored
Merge pull request #1785 from GitoxideLabs/improvements
various improvements
2 parents 8df0db2 + f3257f3 commit 1a69c40

File tree

9 files changed

+462
-3
lines changed

9 files changed

+462
-3
lines changed

gix-diff/src/blob/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ pub mod pipeline;
1111
///
1212
pub mod platform;
1313

14+
pub mod unified_diff;
15+
pub use unified_diff::_impl::UnifiedDiff;
16+
1417
/// Information about the diff performed to detect similarity.
1518
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]
1619
pub struct DiffLineStats {

gix-diff/src/blob/unified_diff.rs

+255
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
//! Facilities to produce the unified diff format.
2+
//!
3+
//! Originally based on <https://github.com/pascalkuthe/imara-diff/pull/14>.
4+
5+
/// The number of unchanged lines shown around each change of a unified diff.
///
/// This is similar to the `-U` option of `git diff` or gnu-diff. Where the context of a change would
/// overlap with the previous or the next change, it is reduced accordingly.
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]
pub struct ContextSize {
    /// The number of context lines printed both before and after each change.
    symmetrical: u32,
}
14+
15+
impl Default for ContextSize {
16+
fn default() -> Self {
17+
ContextSize::symmetrical(3)
18+
}
19+
}
20+
21+
/// Instantiation
22+
impl ContextSize {
23+
/// Create a symmetrical context with `n` lines before and after a changed hunk.
24+
pub fn symmetrical(n: u32) -> Self {
25+
ContextSize { symmetrical: n }
26+
}
27+
}
28+
29+
/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff), which receives each hunk as it is produced.
pub trait ConsumeHunk {
    /// The item this instance produces after consuming all hunks.
    type Out;

    /// Consume a single `hunk` in unified diff format, which would be prefixed with `header`.
    /// Both already contain all of their newlines.
    ///
    /// The [`UnifiedDiff`](super::UnifiedDiff) sink wraps its output in an [`std::io::Result`].
    /// Once this method has returned its first error, it will not be called anymore.
    ///
    /// The remaining parameters describe the hunk and carry the same values that are used in the `header`:
    ///
    /// * `before_hunk_start` is the 1-based first line of this hunk in the old file.
    /// * `before_hunk_len` is the amount of lines of this hunk in the old file.
    /// * `after_hunk_start` is the 1-based first line of this hunk in the new file.
    /// * `after_hunk_len` is the amount of lines of this hunk in the new file.
    fn consume_hunk(
        &mut self,
        before_hunk_start: u32,
        before_hunk_len: u32,
        after_hunk_start: u32,
        after_hunk_len: u32,
        header: &str,
        hunk: &[u8],
    ) -> std::io::Result<()>;

    /// Called after the last hunk was consumed to turn this instance into its output.
    fn finish(self) -> Self::Out;
}
57+
58+
pub(super) mod _impl {
59+
use super::{ConsumeHunk, ContextSize};
60+
use bstr::{ByteSlice, ByteVec};
61+
use imara_diff::{intern, Sink};
62+
use intern::{InternedInput, Interner, Token};
63+
use std::hash::Hash;
64+
use std::io::ErrorKind;
65+
use std::ops::Range;
66+
67+
/// A [`Sink`] that creates a textual diff in the format typically output by git or `gnu-diff` if the `-u` option is used,
68+
/// and passes it in full to a consumer.
69+
pub struct UnifiedDiff<'a, T, D>
70+
where
71+
T: Hash + Eq + AsRef<[u8]>,
72+
D: ConsumeHunk,
73+
{
74+
before: &'a [Token],
75+
after: &'a [Token],
76+
interner: &'a Interner<T>,
77+
78+
pos: u32,
79+
before_hunk_start: u32,
80+
after_hunk_start: u32,
81+
before_hunk_len: u32,
82+
after_hunk_len: u32,
83+
/// Symmetrical context before and after the changed hunk.
84+
ctx_size: u32,
85+
86+
buffer: Vec<u8>,
87+
header_buf: String,
88+
delegate: D,
89+
newline: &'a str,
90+
91+
err: Option<std::io::Error>,
92+
}
93+
94+
impl<'a, T, D> UnifiedDiff<'a, T, D>
95+
where
96+
T: Hash + Eq + AsRef<[u8]>,
97+
D: ConsumeHunk,
98+
{
99+
/// Create a new instance to create unified diff using the lines in `input`,
100+
/// which also must be used when running the diff algorithm.
101+
/// `context_size` is the amount of lines around each hunk which will be passed
102+
///to `consume_hunk`.
103+
///
104+
/// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`,
105+
pub fn new(
106+
input: &'a InternedInput<T>,
107+
consume_hunk: D,
108+
newline_separator: &'a str,
109+
context_size: ContextSize,
110+
) -> Self {
111+
Self {
112+
before_hunk_start: 0,
113+
after_hunk_start: 0,
114+
before_hunk_len: 0,
115+
after_hunk_len: 0,
116+
buffer: Vec::with_capacity(8),
117+
header_buf: String::new(),
118+
delegate: consume_hunk,
119+
interner: &input.interner,
120+
before: &input.before,
121+
after: &input.after,
122+
pos: 0,
123+
ctx_size: context_size.symmetrical,
124+
newline: newline_separator,
125+
126+
err: None,
127+
}
128+
}
129+
130+
fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
131+
for &token in tokens {
132+
self.buffer.push_char(prefix);
133+
self.buffer.push_str(&self.interner[token]);
134+
self.buffer.push_str(self.newline.as_bytes());
135+
}
136+
}
137+
138+
fn flush(&mut self) -> std::io::Result<()> {
139+
if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
140+
return Ok(());
141+
}
142+
143+
let end = (self.pos + self.ctx_size).min(self.before.len() as u32);
144+
self.update_pos(end, end);
145+
146+
self.header_buf.clear();
147+
148+
std::fmt::Write::write_fmt(
149+
&mut self.header_buf,
150+
format_args!(
151+
"@@ -{},{} +{},{} @@{nl}",
152+
self.before_hunk_start + 1,
153+
self.before_hunk_len,
154+
self.after_hunk_start + 1,
155+
self.after_hunk_len,
156+
nl = self.newline
157+
),
158+
)
159+
.map_err(|err| std::io::Error::new(ErrorKind::Other, err))?;
160+
self.delegate.consume_hunk(
161+
self.before_hunk_start + 1,
162+
self.before_hunk_len,
163+
self.after_hunk_start + 1,
164+
self.after_hunk_len,
165+
&self.header_buf,
166+
&self.buffer,
167+
)?;
168+
self.buffer.clear();
169+
self.before_hunk_len = 0;
170+
self.after_hunk_len = 0;
171+
Ok(())
172+
}
173+
174+
fn update_pos(&mut self, print_to: u32, move_to: u32) {
175+
self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
176+
let len = print_to - self.pos;
177+
self.pos = move_to;
178+
self.before_hunk_len += len;
179+
self.after_hunk_len += len;
180+
}
181+
}
182+
183+
impl<T, D> Sink for UnifiedDiff<'_, T, D>
184+
where
185+
T: Hash + Eq + AsRef<[u8]>,
186+
D: ConsumeHunk,
187+
{
188+
type Out = std::io::Result<D::Out>;
189+
190+
fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
191+
if self.err.is_some() {
192+
return;
193+
}
194+
if ((self.pos == 0) && (before.start - self.pos > self.ctx_size))
195+
|| (before.start - self.pos > 2 * self.ctx_size)
196+
{
197+
if let Err(err) = self.flush() {
198+
self.err = Some(err);
199+
return;
200+
}
201+
self.pos = before.start - self.ctx_size;
202+
self.before_hunk_start = self.pos;
203+
self.after_hunk_start = after.start - self.ctx_size;
204+
}
205+
self.update_pos(before.start, before.end);
206+
self.before_hunk_len += before.end - before.start;
207+
self.after_hunk_len += after.end - after.start;
208+
self.print_tokens(&self.before[before.start as usize..before.end as usize], '-');
209+
self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
210+
}
211+
212+
fn finish(mut self) -> Self::Out {
213+
if let Err(err) = self.flush() {
214+
self.err = Some(err);
215+
}
216+
if let Some(err) = self.err {
217+
return Err(err);
218+
}
219+
Ok(self.delegate.finish())
220+
}
221+
}
222+
223+
/// An implementation that fails if the input isn't UTF-8.
224+
impl ConsumeHunk for String {
225+
type Out = Self;
226+
227+
fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
228+
self.push_str(header);
229+
self.push_str(
230+
hunk.to_str()
231+
.map_err(|err| std::io::Error::new(ErrorKind::Other, err))?,
232+
);
233+
Ok(())
234+
}
235+
236+
fn finish(self) -> Self::Out {
237+
self
238+
}
239+
}
240+
241+
/// An implementation that writes hunks into a byte buffer.
242+
impl ConsumeHunk for Vec<u8> {
243+
type Out = Self;
244+
245+
fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
246+
self.push_str(header);
247+
self.push_str(hunk);
248+
Ok(())
249+
}
250+
251+
fn finish(self) -> Self::Out {
252+
self
253+
}
254+
}
255+
}

gix-diff/tests/diff/blob/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub(crate) mod pipeline;
22
mod platform;
3+
mod unified_diff;

0 commit comments

Comments
 (0)