Skip to content

Commit f465d27

Browse files
committed
Handle formatter flags in byte OsStr Display
The Display implementation for `OsStr` and `Path` on non-Windows systems (the byte-oriented version) only handles formatter flags when the entire string is valid UTF-8. As most paths are valid UTF-8, the common case is formatted like `str`; however, flags are ignored when they contain an invalid UTF-8 sequence. Implement its Display with the same logic as that of `str`. Additionally, it forwards flags when formatting the last chunk of valid UTF-8, so it erroneously inserts padding within the string when it has invalid UTF-8. Fix this. Fixes #136617 for non-Windows systems.
1 parent 1bced94 commit f465d27

File tree

5 files changed

+89
-17
lines changed

5 files changed

+89
-17
lines changed

library/core/src/str/lossy.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,10 @@ impl fmt::Debug for Utf8Chunks<'_> {
244244
/// sequences. When the current sequence is invalid, the maximal prefix of a
245245
/// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
246246
/// a valid Unicode scalar value.
247+
#[doc(hidden)]
248+
#[unstable(feature = "str_internals", issue = "none")]
247249
#[inline]
248-
fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
250+
pub fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
249251
const TAG_CONT_U8: u8 = 128;
250252
#[inline]
251253
fn peek(bytes: &slice::Iter<'_, u8>) -> u8 {

library/core/src/str/mod.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ mod converts;
1010
mod count;
1111
mod error;
1212
mod iter;
13+
mod lossy;
1314
mod traits;
1415
mod validations;
1516

@@ -21,7 +22,6 @@ use crate::{ascii, mem};
2122

2223
pub mod pattern;
2324

24-
mod lossy;
2525
#[unstable(feature = "str_from_raw_parts", issue = "119206")]
2626
pub use converts::{from_raw_parts, from_raw_parts_mut};
2727
#[stable(feature = "rust1", since = "1.0.0")]
@@ -52,6 +52,8 @@ pub use iter::{Matches, RMatches};
5252
pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator};
5353
#[stable(feature = "rust1", since = "1.0.0")]
5454
pub use iter::{RSplitN, SplitN};
55+
#[unstable(feature = "str_internals", issue = "none")]
56+
pub use lossy::advance_utf8;
5557
#[stable(feature = "utf8_chunks", since = "1.79.0")]
5658
pub use lossy::{Utf8Chunk, Utf8Chunks};
5759
#[stable(feature = "rust1", since = "1.0.0")]

library/std/src/ffi/os_str/tests.rs

+13
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,19 @@ fn display() {
111111
assert_eq!(format!("a{:^10}e", os_string.display()), "a bcd e");
112112
}
113113

114+
#[cfg(unix)]
115+
#[test]
116+
fn display_invalid_utf8_unix() {
117+
use crate::os::unix::ffi::OsStringExt;
118+
119+
let os_string = OsString::from_vec(b"b\xFFd".to_vec());
120+
assert_eq!(format!("a{:^10}e", os_string.display()), "a b�d e");
121+
assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a b�d e");
122+
let os_string = OsString::from_vec(b"b\xE1\xBAd".to_vec());
123+
assert_eq!(format!("a{:^10}e", os_string.display()), "a b�d e");
124+
assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a b�d e");
125+
}
126+
114127
#[cfg(windows)]
115128
#[test]
116129
fn display_invalid_wtf8_windows() {

library/std/src/sys/os_str/bytes.rs

+56-15
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
//! systems: just a `Vec<u8>`/`[u8]`.
33
44
use core::clone::CloneToUninit;
5+
use core::str::advance_utf8;
56

67
use crate::borrow::Cow;
78
use crate::collections::TryReserveError;
@@ -64,25 +65,37 @@ impl fmt::Debug for Slice {
6465

6566
impl fmt::Display for Slice {
6667
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
67-
// If we're the empty string then our iterator won't actually yield
68-
// anything, so perform the formatting manually
69-
if self.inner.is_empty() {
70-
return "".fmt(f);
68+
// Corresponds to `Formatter::pad`, but for `OsStr` instead of `str`.
69+
70+
// Make sure there's a fast path up front.
71+
if f.options().get_width().is_none() && f.options().get_precision().is_none() {
72+
return self.write_lossy(f);
7173
}
7274

73-
for chunk in self.inner.utf8_chunks() {
74-
let valid = chunk.valid();
75-
// If we successfully decoded the whole chunk as a valid string then
76-
// we can return a direct formatting of the string which will also
77-
// respect various formatting flags if possible.
78-
if chunk.invalid().is_empty() {
79-
return valid.fmt(f);
80-
}
75+
// The `precision` field can be interpreted as a maximum width for the
76+
// string being formatted.
77+
let max_char_count = f.options().get_precision().unwrap_or(u16::MAX);
78+
let (truncated, char_count) = truncate_chars(&self.inner, max_char_count as usize);
79+
80+
// If our string is longer than the maximum width, truncate it and
81+
// handle other flags in terms of the truncated string.
82+
// SAFETY: The truncation splits at Unicode scalar value boundaries.
83+
let s = unsafe { Slice::from_encoded_bytes_unchecked(truncated) };
8184

82-
f.write_str(valid)?;
83-
f.write_char(char::REPLACEMENT_CHARACTER)?;
85+
// The `width` field is more of a minimum width parameter at this point.
86+
if let Some(width) = f.options().get_width()
87+
&& char_count < width as usize
88+
{
89+
// If we're under the minimum width, then fill up the minimum width
90+
// with the specified string + some alignment.
91+
let post_padding = f.padding(width - char_count as u16, fmt::Alignment::Left)?;
92+
s.write_lossy(f)?;
93+
post_padding.write(f)
94+
} else {
95+
// If we're over the minimum width or there is no minimum width, we
96+
// can just emit the string.
97+
s.write_lossy(f)
8498
}
85-
Ok(())
8699
}
87100
}
88101

@@ -302,6 +315,18 @@ impl Slice {
302315
String::from_utf8_lossy(&self.inner)
303316
}
304317

318+
/// Writes the string as lossy UTF-8 like [`String::from_utf8_lossy`].
319+
/// It ignores formatter flags.
320+
fn write_lossy(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
321+
for chunk in self.inner.utf8_chunks() {
322+
f.write_str(chunk.valid())?;
323+
if !chunk.invalid().is_empty() {
324+
f.write_char(char::REPLACEMENT_CHARACTER)?;
325+
}
326+
}
327+
Ok(())
328+
}
329+
305330
#[inline]
306331
pub fn to_owned(&self) -> Buf {
307332
Buf { inner: self.inner.to_vec() }
@@ -376,3 +401,19 @@ unsafe impl CloneToUninit for Slice {
376401
unsafe { self.inner.clone_to_uninit(dst) }
377402
}
378403
}
404+
405+
/// Counts the number of Unicode scalar values in the byte string, allowing
406+
/// invalid UTF-8 sequences. For invalid sequences, the maximal prefix of a
407+
/// valid UTF-8 code unit counts as one. Only up to `max_chars` scalar values
408+
/// are scanned. Returns the character count and the byte length.
409+
fn truncate_chars(bytes: &[u8], max_chars: usize) -> (&[u8], usize) {
410+
let mut iter = bytes.iter();
411+
let mut char_count = 0;
412+
while !iter.is_empty() && char_count < max_chars {
413+
advance_utf8(&mut iter);
414+
char_count += 1;
415+
}
416+
let byte_len = bytes.len() - iter.len();
417+
let truncated = unsafe { bytes.get_unchecked(..byte_len) };
418+
(truncated, char_count)
419+
}

library/std/tests/path.rs

+14
Original file line numberDiff line numberDiff line change
@@ -1822,6 +1822,20 @@ fn display_format_flags() {
18221822
assert_eq!(format!("a{:^10}e", Path::new("bcd").display()), "a bcd e");
18231823
}
18241824

1825+
#[cfg(unix)]
1826+
#[test]
1827+
fn display_invalid_utf8_unix() {
1828+
use std::ffi::OsString;
1829+
use std::os::unix::ffi::OsStringExt;
1830+
1831+
let path_buf = PathBuf::from(OsString::from_vec(b"b\xFFd".to_vec()));
1832+
assert_eq!(format!("a{:^10}e", path_buf.display()), "a b�d e");
1833+
assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a b�d e");
1834+
let path_buf = PathBuf::from(OsString::from_vec(b"b\xE1\xBAd".to_vec()));
1835+
assert_eq!(format!("a{:^10}e", path_buf.display()), "a b�d e");
1836+
assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a b�d e");
1837+
}
1838+
18251839
#[cfg(windows)]
18261840
#[test]
18271841
fn display_invalid_wtf8_windows() {

0 commit comments

Comments
 (0)