Handle formatter flags in byte OsStr Display

thaliaarchi · thaliaarchi · commit bf5f172a738c · 2025-02-26T11:00:22.000-08:00
The Display implementation for `OsStr` and `Path` on non-Windows systems (the byte-oriented version) only handles formatter flags when the entire string is valid UTF-8. As most paths are valid UTF-8, the common case is formatted like `str`; however, flags are ignored when they contain an invalid UTF-8 sequence. Implement its Display with the same logic as that of `str`. Additionally, it forwards flags when formatting the last chunk of valid UTF-8, so it erroneously inserts padding within the string when it has invalid UTF-8. Fix this. Fixes #136617 for non-Windows systems.
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
@@ -244,8 +244,10 @@ impl fmt::Debug for Utf8Chunks<'_> {
 /// sequences. When the current sequence is invalid, the maximal prefix of a
 /// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
 /// a valid Unicode scalar value.
+#[doc(hidden)]
+#[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
+pub fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
     const TAG_CONT_U8: u8 = 128;
     #[inline]
     fn peek(bytes: &slice::Iter<'_, u8>) -> u8 {
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -10,6 +10,7 @@ mod converts;
 mod count;
 mod error;
 mod iter;
+mod lossy;
 mod traits;
 mod validations;
 
@@ -21,7 +22,6 @@ use crate::{ascii, mem};
 
 pub mod pattern;
 
-mod lossy;
 #[unstable(feature = "str_from_raw_parts", issue = "119206")]
 pub use converts::{from_raw_parts, from_raw_parts_mut};
 #[stable(feature = "rust1", since = "1.0.0")]
@@ -52,6 +52,8 @@ pub use iter::{Matches, RMatches};
 pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use iter::{RSplitN, SplitN};
+#[unstable(feature = "str_internals", issue = "none")]
+pub use lossy::advance_utf8;
 #[stable(feature = "utf8_chunks", since = "1.79.0")]
 pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
diff --git a/library/std/src/ffi/os_str/tests.rs b/library/std/src/ffi/os_str/tests.rs
@@ -111,6 +111,19 @@ fn display() {
     assert_eq!(format!("a{:^10}e", os_string.display()), "a   bcd    e");
 }
 
+#[cfg(unix)]
+#[test]
+fn display_invalid_utf8_unix() {
+    use crate::os::unix::ffi::OsStringExt;
+
+    let os_string = OsString::from_vec(b"b\xFFd".to_vec());
+    assert_eq!(format!("a{:^10}e", os_string.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a   b�d    e");
+    let os_string = OsString::from_vec(b"b\xE1\xBAd".to_vec());
+    assert_eq!(format!("a{:^10}e", os_string.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a   b�d    e");
+}
+
 #[cfg(windows)]
 #[test]
 fn display_invalid_wtf8_windows() {
diff --git a/library/std/src/sys/os_str/bytes.rs b/library/std/src/sys/os_str/bytes.rs
@@ -2,6 +2,7 @@
 //! systems: just a `Vec<u8>`/`[u8]`.
 
 use core::clone::CloneToUninit;
+use core::str::advance_utf8;
 
 use crate::borrow::Cow;
 use crate::collections::TryReserveError;
@@ -64,25 +65,37 @@ impl fmt::Debug for Slice {
 
 impl fmt::Display for Slice {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        // If we're the empty string then our iterator won't actually yield
-        // anything, so perform the formatting manually
-        if self.inner.is_empty() {
-            return "".fmt(f);
+        // Corresponds to `Formatter::pad`, but for `OsStr` instead of `str`.
+
+        // Make sure there's a fast path up front.
+        if f.options().get_width().is_none() && f.options().get_precision().is_none() {
+            return self.write_lossy(f);
         }
 
-        for chunk in self.inner.utf8_chunks() {
-            let valid = chunk.valid();
-            // If we successfully decoded the whole chunk as a valid string then
-            // we can return a direct formatting of the string which will also
-            // respect various formatting flags if possible.
-            if chunk.invalid().is_empty() {
-                return valid.fmt(f);
-            }
+        // The `precision` field can be interpreted as a maximum width for the
+        // string being formatted.
+        let max_char_count = f.options().get_precision().unwrap_or(usize::MAX);
+        let (truncated, char_count) = truncate_chars(&self.inner, max_char_count);
+
+        // If our string is longer than the maximum width, truncate it and
+        // handle other flags in terms of the truncated string.
+        // SAFETY: The truncation splits at Unicode scalar value boundaries.
+        let s = unsafe { Slice::from_encoded_bytes_unchecked(truncated) };
 
-            f.write_str(valid)?;
-            f.write_char(char::REPLACEMENT_CHARACTER)?;
+        // The `width` field is more of a minimum width parameter at this point.
+        if let Some(width) = f.options().get_width()
+            && char_count < width
+        {
+            // If we're under the minimum width, then fill up the minimum width
+            // with the specified string + some alignment.
+            let post_padding = f.padding(width - char_count, fmt::Alignment::Left)?;
+            s.write_lossy(f)?;
+            post_padding.write(f)
+        } else {
+            // If we're over the minimum width or there is no minimum width, we
+            // can just emit the string.
+            s.write_lossy(f)
         }
-        Ok(())
     }
 }
 
@@ -297,6 +310,18 @@ impl Slice {
         String::from_utf8_lossy(&self.inner)
     }
 
+    /// Writes the string as lossy UTF-8 like [`String::from_utf8_lossy`].
+    /// It ignores formatter flags.
+    fn write_lossy(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        for chunk in self.inner.utf8_chunks() {
+            f.write_str(chunk.valid())?;
+            if !chunk.invalid().is_empty() {
+                f.write_char(char::REPLACEMENT_CHARACTER)?;
+            }
+        }
+        Ok(())
+    }
+
     #[inline]
     pub fn to_owned(&self) -> Buf {
         Buf { inner: self.inner.to_vec() }
@@ -371,3 +396,19 @@ unsafe impl CloneToUninit for Slice {
         unsafe { self.inner.clone_to_uninit(dst) }
     }
 }
+
+/// Counts the number of Unicode scalar values in the byte string, allowing
+/// invalid UTF-8 sequences. For invalid sequences, the maximal prefix of a
+/// valid UTF-8 code unit counts as one. Only up to `max_chars` scalar values
+/// are scanned. Returns the character count and the byte length.
+fn truncate_chars(bytes: &[u8], max_chars: usize) -> (&[u8], usize) {
+    let mut iter = bytes.iter();
+    let mut char_count = 0;
+    while !iter.is_empty() && char_count < max_chars {
+        advance_utf8(&mut iter);
+        char_count += 1;
+    }
+    let byte_len = bytes.len() - iter.len();
+    let truncated = unsafe { bytes.get_unchecked(..byte_len) };
+    (truncated, char_count)
+}
diff --git a/library/std/tests/path.rs b/library/std/tests/path.rs
@@ -1822,6 +1822,20 @@ fn display_format_flags() {
     assert_eq!(format!("a{:^10}e", Path::new("bcd").display()), "a   bcd    e");
 }
 
+#[cfg(unix)]
+#[test]
+fn display_invalid_utf8_unix() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+
+    let path_buf = PathBuf::from(OsString::from_vec(b"b\xFFd".to_vec()));
+    assert_eq!(format!("a{:^10}e", path_buf.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a   b�d    e");
+    let path_buf = PathBuf::from(OsString::from_vec(b"b\xE1\xBAd".to_vec()));
+    assert_eq!(format!("a{:^10}e", path_buf.display()), "a   b�d    e");
+    assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a   b�d    e");
+}
+
 #[cfg(windows)]
 #[test]
 fn display_invalid_wtf8_windows() {