From cdbe8e84b8fd6c9bf5dd859bd210c7a32da6b699 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 16 Mar 2024 14:53:41 -0400 Subject: [PATCH 1/6] Extend ASCII fast paths of `char` methods beyond ASCII --- library/core/src/char/methods.rs | 33 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 87b328c912878..284a3eeb75dfb 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -777,8 +777,9 @@ impl char { #[inline] pub fn is_alphabetic(self) -> bool { match self { - 'a'..='z' | 'A'..='Z' => true, - c => c > '\x7f' && unicode::Alphabetic(c), + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self), } } @@ -819,7 +820,8 @@ impl char { pub const fn is_lowercase(self) -> bool { match self { 'a'..='z' => true, - c => c > '\x7f' && unicode::Lowercase(c), + '\0'..='\u{A9}' => false, + _ => unicode::Lowercase(self), } } @@ -860,7 +862,8 @@ impl char { pub const fn is_uppercase(self) -> bool { match self { 'A'..='Z' => true, - c => c > '\x7f' && unicode::Uppercase(c), + '\0'..='\u{BF}' => false, + _ => unicode::Uppercase(self), } } @@ -893,7 +896,8 @@ impl char { pub const fn is_whitespace(self) -> bool { match self { ' ' | '\x09'..='\x0d' => true, - c => c > '\x7f' && unicode::White_Space(c), + '\0'..='\u{84}' => false, + _ => unicode::White_Space(self), } } @@ -920,10 +924,10 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_alphanumeric(self) -> bool { - if self.is_ascii() { - self.is_ascii_alphanumeric() - } else { - unicode::Alphabetic(self) || unicode::N(self) + match self { + '0'..='9' | 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self) || unicode::N(self), } } @@ -969,7 +973,7 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - !self.is_ascii() && 
unicode::Grapheme_Extend(self) + self > '\u{02FF}' && unicode::Grapheme_Extend(self) } /// Returns `true` if this `char` has the `Cased` property. @@ -985,7 +989,11 @@ impl char { #[doc(hidden)] #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] pub fn is_cased(self) -> bool { - if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) } + match self { + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Cased(self), + } } /// Returns `true` if this `char` has the `Case_Ignorable` property. @@ -1047,7 +1055,8 @@ impl char { pub fn is_numeric(self) -> bool { match self { '0'..='9' => true, - c => c > '\x7f' && unicode::N(c), + '\0'..='\u{B1}' => false, + _ => unicode::N(self), } } From 51e3d88cda6a244fe36237844f6216457b10811e Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 16 Mar 2024 15:56:18 -0400 Subject: [PATCH 2/6] Add `char::is_cased` --- library/alloc/src/lib.rs | 1 + library/core/src/char/methods.rs | 15 +++++++++++++-- library/core/src/unicode/mod.rs | 4 ++-- library/coretests/tests/char.rs | 11 +++++++++++ library/coretests/tests/lib.rs | 1 + 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 3d94554281d44..66f7392787175 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -148,6 +148,7 @@ #![feature(slice_range)] #![feature(std_internals)] #![feature(temporary_niche_types)] +#![feature(titlecase)] #![feature(transmutability)] #![feature(trivial_clone)] #![feature(trusted_fused)] diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 284a3eeb75dfb..4c7be5c52ba29 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -977,6 +977,7 @@ impl char { } /// Returns `true` if this `char` has the `Cased` property. + /// A character is cased if and only if it is uppercase, lowercase, or titlecase. 
/// /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. @@ -984,10 +985,20 @@ impl char { /// [Unicode Standard]: https://www.unicode.org/versions/latest/ /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('A'.is_cased()); + /// assert!('a'.is_cased()); + /// assert!(!'京'.is_cased()); + /// ``` #[must_use] + #[unstable(feature = "titlecase", issue = "none")] #[inline] - #[doc(hidden)] - #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] pub fn is_cased(self) -> bool { match self { 'A'..='Z' | 'a'..='z' => true, diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index c71fa754e68fb..0b6a055ba167e 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -4,12 +4,12 @@ // for use in alloc, not re-exported in std. 
#[rustfmt::skip] -pub use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub use unicode_data::cased::lookup as Cased; pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; +pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; +pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::n::lookup as N; diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs index f0f6a24429284..8336a049c3d31 100644 --- a/library/coretests/tests/char.rs +++ b/library/coretests/tests/char.rs @@ -39,6 +39,17 @@ fn test_from_str() { assert!(char::from_str("abc").is_err()); } +#[test] +fn test_is_cased() { + assert!('a'.is_cased()); + assert!('ö'.is_cased()); + assert!('ß'.is_cased()); + assert!('Ü'.is_cased()); + assert!('P'.is_cased()); + assert!('ª'.is_cased()); + assert!(!'攂'.is_cased()); +} + #[test] fn test_is_lowercase() { assert!('a'.is_lowercase()); diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index b8702ee20cbb1..e9e03e1ee803b 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -109,6 +109,7 @@ #![feature(step_trait)] #![feature(str_internals)] #![feature(strict_provenance_lints)] +#![feature(titlecase)] #![feature(trusted_len)] #![feature(trusted_random_access)] #![feature(try_blocks)] From 6fffdc5a5e3bbd0df79af347515ec64eb1f9a1ad Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 16 Mar 2024 16:05:31 -0400 Subject: [PATCH 3/6] Add `char::is_titlecase` --- library/core/src/char/methods.rs | 33 ++++++++++++++++++++++++++++++++ library/coretests/tests/char.rs | 31 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 4c7be5c52ba29..5c9652e6e26a0 
100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -825,6 +825,39 @@ impl char { } } + /// Returns `true` if this `char` has the general category for titlecase letters. + /// + /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 + /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('Dž'.is_titlecase()); + /// assert!('ᾨ'.is_titlecase()); + /// assert!(!'D'.is_titlecase()); + /// assert!(!'z'.is_titlecase()); + /// assert!(!'中'.is_titlecase()); + /// assert!(!' '.is_titlecase()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_titlecase(self) -> bool { + match self { + '\0'..='\u{01C4}' => false, + _ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(), + } + } + /// Returns `true` if this `char` has the `Uppercase` property. 
/// /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs index 8336a049c3d31..20c6c675667e3 100644 --- a/library/coretests/tests/char.rs +++ b/library/coretests/tests/char.rs @@ -59,6 +59,17 @@ fn test_is_lowercase() { assert!(!'P'.is_lowercase()); } +#[test] +fn test_is_titlecase() { + assert!('Dž'.is_titlecase()); + assert!('ᾨ'.is_titlecase()); + assert!(!'h'.is_titlecase()); + assert!(!'ä'.is_titlecase()); + assert!(!'ß'.is_titlecase()); + assert!(!'Ö'.is_titlecase()); + assert!(!'T'.is_titlecase()); +} + #[test] fn test_is_uppercase() { assert!(!'h'.is_uppercase()); @@ -68,6 +79,26 @@ fn test_is_uppercase() { assert!('T'.is_uppercase()); } +#[test] +fn titlecase_fast_path() { + for c in '\0'..='\u{01C4}' { + assert!(!(c.is_cased() && !c.is_lowercase() && !c.is_uppercase())) + } +} + +#[test] +fn at_most_one_case() { + for c in '\0'..='\u{10FFFF}' { + assert_eq!( + !c.is_cased() as u8 + + c.is_lowercase() as u8 + + c.is_uppercase() as u8 + + c.is_titlecase() as u8, + 1 + ); + } +} + #[test] fn test_is_whitespace() { assert!(' '.is_whitespace()); From 9788f34e6a6aec59dd4634b82d21a37ddfb5b185 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 16 Mar 2024 16:33:43 -0400 Subject: [PATCH 4/6] Add `char::case` --- library/core/src/char/methods.rs | 95 +++++++++++++++++++++----------- library/core/src/char/mod.rs | 13 +++++ library/coretests/tests/char.rs | 15 ++++- 3 files changed, 91 insertions(+), 32 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 5c9652e6e26a0..eb46bb96582fd 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -783,6 +783,70 @@ impl char { } } + /// Returns `true` if this `char` has the `Cased` property. + /// A character is cased if and only if it is uppercase, lowercase, or titlecase. 
+ /// + /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('A'.is_cased()); + /// assert!('a'.is_cased()); + /// assert!(!'京'.is_cased()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_cased(self) -> bool { + match self { + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Cased(self), + } + } + + /// Returns the case of this character: + /// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`], + /// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`], + /// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and + /// `None` if [`!self.is_cased()`][`char::is_cased`]. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(titlecase)] + /// use core::char::CharCase; + /// assert_eq!('a'.case(), Some(CharCase::Lower)); + /// assert_eq!('δ'.case(), Some(CharCase::Lower)); + /// assert_eq!('A'.case(), Some(CharCase::Upper)); + /// assert_eq!('Δ'.case(), Some(CharCase::Upper)); + /// assert_eq!('Dž'.case(), Some(CharCase::Title)); + /// assert_eq!('中'.case(), None); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn case(self) -> Option { + match self { + 'A'..='Z' => Some(CharCase::Upper), + 'a'..='z' => Some(CharCase::Lower), + '\0'..='\u{A9}' => None, + _ if !unicode::Cased(self) => None, + _ if unicode::Lowercase(self) => Some(CharCase::Lower), + _ if unicode::Uppercase(self) => Some(CharCase::Upper), + _ => Some(CharCase::Title), + } + } + /// Returns `true` if this `char` has the `Lowercase` property. /// /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and @@ -1009,37 +1073,6 @@ impl char { self > '\u{02FF}' && unicode::Grapheme_Extend(self) } - /// Returns `true` if this `char` has the `Cased` property. - /// A character is cased if and only if it is uppercase, lowercase, or titlecase. - /// - /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
- /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ - /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// #![feature(titlecase)] - /// assert!('A'.is_cased()); - /// assert!('a'.is_cased()); - /// assert!(!'京'.is_cased()); - /// ``` - #[must_use] - #[unstable(feature = "titlecase", issue = "none")] - #[inline] - pub fn is_cased(self) -> bool { - match self { - 'A'..='Z' | 'a'..='z' => true, - '\0'..='\u{A9}' => false, - _ => unicode::Cased(self), - } - } - /// Returns `true` if this `char` has the `Case_Ignorable` property. /// /// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 82a3f6f916be3..4743847414dc1 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -603,3 +603,16 @@ impl fmt::Display for TryFromCharError { #[stable(feature = "u8_from_char", since = "1.59.0")] impl Error for TryFromCharError {} + +/// The case of a cased character, +/// as returned by [`char::case`]. +#[unstable(feature = "titlecase", issue = "none")] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum CharCase { + /// Lowercase. Corresponds to the `Lowercase` Unicode property. + Lower = 0b00, + /// Titlecase. Corresponds to the `Titlecase_Letter` Unicode general category. + Title = 0b10, + /// Uppercase. Corresponds to the `Uppercase` Unicode property. 
+ Upper = 0b11, +} diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs index 20c6c675667e3..aa20585953b7c 100644 --- a/library/coretests/tests/char.rs +++ b/library/coretests/tests/char.rs @@ -1,5 +1,6 @@ +use std::char::{self, CharCase}; +use std::str; use std::str::FromStr; -use std::{char, str}; #[test] fn test_convert() { @@ -50,6 +51,18 @@ fn test_is_cased() { assert!(!'攂'.is_cased()); } +#[test] +fn test_char_case() { + for c in '\0'..='\u{10FFFF}' { + match c.case() { + None => assert!(!c.is_cased()), + Some(CharCase::Lower) => assert!(c.is_lowercase()), + Some(CharCase::Upper) => assert!(c.is_uppercase()), + Some(CharCase::Title) => assert!(c.is_titlecase()), + } + } +} + #[test] fn test_is_lowercase() { assert!('a'.is_lowercase()); From 085e41c396bdf29d25832861679cc5cf81d09f5d Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 16 Mar 2024 23:01:24 -0400 Subject: [PATCH 5/6] Add `char::to_titlecase()` --- library/core/src/char/methods.rs | 106 +++++++++++++++++- library/core/src/char/mod.rs | 63 ++++++++--- library/core/src/unicode/unicode_data.rs | 66 +++++++++++ .../src/case_mapping.rs | 24 +++- src/tools/unicode-table-generator/src/main.rs | 48 ++++---- 5 files changed, 263 insertions(+), 44 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index eb46bb96582fd..d79ca25196865 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1196,7 +1196,7 @@ impl char { /// // convert into themselves. 
/// assert_eq!('山'.to_lowercase().to_string(), "山"); /// ``` - #[must_use = "this returns the lowercase character as a new iterator, \ + #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] @@ -1204,6 +1204,104 @@ impl char { ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) } + /// Returns an iterator that yields the titlecase mapping of this `char` as one or more + /// `char`s. + /// + /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. + /// + /// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. + /// + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields + /// the `char`(s) given by [`SpecialCasing.txt`]. + /// + /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt + /// + /// This operation performs an unconditional mapping without tailoring. That is, the conversion + /// is independent of context and language. + /// + /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in + /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. 
+ /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// #![feature(titlecase)] + /// for c in 'ß'.to_titlecase() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("{}", 'ß'.to_titlecase()); + /// ``` + /// + /// Both are equivalent to: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("Ss"); + /// ``` + /// + /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): + /// + /// ``` + /// #![feature(titlecase)] + /// assert_eq!('c'.to_titlecase().to_string(), "C"); + /// + /// // Sometimes the result is more than one character: + /// assert_eq!('ß'.to_titlecase().to_string(), "Ss"); + /// + /// // Characters that do not have separate cased forms + /// // convert into themselves. + /// assert_eq!('山'.to_titlecase().to_string(), "山"); + /// ``` + /// + /// # Note on locale + /// + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: + /// + /// * 'Dotless': I / ı, sometimes written ï + /// * 'Dotted': İ / i + /// + /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// ``` + /// + /// The value of `upper_i` here relies on the language of the text: if we're + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should + /// be `"İ"`. `to_titlecase()` does not take this into account, and so: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// + /// assert_eq!(upper_i, "I"); + /// ``` + /// + /// holds across languages. 
+ #[must_use = "this returns the titlecased character as a new iterator, \ + without modifying the original"] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn to_titlecase(self) -> ToTitlecase { + ToTitlecase(CaseMappingIter::new(conversions::to_title(self))) + } + /// Returns an iterator that yields the uppercase mapping of this `char` as one or more /// `char`s. /// @@ -1267,7 +1365,7 @@ impl char { /// /// # Note on locale /// - /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i @@ -1279,7 +1377,7 @@ impl char { /// ``` /// /// The value of `upper_i` here relies on the language of the text: if we're - /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"İ"`. `to_uppercase()` does not take this into account, and so: /// /// ``` @@ -1289,7 +1387,7 @@ impl char { /// ``` /// /// holds across languages. - #[must_use = "this returns the uppercase character as a new iterator, \ + #[must_use = "this returns the uppercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 4743847414dc1..bd6e245c90c93 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -363,13 +363,21 @@ impl fmt::Display for EscapeDebug { } macro_rules! 
casemappingiter_impls { - ($(#[$attr:meta])* $ITER_NAME:ident) => { + ( + #[$stab:meta] + #[$dendstab:meta] + #[$fusedstab:meta] + #[$exactstab:meta] + #[$displaystab:meta] + $(#[$attr:meta])* + $ITER_NAME:ident + ) => { $(#[$attr])* - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] #[derive(Debug, Clone)] pub struct $ITER_NAME(CaseMappingIter); - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] impl Iterator for $ITER_NAME { type Item = char; fn next(&mut self) -> Option { @@ -405,7 +413,7 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[$dendstab] impl DoubleEndedIterator for $ITER_NAME { fn next_back(&mut self) -> Option { self.0.next_back() @@ -423,10 +431,10 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "fused", since = "1.26.0")] + #[$fusedstab] impl FusedIterator for $ITER_NAME {} - #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[$exactstab] impl ExactSizeIterator for $ITER_NAME { fn len(&self) -> usize { self.0.len() @@ -453,7 +461,7 @@ macro_rules! casemappingiter_impls { #[unstable(feature = "std_internals", issue = "none")] unsafe impl TrustedRandomAccess for $ITER_NAME {} - #[stable(feature = "char_struct_display", since = "1.16.0")] + #[$displaystab] impl fmt::Display for $ITER_NAME { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -464,23 +472,48 @@ macro_rules! casemappingiter_impls { } casemappingiter_impls! { - /// Returns an iterator that yields the lowercase equivalent of a `char`. + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + /// Returns an iterator that yields the uppercase equivalent of a `char`. 
/// - /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_lowercase`]: char::to_lowercase - ToLowercase + /// [`to_uppercase`]: char::to_uppercase + ToUppercase } casemappingiter_impls! { - /// Returns an iterator that yields the uppercase equivalent of a `char`. + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + /// Returns an iterator that yields the titlecase equivalent of a `char`. /// - /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See + /// This `struct` is created by the [`to_titlecase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_uppercase`]: char::to_uppercase - ToUppercase + /// [`to_titlecase`]: char::to_titlecase + ToTitlecase +} + +casemappingiter_impls! { + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + /// Returns an iterator that yields the lowercase equivalent of a `char`. + /// + /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// its documentation for more. 
+ /// + /// [`to_lowercase`]: char::to_lowercase + ToLowercase } #[derive(Debug, Clone)] diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 429b60a68f439..fd8ca1275a4c4 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -796,6 +796,23 @@ pub mod conversions { } } + pub fn to_title(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + .binary_search_by(|&(key, _)| key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or(to_upper(c)) + } + } + static LOWERCASE_TABLE: &[(char, u32); 1462] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), @@ -1587,4 +1604,53 @@ pub mod conversions { ['\u{544}', '\u{53b}', '\u{0}'], ['\u{54e}', '\u{546}', '\u{0}'], ['\u{544}', '\u{53d}', '\u{0}'], ]; + + static TITLECASE_TABLE: &[(char, u32); 135] = &[ + ('\u{df}', 4194304), ('\u{1c4}', 453), ('\u{1c5}', 453), ('\u{1c6}', 453), + ('\u{1c7}', 456), ('\u{1c8}', 456), ('\u{1c9}', 456), ('\u{1ca}', 459), ('\u{1cb}', 459), + ('\u{1cc}', 459), ('\u{1f1}', 498), ('\u{1f2}', 498), ('\u{1f3}', 498), + ('\u{587}', 4194305), ('\u{10d0}', 4304), ('\u{10d1}', 4305), ('\u{10d2}', 4306), + ('\u{10d3}', 4307), ('\u{10d4}', 4308), ('\u{10d5}', 4309), ('\u{10d6}', 4310), + ('\u{10d7}', 4311), ('\u{10d8}', 4312), ('\u{10d9}', 4313), ('\u{10da}', 4314), + ('\u{10db}', 4315), ('\u{10dc}', 4316), ('\u{10dd}', 4317), ('\u{10de}', 4318), + ('\u{10df}', 4319), ('\u{10e0}', 4320), ('\u{10e1}', 4321), ('\u{10e2}', 4322), + ('\u{10e3}', 4323), ('\u{10e4}', 4324), ('\u{10e5}', 4325), 
('\u{10e6}', 4326), + ('\u{10e7}', 4327), ('\u{10e8}', 4328), ('\u{10e9}', 4329), ('\u{10ea}', 4330), + ('\u{10eb}', 4331), ('\u{10ec}', 4332), ('\u{10ed}', 4333), ('\u{10ee}', 4334), + ('\u{10ef}', 4335), ('\u{10f0}', 4336), ('\u{10f1}', 4337), ('\u{10f2}', 4338), + ('\u{10f3}', 4339), ('\u{10f4}', 4340), ('\u{10f5}', 4341), ('\u{10f6}', 4342), + ('\u{10f7}', 4343), ('\u{10f8}', 4344), ('\u{10f9}', 4345), ('\u{10fa}', 4346), + ('\u{10fd}', 4349), ('\u{10fe}', 4350), ('\u{10ff}', 4351), ('\u{1f80}', 8072), + ('\u{1f81}', 8073), ('\u{1f82}', 8074), ('\u{1f83}', 8075), ('\u{1f84}', 8076), + ('\u{1f85}', 8077), ('\u{1f86}', 8078), ('\u{1f87}', 8079), ('\u{1f88}', 8072), + ('\u{1f89}', 8073), ('\u{1f8a}', 8074), ('\u{1f8b}', 8075), ('\u{1f8c}', 8076), + ('\u{1f8d}', 8077), ('\u{1f8e}', 8078), ('\u{1f8f}', 8079), ('\u{1f90}', 8088), + ('\u{1f91}', 8089), ('\u{1f92}', 8090), ('\u{1f93}', 8091), ('\u{1f94}', 8092), + ('\u{1f95}', 8093), ('\u{1f96}', 8094), ('\u{1f97}', 8095), ('\u{1f98}', 8088), + ('\u{1f99}', 8089), ('\u{1f9a}', 8090), ('\u{1f9b}', 8091), ('\u{1f9c}', 8092), + ('\u{1f9d}', 8093), ('\u{1f9e}', 8094), ('\u{1f9f}', 8095), ('\u{1fa0}', 8104), + ('\u{1fa1}', 8105), ('\u{1fa2}', 8106), ('\u{1fa3}', 8107), ('\u{1fa4}', 8108), + ('\u{1fa5}', 8109), ('\u{1fa6}', 8110), ('\u{1fa7}', 8111), ('\u{1fa8}', 8104), + ('\u{1fa9}', 8105), ('\u{1faa}', 8106), ('\u{1fab}', 8107), ('\u{1fac}', 8108), + ('\u{1fad}', 8109), ('\u{1fae}', 8110), ('\u{1faf}', 8111), ('\u{1fb2}', 4194306), + ('\u{1fb3}', 8124), ('\u{1fb4}', 4194307), ('\u{1fb7}', 4194308), ('\u{1fbc}', 8124), + ('\u{1fc2}', 4194309), ('\u{1fc3}', 8140), ('\u{1fc4}', 4194310), ('\u{1fc7}', 4194311), + ('\u{1fcc}', 8140), ('\u{1ff2}', 4194312), ('\u{1ff3}', 8188), ('\u{1ff4}', 4194313), + ('\u{1ff7}', 4194314), ('\u{1ffc}', 8188), ('\u{fb00}', 4194315), ('\u{fb01}', 4194316), + ('\u{fb02}', 4194317), ('\u{fb03}', 4194318), ('\u{fb04}', 4194319), ('\u{fb05}', 4194320), + ('\u{fb06}', 4194321), ('\u{fb13}', 4194322), 
('\u{fb14}', 4194323), ('\u{fb15}', 4194324), + ('\u{fb16}', 4194325), ('\u{fb17}', 4194326), + ]; + + static TITLECASE_TABLE_MULTI: &[[char; 3]; 23] = &[ + ['S', 's', '\u{0}'], ['\u{535}', '\u{582}', '\u{0}'], ['\u{1fba}', '\u{345}', '\u{0}'], + ['\u{386}', '\u{345}', '\u{0}'], ['\u{391}', '\u{342}', '\u{345}'], + ['\u{1fca}', '\u{345}', '\u{0}'], ['\u{389}', '\u{345}', '\u{0}'], + ['\u{397}', '\u{342}', '\u{345}'], ['\u{1ffa}', '\u{345}', '\u{0}'], + ['\u{38f}', '\u{345}', '\u{0}'], ['\u{3a9}', '\u{342}', '\u{345}'], ['F', 'f', '\u{0}'], + ['F', 'i', '\u{0}'], ['F', 'l', '\u{0}'], ['F', 'f', 'i'], ['F', 'f', 'l'], + ['S', 't', '\u{0}'], ['S', 't', '\u{0}'], ['\u{544}', '\u{576}', '\u{0}'], + ['\u{544}', '\u{565}', '\u{0}'], ['\u{544}', '\u{56b}', '\u{0}'], + ['\u{54e}', '\u{576}', '\u{0}'], ['\u{544}', '\u{56d}', '\u{0}'], + ]; } diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 437e1e47dd706..86428a180208a 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,7 +6,7 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 3]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); @@ -18,7 +18,10 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) file.push_str("\n\n"); let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); file.push_str(&upper_tables); - (file, [lower_size, upper_size]) + file.push_str("\n\n"); + let (title_tables, title_size) = generate_tables("TITLE", &data.to_title); + file.push_str(&title_tables); + (file, [lower_size, upper_size, title_size]) } fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { @@ 
-119,4 +122,21 @@ pub fn to_upper(c: char) -> [char; 3] { .unwrap_or([c, '\0', '\0']) } } + +pub fn to_title(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + .binary_search_by(|&(key, _)| key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or(to_upper(c)) + } +} "; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index ded9205ffc4b9..5c848dd1c8822 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -99,32 +99,25 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, + /// Only stores mappings that are not to self to_upper: BTreeMap, + /// Only stores mappings that differ from `to_upper` + to_title: BTreeMap, + /// Only stores mappings that are not to self to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { - let mut a = None; - let mut b = None; - let mut c = None; - - for codepoint in codepoints { - if origin == codepoint.value() { - return None; - } - - if a.is_none() { - a = Some(codepoint.value()); - } else if b.is_none() { - b = Some(codepoint.value()); - } else if c.is_none() { - c = Some(codepoint.value()); - } else { - panic!("more than 3 mapped codepoints") - } +fn to_mapping( + if_different_from: &[ucd_parse::Codepoint], + codepoints: &[ucd_parse::Codepoint], +) -> Option<[u32; 3]> { + if codepoints == if_different_from { + return None; } - Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) + let mut ret = [ucd_parse::Codepoint::default(); 3]; + ret[0..codepoints.len()].copy_from_slice(codepoints); + Some(ret.map(ucd_parse::Codepoint::value)) } static 
UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -146,6 +139,7 @@ fn load_data() -> UnicodeData { let mut to_lower = BTreeMap::new(); let mut to_upper = BTreeMap::new(); + let mut to_title = BTreeMap::new(); for row in ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), ) { @@ -171,6 +165,11 @@ fn load_data() -> UnicodeData { { to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } + if let Some(mapped) = row.simple_titlecase_mapping + && Some(mapped) != row.simple_uppercase_mapping + { + to_title.insert(row.codepoint.value(), [mapped.value(), 0, 0]); + } } for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { @@ -180,12 +179,15 @@ fn load_data() -> UnicodeData { } let key = row.codepoint.value(); - if let Some(lower) = to_mapping(key, row.lowercase) { + if let Some(lower) = to_mapping(&[row.codepoint], &row.lowercase) { to_lower.insert(key, lower); } - if let Some(upper) = to_mapping(key, row.uppercase) { + if let Some(upper) = to_mapping(&[row.codepoint], &row.uppercase) { to_upper.insert(key, upper); } + if let Some(title) = to_mapping(&row.uppercase, &row.titlecase) { + to_title.insert(key, title); + } } let mut properties: Vec<(&'static str, Vec>)> = properties @@ -203,7 +205,7 @@ fn load_data() -> UnicodeData { .collect(); properties.sort_by_key(|p| p.0); - UnicodeData { ranges: properties, to_lower, to_upper } + UnicodeData { ranges: properties, to_lower, to_title, to_upper } } fn main() { From 108a15cb1220355e18419650699d79f060d4c58c Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 17 Mar 2024 19:44:55 -0400 Subject: [PATCH 6/6] Add `PartialEq` impls to the character case iterators Allows easily checking whether a string is in a particular case --- library/core/src/char/methods.rs | 38 +++++++++++++++++++++-- library/core/src/char/mod.rs | 52 ++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git 
a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index d79ca25196865..90f07470cd817 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1138,7 +1138,8 @@ impl char { } /// Returns an iterator that yields the lowercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. /// @@ -1196,6 +1197,13 @@ impl char { /// // convert into themselves. /// assert_eq!('山'.to_lowercase().to_string(), "山"); /// ``` + /// + /// Check if a string is in lowercase: + /// + /// ``` + /// let s = "abcde\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_lowercase() == c)); + /// ``` #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] @@ -1205,7 +1213,8 @@ impl char { } /// Returns an iterator that yields the titlecase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. 
/// @@ -1268,6 +1277,21 @@ impl char { /// assert_eq!('山'.to_titlecase().to_string(), "山"); /// ``` /// + /// Check if a word is in titlecase: + /// + /// ``` + /// #![feature(titlecase)] + /// let word = "Dross"; + /// let mut chars = word.chars(); + /// let first_cased_char = chars.find(|c| c.is_cased()); + /// let word_is_in_titlecase = if let Some(f) = first_cased_char { + /// f.to_titlecase() == f && chars.all(|c| c.to_lowercase() == c) + /// } else { + /// true + /// }; + /// assert!(word_is_in_titlecase); + /// ``` + /// /// # Note on locale /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: @@ -1303,7 +1327,8 @@ impl char { } /// Returns an iterator that yields the uppercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. /// @@ -1363,6 +1388,13 @@ impl char { /// assert_eq!('山'.to_uppercase().to_string(), "山"); /// ``` /// + /// Check if a string is in uppercase: + /// + /// ``` + /// let s = "ABCDE\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_uppercase() == c)); + /// ``` + /// /// # Note on locale /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index bd6e245c90c93..2deae0ea82b9f 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -369,6 +369,7 @@ macro_rules! casemappingiter_impls { #[$fusedstab:meta] #[$exactstab:meta] #[$displaystab:meta] + #[$partialstab:meta] $(#[$attr:meta])* $ITER_NAME:ident ) => { @@ -468,6 +469,38 @@ macro_rules! 
casemappingiter_impls { fmt::Display::fmt(&self.0, f) } } + + #[$partialstab] + impl PartialEq<ToUppercase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToUppercase) -> bool { + self.0 == other.0 + } + } + + #[unstable(feature = "titlecase", issue = "none")] + impl PartialEq<ToTitlecase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToTitlecase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq<ToLowercase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToLowercase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq<char> for $ITER_NAME { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other + } + } } } @@ -477,6 +510,7 @@ casemappingiter_impls! { #[stable(feature = "fused", since = "1.26.0")] #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] /// Returns an iterator that yields the uppercase equivalent of a `char`. /// /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See @@ -492,6 +526,7 @@ casemappingiter_impls! { #[unstable(feature = "titlecase", issue = "none")] #[unstable(feature = "titlecase", issue = "none")] #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] /// Returns an iterator that yields the titlecase equivalent of a `char`. /// /// This `struct` is created by the [`to_titlecase`] method on [`char`]. See @@ -507,6 +542,7 @@ casemappingiter_impls! { #[stable(feature = "fused", since = "1.26.0")] #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] /// Returns an iterator that yields the lowercase equivalent of a `char`. /// /// This `struct` is created by the [`to_lowercase`] method on [`char`].
See @@ -622,6 +658,22 @@ impl fmt::Display for CaseMappingIter { } } +impl PartialEq for CaseMappingIter { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.0.as_slice() == other.0.as_slice() + } +} + +impl Eq for CaseMappingIter {} + +impl PartialEq<char> for CaseMappingIter { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0.as_slice() == &[*other] + } +} + /// The error type returned when a checked char conversion fails. #[stable(feature = "u8_from_char", since = "1.59.0")] #[derive(Debug, Copy, Clone, PartialEq, Eq)]