1212#pragma GCC diagnostic ignored "-Wconversion"
1313#endif
1414#include < boost/regex.hpp>
15+ #include < unicode/regex.h>
16+ #include < unicode/unistr.h>
1517#if defined(__clang__)
1618#pragma clang diagnostic pop
1719#elif defined(__GNUC__)
1820#pragma GCC diagnostic pop
1921#endif
2022
21- #include < cassert> // assert
22- #include < cstdint> // std::uint8_t, std::uint64_t
23- #include < optional> // std::optional
24- #include < regex> // std::regex
25- #include < string> // std::stoull
26- #include < utility> // std::pair
27- #include < variant> // std::variant
23+ #include < algorithm> // std::ranges::any_of
24+ #include < cassert> // assert
25+ #include < cstdint> // std::uint8_t, std::uint64_t
26+ #include < memory> // std::shared_ptr, std::unique_ptr
27+ #include < optional> // std::optional
28+ #include < regex> // std::regex
29+ #include < string> // std::stoull
30+ #include < utility> // std::pair
31+ #include < variant> // std::variant
2832
2933// / @defgroup regex Regex
3034// / @brief An opinionated regex ECMA 262 implementation for JSON Schema
@@ -59,12 +63,15 @@ struct RegexTypeNoop {
5963 auto operator ==(const RegexTypeNoop &) const noexcept -> bool = default ;
6064};
6165
66+ // / @ingroup regex
/// Shared-ownership handle to a compiled ICU regular expression pattern
/// (`icu::RegexPattern`). In the code visible in this file, `to_regex` wraps
/// the pointer returned by `icu::RegexPattern::compile` in this type, and
/// `matches` creates a fresh `icu::RegexMatcher` from it on every call.
67+ using RegexTypeICU = std::shared_ptr<icu::RegexPattern>;
68+
6269// / @ingroup regex
6370template <typename T>
64- using Regex =
65- std::variant<RegexTypeBoost< typename T::value_type>, RegexTypePrefix<T>,
66- RegexTypeNonEmpty, RegexTypeRange,
67- RegexTypeStd< typename T::value_type> , RegexTypeNoop>;
71+ using Regex = std::variant<RegexTypeBoost< typename T::value_type>,
72+ RegexTypePrefix<T>, RegexTypeNonEmpty ,
73+ RegexTypeRange, RegexTypeStd< typename T::value_type> ,
74+ RegexTypeICU , RegexTypeNoop>;
6875#if !defined(DOXYGEN)
6976// For fast internal dispatching. It must stay in sync with the variant above
7077enum class RegexIndex : std::uint8_t {
@@ -73,10 +80,41 @@ enum class RegexIndex : std::uint8_t {
7380 NonEmpty,
7481 Range,
7582 Std,
83+ ICU,
7684 Noop
7785};
7886#endif
7987
/// @ingroup regex
///
/// Heuristically detect whether a regular expression pattern seems to require
/// Unicode support. A pattern is flagged when it contains any of:
///
/// - A Unicode property escape, positive or negated (`\p{...}` or `\P{...}`)
/// - A Unicode codepoint escape (`\uXXXX` or `\u{...}`)
/// - A non-ASCII code unit (any value above 127)
/// - The dot metacharacter (which should match Unicode codepoints)
///
/// This is a best-effort textual scan that does not parse the pattern, so it
/// may report false positives in edge cases (for example an escaped dot such
/// as `\.`, or an escaped backslash that happens to be followed by `u`). For
/// example:
///
/// ```cpp
/// #include <sourcemeta/core/regex.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::seems_unicode("\\p{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\P{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\u0041"));
/// assert(sourcemeta::core::seems_unicode("café"));
/// assert(sourcemeta::core::seems_unicode(".+"));
/// assert(!sourcemeta::core::seems_unicode("^[a-z]+$"));
/// ```
///
/// @param pattern The regular expression source text to inspect
/// @return `true` if any of the markers above is present, `false` otherwise
///         (including for an empty pattern)
template <typename T> auto seems_unicode(const T &pattern) -> bool {
  const auto size{pattern.size()};
  // A single pass over the pattern is enough to spot every marker
  for (typename T::size_type index{0}; index < size; ++index) {
    const auto character{pattern[index]};

    // The dot metacharacter and any non-ASCII code unit are flagged as-is
    if (character == '.' || static_cast<unsigned char>(character) > 127) {
      return true;
    }

    // Everything else of interest is a backslash escape with a following
    // character, so skip ordinary characters and a trailing lone backslash
    if (character != '\\' || index + 1 >= size) {
      continue;
    }

    const auto escaped{pattern[index + 1]};

    // `\u` covers both the `\uXXXX` and the `\u{...}` forms
    if (escaped == 'u') {
      return true;
    }

    // `\p{` and its negated counterpart `\P{` are Unicode property escapes
    if ((escaped == 'p' || escaped == 'P') && index + 2 < size &&
        pattern[index + 2] == '{') {
      return true;
    }
  }

  return false;
}
117+
80118// / @ingroup regex
81119// /
82120// / Compile a regular expression from a string. If the regular expression is
@@ -119,6 +157,35 @@ auto to_regex(const T &pattern) -> std::optional<Regex<T>> {
119157 return RegexTypeRange{minimum, maximum};
120158 }
121159
160+ if (seems_unicode (pattern)) {
161+ T icu_compatible_pattern{pattern};
162+ std::size_t position{0 };
163+ // ICU uses \x{} syntax for Unicode codepoint escapes, while ECMAScript
164+ // uses \u{}. Convert the pattern to ICU-compatible syntax.
165+ while ((position = icu_compatible_pattern.find (" \\ u{" , position)) !=
166+ T::npos) {
167+ icu_compatible_pattern.replace (position, 3 , " \\ x{" );
168+ position += 3 ;
169+ }
170+
171+ UErrorCode status{U_ZERO_ERROR};
172+ UParseError parse_error;
173+ icu::UnicodeString icu_pattern{
174+ icu::UnicodeString::fromUTF8 (icu_compatible_pattern)};
175+
176+ auto *regex_pattern{icu::RegexPattern::compile (icu_pattern, UREGEX_DOTALL,
177+ parse_error, status)};
178+
179+ if (U_FAILURE (status) || regex_pattern == nullptr ) {
180+ if (regex_pattern != nullptr ) {
181+ delete regex_pattern;
182+ }
183+ return std::nullopt ;
184+ }
185+
186+ return std::shared_ptr<icu::RegexPattern>(regex_pattern);
187+ }
188+
122189 RegexTypeBoost<typename T::value_type> result{
123190 pattern,
124191 boost::regex::no_except |
@@ -192,6 +259,29 @@ auto matches(const Regex<T> ®ex, const T &value) -> bool {
192259 case RegexIndex::Std:
193260 return std::regex_search (
194261 value, *std::get_if<RegexTypeStd<typename T::value_type>>(®ex));
262+ case RegexIndex::ICU: {
263+ const auto *icu_regex{std::get_if<RegexTypeICU>(®ex)};
264+ if (!icu_regex || !(*icu_regex)) {
265+ return false ;
266+ }
267+
268+ icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8 (value)};
269+ UErrorCode status{U_ZERO_ERROR};
270+ std::unique_ptr<icu::RegexMatcher> matcher{
271+ (*icu_regex)->matcher (icu_input, status)};
272+
273+ if (U_FAILURE (status) || !matcher) {
274+ return false ;
275+ }
276+
277+ auto result{matcher->find (status)};
278+
279+ if (U_FAILURE (status)) {
280+ return false ;
281+ }
282+
283+ return result;
284+ }
195285 case RegexIndex::Noop:
196286 return true ;
197287 }
0 commit comments