diff --git a/CMakeLists.txt b/CMakeLists.txt
index a281332bb..125beb5a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,7 @@ endif()
 
 if(SOURCEMETA_CORE_REGEX)
   find_package(BoostRegex REQUIRED)
+  find_package(ICU REQUIRED)
   add_subdirectory(src/core/regex)
 endif()
 
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 000000000..e90dbd7ef
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,63 @@
+# Locate ICU and define the ICU::uc, ICU::i18n, and ICU::data imported targets.
+#
+# The hints below are listed in priority order: the ICU_ROOT CMake variable,
+# the ICU_ROOT environment variable, and then the usual Homebrew prefixes. They
+# are only hints, so the default CMake search locations are still consulted.
+if(NOT ICU_FOUND)
+  set(ICU_SEARCH_HINTS
+    ${ICU_ROOT}
+    $ENV{ICU_ROOT}
+    "/opt/homebrew/opt/icu4c"
+    "/usr/local/opt/icu4c")
+
+  find_path(ICU_INCLUDE_DIR
+    NAMES unicode/unistr.h
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES include)
+
+  find_library(ICU_UC_LIBRARY
+    NAMES icuuc
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib)
+
+  find_library(ICU_I18N_LIBRARY
+    NAMES icui18n
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib)
+
+  find_library(ICU_DATA_LIBRARY
+    NAMES icudata
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib)
+
+  if(ICU_INCLUDE_DIR AND ICU_UC_LIBRARY AND ICU_I18N_LIBRARY AND ICU_DATA_LIBRARY)
+    set(ICU_FOUND ON)
+    set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+    set(ICU_LIBRARIES ${ICU_UC_LIBRARY} ${ICU_I18N_LIBRARY} ${ICU_DATA_LIBRARY})
+
+    if(NOT TARGET ICU::uc)
+      add_library(ICU::uc UNKNOWN IMPORTED)
+      set_target_properties(ICU::uc PROPERTIES
+        IMPORTED_LOCATION "${ICU_UC_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
+    endif()
+
+    if(NOT TARGET ICU::i18n)
+      add_library(ICU::i18n UNKNOWN IMPORTED)
+      set_target_properties(ICU::i18n PROPERTIES
+        IMPORTED_LOCATION "${ICU_I18N_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
+    endif()
+
+    if(NOT TARGET ICU::data)
+      add_library(ICU::data UNKNOWN IMPORTED)
+      set_target_properties(ICU::data PROPERTIES
+        IMPORTED_LOCATION "${ICU_DATA_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
+    endif()
+
+    message(STATUS "ICU found")
+  else()
+    message(FATAL_ERROR "ICU not found. Please install ICU or set ICU_ROOT.")
+  endif()
+endif()
diff --git a/src/core/regex/CMakeLists.txt b/src/core/regex/CMakeLists.txt
index ef0e01f2d..3bd40885d 100644
--- a/src/core/regex/CMakeLists.txt
+++ b/src/core/regex/CMakeLists.txt
@@ -4,4 +4,4 @@ if(SOURCEMETA_CORE_INSTALL)
   sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME regex)
 endif()
 
-target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex)
+target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex ICU::uc ICU::i18n ICU::data)
diff --git a/src/core/regex/include/sourcemeta/core/regex.h b/src/core/regex/include/sourcemeta/core/regex.h
index 10fc44ac2..aa4ef91de 100644
--- a/src/core/regex/include/sourcemeta/core/regex.h
+++ b/src/core/regex/include/sourcemeta/core/regex.h
@@ -12,19 +12,23 @@
 #pragma GCC diagnostic ignored "-Wconversion"
 #endif
 #include
+#include
+#include
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #elif defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif
 
-#include // assert
-#include // std::uint8_t, std::uint64_t
-#include // std::optional
-#include // std::regex
-#include // std::stoull
-#include // std::pair
-#include // std::variant
+#include // std::ranges::any_of
+#include // assert
+#include // std::uint8_t, std::uint64_t
+#include // std::shared_ptr, std::unique_ptr
+#include // std::optional
+#include // std::regex
+#include // std::stoull
+#include // std::pair
+#include // std::variant
 
 /// @defgroup regex Regex
 /// @brief An opinionated regex ECMA 262 implementation for JSON Schema
@@ -59,12 +63,15 @@ struct RegexTypeNoop {
   auto operator==(const RegexTypeNoop &) const noexcept -> bool = default;
 };
 
+/// @ingroup regex
+using RegexTypeICU = std::shared_ptr;
+
 /// @ingroup regex
 template
-using Regex =
-    std::variant, RegexTypePrefix,
-                 RegexTypeNonEmpty, RegexTypeRange,
-                 RegexTypeStd, RegexTypeNoop>;
+using Regex = std::variant,
+                           RegexTypePrefix, RegexTypeNonEmpty,
+                           RegexTypeRange, RegexTypeStd,
+                           RegexTypeICU, RegexTypeNoop>;
 #if !defined(DOXYGEN)
 // For fast internal dispatching. It must stay in sync with the variant above
 enum class RegexIndex : std::uint8_t {
@@ -73,10 +80,41 @@ enum class RegexIndex : std::uint8_t {
   NonEmpty,
   Range,
   Std,
+  ICU,
   Noop
 };
 #endif
 
+/// @ingroup regex
+///
+/// Heuristically detect if a regular expression pattern seems to require
+/// Unicode support. Patterns appear to require Unicode support if they contain
+/// Unicode property escapes (`\p{}`), Unicode codepoint escapes (`\u` or
+/// `\u{}`), non-ASCII characters, or the dot metacharacter (which should match
+/// Unicode codepoints). This is a best-effort heuristic detection and may have
+/// false positives in edge cases. For example:
+///
+/// ```cpp
+/// #include
+/// #include
+///
+/// assert(sourcemeta::core::seems_unicode("\\p{Letter}"));
+/// assert(sourcemeta::core::seems_unicode("\\u0041"));
+/// assert(sourcemeta::core::seems_unicode("café"));
+/// assert(sourcemeta::core::seems_unicode(".+"));
+/// assert(!sourcemeta::core::seems_unicode("^[a-z]+$"));
+/// ```
+template auto seems_unicode(const T &pattern) -> bool {
+  // A plain `\u` search already covers the braced `\u{...}` form
+  return pattern.find("\\p{") != T::npos ||
+         pattern.find("\\u") != T::npos || pattern.find('.') != T::npos ||
+         std::ranges::any_of(pattern, [](const auto character) {
+           // Code units outside of the 7-bit range are non-ASCII. The
+           // mask works for both signed and unsigned character types
+           return (character & ~0x7F) != 0;
+         });
+}
+
 /// @ingroup regex
 ///
 /// Compile a regular expression from a string. If the regular expression is
@@ -119,6 +157,35 @@ auto to_regex(const T &pattern) -> std::optional> {
     return RegexTypeRange{minimum, maximum};
   }
 
+  if (seems_unicode(pattern)) {
+    T icu_compatible_pattern{pattern};
+    std::size_t position{0};
+    // ICU uses \x{} syntax for Unicode codepoint escapes, while ECMAScript
+    // uses \u{}. Convert the pattern to ICU-compatible syntax.
+    while ((position = icu_compatible_pattern.find("\\u{", position)) !=
+           T::npos) {
+      icu_compatible_pattern.replace(position, 3, "\\x{");
+      position += 3;
+    }
+
+    UErrorCode status{U_ZERO_ERROR};
+    UParseError parse_error;
+    icu::UnicodeString icu_pattern{
+        icu::UnicodeString::fromUTF8(icu_compatible_pattern)};
+
+    // Own the compiled pattern right away so that it is released on every
+    // exit path without a manual `delete`
+    std::unique_ptr<icu::RegexPattern> compiled_pattern{
+        icu::RegexPattern::compile(icu_pattern, UREGEX_DOTALL, parse_error,
+                                   status)};
+
+    if (U_FAILURE(status) || !compiled_pattern) {
+      return std::nullopt;
+    }
+
+    return RegexTypeICU{std::move(compiled_pattern)};
+  }
+
   RegexTypeBoost result{
       pattern,
       boost::regex::no_except |
@@ -192,6 +259,29 @@ auto matches(const Regex<T> &regex, const T &value) -> bool {
     case RegexIndex::Std:
       return std::regex_search(
           value, *std::get_if<RegexTypeStd<T>>(&regex));
+    case RegexIndex::ICU: {
+      const auto *icu_regex{std::get_if<RegexTypeICU>(&regex)};
+      if (!icu_regex || !(*icu_regex)) {
+        return false;
+      }
+
+      // The matcher keeps a reference to the input, which must outlive it
+      icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8(value)};
+      UErrorCode status{U_ZERO_ERROR};
+      std::unique_ptr<icu::RegexMatcher> matcher{
+          (*icu_regex)->matcher(icu_input, status)};
+
+      if (U_FAILURE(status) || !matcher) {
+        return false;
+      }
+
+      auto result{matcher->find(status)};
+      if (U_FAILURE(status)) {
+        return false;
+      }
+
+      return result;
+    }
     case RegexIndex::Noop:
       return true;
   }
diff --git a/test/regex/regex_matches_test.cc b/test/regex/regex_matches_test.cc
index 0f1037058..e74130087 100644
--- a/test/regex/regex_matches_test.cc
+++ b/test/regex/regex_matches_test.cc
@@ -159,3 +159,141 @@ TEST(Regex_matches, match_false_6) {
   EXPECT_TRUE(regex.has_value());
   EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "bar"));
 }
+
+TEST(Regex_matches, unicode_range_arabic_indic_digit) {
+  const auto regex{
+      sourcemeta::core::to_regex("[\\u0660-\\u0669]")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "\u0660"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "\u0669"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "0"));
+}
+
+TEST(Regex_matches, unicode_range_4byte_deseret) {
+  const auto regex{
+      sourcemeta::core::to_regex("[\\u{10400}-\\u{1044F}]")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U00010400"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U0001044F"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "A"));
+}
+
+TEST(Regex_matches, unicode_property_letter) {
+  const auto regex{sourcemeta::core::to_regex(
+      "^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "hello"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "hello123"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "Àlement"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "中文"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U00010400test"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "123hello"));
+}
+
+TEST(Regex_matches, unicode_property_exclude_digit) {
+  const auto regex{sourcemeta::core::to_regex(
+      "^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}\\-_.]*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "element123"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "element٠"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "٠element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "0element"));
+}
+
+TEST(Regex_matches, unicode_dot_matches_codepoint) {
+  const auto regex{sourcemeta::core::to_regex("^.$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "A"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "À"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "中"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U00010400"));
+}
+
+TEST(Regex_matches, unicode_quantifier_on_codepoints) {
+  const auto regex{sourcemeta::core::to_regex("^.{3}$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "ABC"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "ÀÁÂ"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "中文字"));
+  EXPECT_TRUE(sourcemeta::core::matches(
+      regex.value(), "\U00010400\U00010401\U00010402"));
+}
+
+TEST(Regex_matches, digit_ascii_only) {
+  // \d should only match ASCII digits 0-9, not Unicode digits
+  const auto regex{sourcemeta::core::to_regex("^\\d$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "0"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "\u07C0"));
+}
+
+TEST(Regex_matches, word_ascii_only) {
+  // \w should only match ASCII [a-zA-Z0-9_], not Unicode letters
+  const auto regex{sourcemeta::core::to_regex("^\\w$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "a"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "é"));
+}
+
+TEST(Regex_matches, nonbmp_emoji_quantifier) {
+  // Dragon emoji (U+1F432) is 4-byte UTF-8
+  // Pattern ^🐲*$ should match zero or more dragon emojis
+  // This tests that quantifiers work on codepoints, not bytes
+  const auto regex{sourcemeta::core::to_regex("^\U0001F432*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), ""));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U0001F432"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(),
+                                        "\U0001F432\U0001F432"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "\U0001F409"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(), "D"));
+}
+
+TEST(Regex_matches, nonbmp_literal_match) {
+  // Test literal matching of 4-byte UTF-8 characters
+  const auto regex{sourcemeta::core::to_regex("^\U0001F432$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "\U0001F432"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "\U0001F409"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(),
+                                         "\U0001F432\U0001F432"));
+}
+
+TEST(Regex_matches, xml_ncname_simplified) {
+  const auto regex{sourcemeta::core::to_regex(
+      "^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
+  EXPECT_TRUE(regex.has_value());
+
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "element"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "_element"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "element123"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(), "élément"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches(regex.value(), "element٠"));
+  EXPECT_TRUE(sourcemeta::core::matches(regex.value(),
+                                        "\U00010400element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), ":element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "0element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches(regex.value(), "٠element"));
+  EXPECT_FALSE(sourcemeta::core::matches(regex.value(),
+                                         "\U000104A0element"));
+}