sourcemeta · jviotti · Oct 29, 2025 · Oct 29, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -82,6 +82,7 @@ endif()
 
 if(SOURCEMETA_CORE_REGEX)
   find_package(BoostRegex REQUIRED)
+  find_package(ICU REQUIRED)
   add_subdirectory(src/core/regex)
 endif()
 

diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
@@ -0,0 +1,89 @@
+# Locate ICU (International Components for Unicode).
+#
+# Search order:
+#   1. The ICU_ROOT CMake variable, if set
+#   2. The ICU_ROOT environment variable, if set
+#   3. The Homebrew prefixes of the keg-only `icu4c` formula
+#   4. The default CMake search locations
+#
+# Result variables: ICU_FOUND, ICU_INCLUDE_DIRS, ICU_LIBRARIES
+# Imported targets: ICU::uc, ICU::i18n, ICU::data
+
+if(NOT ICU_FOUND)
+  # Extend the hints instead of overwriting a caller-provided ICU_ROOT
+  set(ICU_SEARCH_HINTS "")
+  if(ICU_ROOT)
+    list(APPEND ICU_SEARCH_HINTS "${ICU_ROOT}")
+  endif()
+  if(NOT "$ENV{ICU_ROOT}" STREQUAL "")
+    list(APPEND ICU_SEARCH_HINTS "$ENV{ICU_ROOT}")
+  endif()
+  list(APPEND ICU_SEARCH_HINTS
+    "/opt/homebrew/opt/icu4c"
+    "/usr/local/opt/icu4c")
+
+  find_path(ICU_INCLUDE_DIR
+    NAMES unicode/unistr.h
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES include)
+
+  find_library(ICU_UC_LIBRARY
+    NAMES icuuc
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib lib64)
+
+  # Windows builds of ICU name this library `icuin`
+  find_library(ICU_I18N_LIBRARY
+    NAMES icui18n icuin
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib lib64)
+
+  # Windows builds of ICU name this library `icudt`
+  find_library(ICU_DATA_LIBRARY
+    NAMES icudata icudt
+    HINTS ${ICU_SEARCH_HINTS}
+    PATH_SUFFIXES lib lib64)
+
+  unset(ICU_SEARCH_HINTS)
+
+  # Sets ICU_FOUND and honours the REQUIRED/QUIET options of find_package()
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(ICU
+    REQUIRED_VARS
+      ICU_INCLUDE_DIR ICU_UC_LIBRARY ICU_I18N_LIBRARY ICU_DATA_LIBRARY
+    FAIL_MESSAGE "ICU not found. Please install ICU or set ICU_ROOT.")
+  mark_as_advanced(
+    ICU_INCLUDE_DIR ICU_UC_LIBRARY ICU_I18N_LIBRARY ICU_DATA_LIBRARY)
+
+  if(ICU_FOUND)
+    set(ICU_INCLUDE_DIRS "${ICU_INCLUDE_DIR}")
+    # Dependents first, so that static linking resolves every symbol
+    set(ICU_LIBRARIES
+      "${ICU_I18N_LIBRARY}" "${ICU_UC_LIBRARY}" "${ICU_DATA_LIBRARY}")
+
+    if(NOT TARGET ICU::data)
+      add_library(ICU::data UNKNOWN IMPORTED)
+      set_target_properties(ICU::data PROPERTIES
+        IMPORTED_LOCATION "${ICU_DATA_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
+    endif()
+
+    # The common library depends on the data library
+    if(NOT TARGET ICU::uc)
+      add_library(ICU::uc UNKNOWN IMPORTED)
+      set_target_properties(ICU::uc PROPERTIES
+        IMPORTED_LOCATION "${ICU_UC_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "ICU::data")
+    endif()
+
+    # The i18n library depends on the common library
+    if(NOT TARGET ICU::i18n)
+      add_library(ICU::i18n UNKNOWN IMPORTED)
+      set_target_properties(ICU::i18n PROPERTIES
+        IMPORTED_LOCATION "${ICU_I18N_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "ICU::uc")
+    endif()
+  endif()
+endif()
diff --git a/src/core/regex/CMakeLists.txt b/src/core/regex/CMakeLists.txt
@@ -4,4 +4,4 @@ if(SOURCEMETA_CORE_INSTALL)
   sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME regex)
 endif()
 
-target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex)
+target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex ICU::i18n ICU::uc ICU::data)
diff --git a/src/core/regex/include/sourcemeta/core/regex.h b/src/core/regex/include/sourcemeta/core/regex.h
@@ -12,19 +12,23 @@
 #pragma GCC diagnostic ignored "-Wconversion"
 #endif
 #include <boost/regex.hpp>
+#include <unicode/regex.h>
+#include <unicode/unistr.h>
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #elif defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif
 
-#include <cassert>  // assert
-#include <cstdint>  // std::uint8_t, std::uint64_t
-#include <optional> // std::optional
-#include <regex>    // std::regex
-#include <string>   // std::stoull
-#include <utility>  // std::pair
-#include <variant>  // std::variant
+#include <algorithm> // std::ranges::any_of
+#include <cassert>   // assert
+#include <cstdint>   // std::uint8_t, std::uint64_t
+#include <memory>    // std::shared_ptr, std::unique_ptr
+#include <optional>  // std::optional
+#include <regex>     // std::regex
+#include <string>    // std::stoull
+#include <utility>   // std::pair
+#include <variant>   // std::variant
 
 /// @defgroup regex Regex
 /// @brief An opinionated regex ECMA 262 implementation for JSON Schema
@@ -59,12 +63,15 @@ struct RegexTypeNoop {
   auto operator==(const RegexTypeNoop &) const noexcept -> bool = default;
 };
 
+/// @ingroup regex
+using RegexTypeICU = std::shared_ptr<icu::RegexPattern>;
+
 /// @ingroup regex
 template <typename T>
-using Regex =
-    std::variant<RegexTypeBoost<typename T::value_type>, RegexTypePrefix<T>,
-                 RegexTypeNonEmpty, RegexTypeRange,
-                 RegexTypeStd<typename T::value_type>, RegexTypeNoop>;
+using Regex = std::variant<RegexTypeBoost<typename T::value_type>,
+                           RegexTypePrefix<T>, RegexTypeNonEmpty,
+                           RegexTypeRange, RegexTypeStd<typename T::value_type>,
+                           RegexTypeICU, RegexTypeNoop>;
 #if !defined(DOXYGEN)
 // For fast internal dispatching. It must stay in sync with the variant above
 enum class RegexIndex : std::uint8_t {
@@ -73,10 +80,41 @@ enum class RegexIndex : std::uint8_t {
   NonEmpty,
   Range,
   Std,
+  ICU,
   Noop
 };
 #endif
 
+/// @ingroup regex
+///
+/// Heuristically detect if a regular expression pattern seems to require
+/// Unicode support. Patterns appear to require Unicode support if they contain
+/// Unicode property escapes (`\p{}`), Unicode codepoint escapes (`\u` or
+/// `\u{}`), non-ASCII characters, or the dot metacharacter (which should match
+/// Unicode codepoints). This is a best-effort heuristic detection and may have
+/// false positives in edge cases. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/regex.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::seems_unicode("\\p{Letter}"));
+/// assert(sourcemeta::core::seems_unicode("\\u0041"));
+/// assert(sourcemeta::core::seems_unicode("café"));
+/// assert(sourcemeta::core::seems_unicode(".+"));
+/// assert(!sourcemeta::core::seems_unicode("^[a-z]+$"));
+/// ```
+template <typename T> auto seems_unicode(const T &pattern) -> bool {
+  // The braced codepoint escape shares its prefix with the 4-digit form
+  const bool has_escape{pattern.find("\\p{") != T::npos ||
+                        pattern.find("\\u") != T::npos};
+  const auto non_ascii{[](const auto character) {
+    return static_cast<unsigned char>(character) > 127;
+  }};
+  return has_escape || std::ranges::any_of(pattern, non_ascii) ||
+         pattern.find(".") != T::npos;
+}
+
 /// @ingroup regex
 ///
 /// Compile a regular expression from a string. If the regular expression is
@@ -119,6 +157,35 @@ auto to_regex(const T &pattern) -> std::optional<Regex<T>> {
     return RegexTypeRange{minimum, maximum};
   }
 
+  if (seems_unicode(pattern)) {
+    T icu_compatible_pattern{pattern};
+    std::size_t position{0};
+    // ICU uses \x{} syntax for Unicode codepoint escapes, while ECMAScript
+    // uses \u{}. Convert the pattern to ICU-compatible syntax.
+    while ((position = icu_compatible_pattern.find("\\u{", position)) !=
+           T::npos) {
+      icu_compatible_pattern.replace(position, 3, "\\x{");
+      position += 3;
+    }
+
+    UErrorCode status{U_ZERO_ERROR};
+    UParseError parse_error;
+    icu::UnicodeString icu_pattern{
+        icu::UnicodeString::fromUTF8(icu_compatible_pattern)};
+
+    auto *regex_pattern{icu::RegexPattern::compile(icu_pattern, UREGEX_DOTALL,
+                                                   parse_error, status)};
+
+    if (U_FAILURE(status) || regex_pattern == nullptr) {
+      if (regex_pattern != nullptr) {
+        delete regex_pattern;
+      }
+      return std::nullopt;
+    }
+
+    return std::shared_ptr<icu::RegexPattern>(regex_pattern);
+  }
+
   RegexTypeBoost<typename T::value_type> result{
       pattern,
       boost::regex::no_except |
@@ -192,6 +259,29 @@ auto matches(const Regex<T> &regex, const T &value) -> bool {
     case RegexIndex::Std:
       return std::regex_search(
           value, *std::get_if<RegexTypeStd<typename T::value_type>>(&regex));
+    case RegexIndex::ICU: {
+      const auto *icu_regex{std::get_if<RegexTypeICU>(&regex)};
+      if (!icu_regex || !(*icu_regex)) {
+        return false;
+      }
+
+      icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8(value)};
+      UErrorCode status{U_ZERO_ERROR};
+      std::unique_ptr<icu::RegexMatcher> matcher{
+          (*icu_regex)->matcher(icu_input, status)};
+
+      if (U_FAILURE(status) || !matcher) {
+        return false;
+      }
+
+      auto result{matcher->find(status)};
+
+      if (U_FAILURE(status)) {
+        return false;
+      }
+
+      return result;
+    }
     case RegexIndex::Noop:
       return true;
   }

diff --git a/test/regex/regex_matches_test.cc b/test/regex/regex_matches_test.cc
@@ -159,3 +159,141 @@ TEST(Regex_matches, match_false_6) {
   EXPECT_TRUE(regex.has_value());
   EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "bar"));
 }
+
+TEST(Regex_matches, unicode_range_arabic_indic_digit) {
+  const auto regex{
+      sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "\u0660"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "\u0669"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "0"));
+}
+
+TEST(Regex_matches, unicode_range_4byte_deseret) {
+  const auto regex{
+      sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U00010400"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U0001044F"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "A"));
+}
+
+TEST(Regex_matches, unicode_property_letter) {
+  const auto regex{sourcemeta::core::to_regex<std::string>(
+      "^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "hello"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "hello123"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "Àlement"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "中文"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U00010400test"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "123hello"));
+}
+
+TEST(Regex_matches, unicode_property_exclude_digit) {
+  const auto regex{sourcemeta::core::to_regex<std::string>(
+      "^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}-_.]*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "element123"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "element٠"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "٠element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "0element"));
+}
+
+TEST(Regex_matches, unicode_dot_matches_codepoint) {
+  const auto regex{sourcemeta::core::to_regex<std::string>("^.$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "A"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "À"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "中"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U00010400"));
+}
+
+TEST(Regex_matches, unicode_quantifier_on_codepoints) {
+  const auto regex{sourcemeta::core::to_regex<std::string>("^.{3}$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "ABC"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "ÀÁÂ"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "中文字"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(
+      regex.value(), "\U00010400\U00010401\U00010402"));
+}
+
+TEST(Regex_matches, digit_ascii_only) {
+  // \d must match ASCII 0-9 only, so U+07C0 (NKO DIGIT ZERO) is rejected
+  const auto regex{sourcemeta::core::to_regex<std::string>("^\\d$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "0"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "\u07C0"));
+}
+
+TEST(Regex_matches, word_ascii_only) {
+  // \w must match ASCII [a-zA-Z0-9_] only, so the non-ASCII "é" is rejected
+  const auto regex{sourcemeta::core::to_regex<std::string>("^\\w$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "a"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "é"));
+}
+
+TEST(Regex_matches, nonbmp_emoji_quantifier) {
+  // Dragon emoji (U+1F432) is 4-byte UTF-8
+  // Pattern ^🐲*$ should match zero or more dragon emojis
+  // This tests that quantifiers work on codepoints, not bytes
+  const auto regex{sourcemeta::core::to_regex<std::string>("^\U0001F432*$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), ""));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U0001F432"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
+                                                     "\U0001F432\U0001F432"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U0001F409"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "D"));
+}
+
+TEST(Regex_matches, nonbmp_literal_match) {
+  // Test literal matching of 4-byte UTF-8 characters
+  const auto regex{sourcemeta::core::to_regex<std::string>("^\U0001F432$")};
+  EXPECT_TRUE(regex.has_value());
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U0001F432"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "\U0001F409"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(),
+                                                      "\U0001F432\U0001F432"));
+}
+
+TEST(Regex_matches, xml_ncname_simplified) {
+  const auto regex{sourcemeta::core::to_regex<std::string>(
+      "^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
+  EXPECT_TRUE(regex.has_value());
+
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "element"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "_element"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "element123"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "élément"));
+  EXPECT_TRUE(
+      sourcemeta::core::matches<std::string>(regex.value(), "element٠"));
+  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
+                                                     "\U00010400element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), ":element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "0element"));
+  EXPECT_FALSE(
+      sourcemeta::core::matches<std::string>(regex.value(), "٠element"));
+  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(),
+                                                      "\U000104A0element"));
+}