Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ endif()

# Regex component: both Boost.Regex and ICU are mandatory (REQUIRED aborts
# the configure step when either is missing). The packages are resolved
# before descending into src/core/regex so that the imported targets
# already exist when that directory is processed.
if(SOURCEMETA_CORE_REGEX)
find_package(BoostRegex REQUIRED)
find_package(ICU REQUIRED)
add_subdirectory(src/core/regex)
endif()

Expand Down
54 changes: 54 additions & 0 deletions cmake/FindICU.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# FindICU
# -------
#
# Locate the ICU headers together with the `uc`, `i18n` and `data` libraries.
#
# Hints:
#   ICU_ROOT - installation prefix to search first. Honoured both as a CMake
#              variable (-DICU_ROOT=...) and as an environment variable.
#              On macOS the Homebrew `icu4c` prefixes are tried as well.
#              The regular CMake search paths are consulted afterwards.
#
# Result variables:
#   ICU_FOUND        - whether every required piece was located
#   ICU_INCLUDE_DIRS - directory containing `unicode/unistr.h`
#   ICU_LIBRARIES    - the uc, i18n and data libraries
#
# Imported targets:
#   ICU::uc, ICU::i18n, ICU::data
#
# NOTE(review): CMake itself ships a FindICU module that supports
# `COMPONENTS uc i18n data`; confirm this custom module is still needed.
if(NOT ICU_FOUND)
  # Collect candidate prefixes. These are only hints: a user-provided
  # ICU_ROOT is never overwritten, and the default search paths stay enabled
  # so that system-wide installations are found too.
  set(icu_search_hints "")
  if(ICU_ROOT)
    list(APPEND icu_search_hints "${ICU_ROOT}")
  endif()
  if(NOT "$ENV{ICU_ROOT}" STREQUAL "")
    list(APPEND icu_search_hints "$ENV{ICU_ROOT}")
  endif()
  if(APPLE)
    list(APPEND icu_search_hints
      "/opt/homebrew/opt/icu4c"
      "/usr/local/opt/icu4c")
  endif()

  find_path(ICU_INCLUDE_DIR
    NAMES unicode/unistr.h
    HINTS ${icu_search_hints}
    PATH_SUFFIXES include)

  find_library(ICU_UC_LIBRARY
    NAMES icuuc
    HINTS ${icu_search_hints}
    PATH_SUFFIXES lib lib64)

  # `icuin` and `icudt` are the names these libraries carry on Windows
  find_library(ICU_I18N_LIBRARY
    NAMES icui18n icuin
    HINTS ${icu_search_hints}
    PATH_SUFFIXES lib lib64)

  find_library(ICU_DATA_LIBRARY
    NAMES icudata icudt
    HINTS ${icu_search_hints}
    PATH_SUFFIXES lib lib64)

  # Sets ICU_FOUND and reports the outcome while respecting the REQUIRED and
  # QUIET arguments of the calling find_package(), instead of always aborting.
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(ICU
    REQUIRED_VARS
      ICU_UC_LIBRARY
      ICU_I18N_LIBRARY
      ICU_DATA_LIBRARY
      ICU_INCLUDE_DIR
    FAIL_MESSAGE
      "ICU not found. Please install ICU or set ICU_ROOT.")

  if(ICU_FOUND)
    set(ICU_INCLUDE_DIRS "${ICU_INCLUDE_DIR}")
    set(ICU_LIBRARIES
      "${ICU_UC_LIBRARY}"
      "${ICU_I18N_LIBRARY}"
      "${ICU_DATA_LIBRARY}")

    # Targets are declared bottom-up (data, then uc, then i18n) so that each
    # one can name the library it depends on. This keeps the link order right
    # for static builds even if a consumer only links ICU::i18n.
    if(NOT TARGET ICU::data)
      add_library(ICU::data UNKNOWN IMPORTED)
      set_target_properties(ICU::data PROPERTIES
        IMPORTED_LOCATION "${ICU_DATA_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
    endif()

    if(NOT TARGET ICU::uc)
      add_library(ICU::uc UNKNOWN IMPORTED)
      set_target_properties(ICU::uc PROPERTIES
        IMPORTED_LOCATION "${ICU_UC_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES ICU::data)
    endif()

    if(NOT TARGET ICU::i18n)
      add_library(ICU::i18n UNKNOWN IMPORTED)
      set_target_properties(ICU::i18n PROPERTIES
        IMPORTED_LOCATION "${ICU_I18N_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES ICU::uc)
    endif()
  endif()

  mark_as_advanced(
    ICU_INCLUDE_DIR
    ICU_UC_LIBRARY
    ICU_I18N_LIBRARY
    ICU_DATA_LIBRARY)
  unset(icu_search_hints)
endif()
2 changes: 1 addition & 1 deletion src/core/regex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ if(SOURCEMETA_CORE_INSTALL)
sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME regex)
endif()

target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex)
# Every dependency is attached with INTERFACE visibility, so the Boost.Regex
# and ICU usage requirements are applied to whatever links against
# sourcemeta_core_regex rather than to the target's own build.
target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex ICU::uc ICU::i18n ICU::data)
112 changes: 101 additions & 11 deletions src/core/regex/include/sourcemeta/core/regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,23 @@
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <boost/regex.hpp>
#include <unicode/regex.h>
#include <unicode/unistr.h>
#if defined(__clang__)
#pragma clang diagnostic pop
#elif defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

#include <cassert> // assert
#include <cstdint> // std::uint8_t, std::uint64_t
#include <optional> // std::optional
#include <regex> // std::regex
#include <string> // std::stoull
#include <utility> // std::pair
#include <variant> // std::variant
#include <algorithm> // std::ranges::any_of
#include <cassert> // assert
#include <cstdint> // std::uint8_t, std::uint64_t
#include <memory> // std::shared_ptr, std::unique_ptr
#include <optional> // std::optional
#include <regex> // std::regex
#include <string> // std::stoull
#include <utility> // std::pair
#include <variant> // std::variant

/// @defgroup regex Regex
/// @brief An opinionated regex ECMA 262 implementation for JSON Schema
Expand Down Expand Up @@ -59,12 +63,15 @@ struct RegexTypeNoop {
auto operator==(const RegexTypeNoop &) const noexcept -> bool = default;
};

/// @ingroup regex
///
/// A regular expression handled by ICU. The `icu::RegexPattern` is owned
/// through a `std::shared_ptr`, so copying a value of this type shares the
/// same underlying pattern object instead of duplicating it.
using RegexTypeICU = std::shared_ptr<icu::RegexPattern>;

/// @ingroup regex
template <typename T>
using Regex =
std::variant<RegexTypeBoost<typename T::value_type>, RegexTypePrefix<T>,
RegexTypeNonEmpty, RegexTypeRange,
RegexTypeStd<typename T::value_type>, RegexTypeNoop>;
using Regex = std::variant<RegexTypeBoost<typename T::value_type>,
RegexTypePrefix<T>, RegexTypeNonEmpty,
RegexTypeRange, RegexTypeStd<typename T::value_type>,
RegexTypeICU, RegexTypeNoop>;
#if !defined(DOXYGEN)
// For fast internal dispatching. It must stay in sync with the variant above
enum class RegexIndex : std::uint8_t {
Expand All @@ -73,10 +80,41 @@ enum class RegexIndex : std::uint8_t {
NonEmpty,
Range,
Std,
ICU,
Noop
};
#endif

/// @ingroup regex
///
/// Heuristically detect if a regular expression pattern seems to require
/// Unicode support. A pattern is flagged when it contains any of:
///
/// - A Unicode property escape, either positive (`\p{}`) or negated (`\P{}`)
/// - A Unicode codepoint escape (`\u`, which also covers the `\u{}` form)
/// - The dot metacharacter (which should match Unicode codepoints)
/// - A non-ASCII character (any code unit above 127)
///
/// This is a best-effort textual scan and not a parser, so it may have false
/// positives in edge cases. For instance, an escaped dot (`\.`) or a dot
/// inside of a character class is still reported. For example:
///
/// ```cpp
/// #include <sourcemeta/core/regex.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::seems_unicode("\\p{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\P{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\u0041"));
/// assert(sourcemeta::core::seems_unicode("café"));
/// assert(sourcemeta::core::seems_unicode(".+"));
/// assert(!sourcemeta::core::seems_unicode("^[a-z]+$"));
/// ```
template <typename T> auto seems_unicode(const T &pattern) -> bool {
  // Cheap substring probes first. Searching for `\u` alone is enough, as
  // every `\u{...}` escape necessarily starts with it
  if (pattern.find("\\p{") != T::npos || pattern.find("\\P{") != T::npos ||
      pattern.find("\\u") != T::npos || pattern.find(".") != T::npos) {
    return true;
  }

  // Otherwise, any code unit outside of the ASCII range means that the
  // pattern itself embeds non-ASCII text
  return std::ranges::any_of(pattern, [](const auto character) {
    return static_cast<unsigned char>(character) > 127;
  });
}

/// @ingroup regex
///
/// Compile a regular expression from a string. If the regular expression is
Expand Down Expand Up @@ -119,6 +157,35 @@ auto to_regex(const T &pattern) -> std::optional<Regex<T>> {
return RegexTypeRange{minimum, maximum};
}

if (seems_unicode(pattern)) {
T icu_compatible_pattern{pattern};
std::size_t position{0};
// ICU uses \x{} syntax for Unicode codepoint escapes, while ECMAScript
// uses \u{}. Convert the pattern to ICU-compatible syntax.
while ((position = icu_compatible_pattern.find("\\u{", position)) !=
T::npos) {
icu_compatible_pattern.replace(position, 3, "\\x{");
position += 3;
}

UErrorCode status{U_ZERO_ERROR};
UParseError parse_error;
icu::UnicodeString icu_pattern{
icu::UnicodeString::fromUTF8(icu_compatible_pattern)};

auto *regex_pattern{icu::RegexPattern::compile(icu_pattern, UREGEX_DOTALL,
parse_error, status)};

if (U_FAILURE(status) || regex_pattern == nullptr) {
if (regex_pattern != nullptr) {
delete regex_pattern;
}
return std::nullopt;
}

return std::shared_ptr<icu::RegexPattern>(regex_pattern);
}

RegexTypeBoost<typename T::value_type> result{
pattern,
boost::regex::no_except |
Expand Down Expand Up @@ -192,6 +259,29 @@ auto matches(const Regex<T> &regex, const T &value) -> bool {
case RegexIndex::Std:
return std::regex_search(
value, *std::get_if<RegexTypeStd<typename T::value_type>>(&regex));
case RegexIndex::ICU: {
const auto *icu_regex{std::get_if<RegexTypeICU>(&regex)};
if (!icu_regex || !(*icu_regex)) {
return false;
}

icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8(value)};
UErrorCode status{U_ZERO_ERROR};
std::unique_ptr<icu::RegexMatcher> matcher{
(*icu_regex)->matcher(icu_input, status)};

if (U_FAILURE(status) || !matcher) {
return false;
}

auto result{matcher->find(status)};

if (U_FAILURE(status)) {
return false;
}

return result;
}
case RegexIndex::Noop:
return true;
}
Expand Down
138 changes: 138 additions & 0 deletions test/regex/regex_matches_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,141 @@ TEST(Regex_matches, match_false_6) {
EXPECT_TRUE(regex.has_value());
EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "bar"));
}

TEST(Regex_matches, unicode_range_arabic_indic_digit) {
  // Character class whose bounds are written as four-digit `\u` escapes
  const auto compiled{
      sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Both ends of the range are accepted, the ASCII zero is not
  EXPECT_TRUE(accepts("\u0660"));
  EXPECT_TRUE(accepts("\u0669"));
  EXPECT_FALSE(accepts("0"));
}

TEST(Regex_matches, unicode_range_4byte_deseret) {
  // Range bounds given as braced `\u{...}` escapes above U+FFFF
  const auto compiled{
      sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // First and last codepoints of the range match, plain ASCII does not
  EXPECT_TRUE(accepts("\U00010400"));
  EXPECT_TRUE(accepts("\U0001044F"));
  EXPECT_FALSE(accepts("A"));
}

TEST(Regex_matches, unicode_property_letter) {
  // A letter followed by any amount of letters or numbers, using long-form
  // Unicode property names
  const auto compiled{sourcemeta::core::to_regex<std::string>(
      "^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Inputs that begin with a letter, in ASCII and beyond
  EXPECT_TRUE(accepts("hello"));
  EXPECT_TRUE(accepts("hello123"));
  EXPECT_TRUE(accepts("Àlement"));
  EXPECT_TRUE(accepts("中文"));
  EXPECT_TRUE(accepts("\U00010400test"));

  // A leading number is rejected
  EXPECT_FALSE(accepts("123hello"));
}

TEST(Regex_matches, unicode_property_exclude_digit) {
  // A negative lookahead on `\p{Number}` guards the first character
  const auto compiled{sourcemeta::core::to_regex<std::string>(
      "^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}-_.]*$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Numbers are fine anywhere but at the start, whether ASCII or not
  EXPECT_TRUE(accepts("element123"));
  EXPECT_TRUE(accepts("element٠"));
  EXPECT_FALSE(accepts("٠element"));
  EXPECT_FALSE(accepts("0element"));
}

TEST(Regex_matches, unicode_dot_matches_codepoint) {
  // An anchored single dot must consume exactly one codepoint, no matter
  // how many UTF-8 bytes that codepoint takes
  const auto compiled{sourcemeta::core::to_regex<std::string>("^.$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  EXPECT_TRUE(accepts("A"));
  EXPECT_TRUE(accepts("À"));
  EXPECT_TRUE(accepts("中"));
  EXPECT_TRUE(accepts("\U00010400"));
}

TEST(Regex_matches, unicode_quantifier_on_codepoints) {
  // `{3}` counts codepoints: each input below holds three of them
  const auto compiled{sourcemeta::core::to_regex<std::string>("^.{3}$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  EXPECT_TRUE(accepts("ABC"));
  EXPECT_TRUE(accepts("ÀÁÂ"));
  EXPECT_TRUE(accepts("中文字"));
  EXPECT_TRUE(accepts("\U00010400\U00010401\U00010402"));
}

TEST(Regex_matches, digit_ascii_only) {
  // `\d` is expected to cover the ASCII digits 0-9 only, so a digit from
  // another script must not match
  const auto compiled{sourcemeta::core::to_regex<std::string>("^\\d$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  EXPECT_TRUE(accepts("0"));
  EXPECT_FALSE(accepts("\u07C0"));
}

TEST(Regex_matches, word_ascii_only) {
  // `\w` is expected to cover ASCII [a-zA-Z0-9_] only, so an accented
  // letter must not match
  const auto compiled{sourcemeta::core::to_regex<std::string>("^\\w$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  EXPECT_TRUE(accepts("a"));
  EXPECT_FALSE(accepts("é"));
}

TEST(Regex_matches, nonbmp_emoji_quantifier) {
  // The dragon emoji (U+1F432) takes 4 bytes in UTF-8. For `^🐲*$` to
  // behave, the `*` quantifier has to apply to the whole codepoint rather
  // than to its trailing byte
  const auto compiled{
      sourcemeta::core::to_regex<std::string>("^\U0001F432*$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Zero, one, and two repetitions
  EXPECT_TRUE(accepts(""));
  EXPECT_TRUE(accepts("\U0001F432"));
  EXPECT_TRUE(accepts("\U0001F432\U0001F432"));

  // A different non-BMP codepoint and an ASCII letter
  EXPECT_FALSE(accepts("\U0001F409"));
  EXPECT_FALSE(accepts("D"));
}

TEST(Regex_matches, nonbmp_literal_match) {
  // Literal matching of a single 4-byte UTF-8 character between anchors
  const auto compiled{sourcemeta::core::to_regex<std::string>("^\U0001F432$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Only exactly one occurrence of the very same codepoint matches
  EXPECT_TRUE(accepts("\U0001F432"));
  EXPECT_FALSE(accepts("\U0001F409"));
  EXPECT_FALSE(accepts("\U0001F432\U0001F432"));
}

TEST(Regex_matches, xml_ncname_simplified) {
  // Simplified name pattern: must not start with a colon or a decimal
  // digit, starts with a letter or underscore, and continues with letters,
  // decimal digits, or a few punctuation characters
  const auto compiled{sourcemeta::core::to_regex<std::string>(
      "^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
  EXPECT_TRUE(compiled.has_value());
  const auto accepts{[&compiled](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(compiled.value(), input);
  }};

  // Valid names
  EXPECT_TRUE(accepts("element"));
  EXPECT_TRUE(accepts("_element"));
  EXPECT_TRUE(accepts("element123"));
  EXPECT_TRUE(accepts("élément"));
  EXPECT_TRUE(accepts("element٠"));
  EXPECT_TRUE(accepts("\U00010400element"));

  // Names with a forbidden first character
  EXPECT_FALSE(accepts(":element"));
  EXPECT_FALSE(accepts("0element"));
  EXPECT_FALSE(accepts("٠element"));
  EXPECT_FALSE(accepts("\U000104A0element"));
}
Loading