Skip to content

Commit 445ecb9

Browse files
committed
WIP
Signed-off-by: Juan Cruz Viotti <[email protected]>
1 parent 529d4a6 commit 445ecb9

File tree

5 files changed

+157
-33
lines changed

5 files changed

+157
-33
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ endif()
8282

8383
if(SOURCEMETA_CORE_REGEX)
8484
find_package(BoostRegex REQUIRED)
85+
find_package(ICU REQUIRED)
8586
add_subdirectory(src/core/regex)
8687
endif()
8788

cmake/FindICU.cmake

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Find module for ICU (International Components for Unicode).
#
# Inputs:
#   ICU_ROOT (CMake variable or environment variable): installation prefix to
#   search first. When neither is provided, the Apple Silicon Homebrew prefix
#   is used as the default hint.
#
# Outputs on success:
#   ICU_FOUND, ICU_INCLUDE_DIRS, ICU_LIBRARIES
#   Imported targets ICU::uc, ICU::i18n and ICU::data
#
# On failure, configuration stops with a fatal error.
if(NOT ICU_FOUND)
  # Do not clobber a prefix chosen by the caller: the error message below
  # tells users to set ICU_ROOT, so that setting must actually be honoured.
  if(NOT DEFINED ICU_ROOT)
    if(DEFINED ENV{ICU_ROOT})
      set(ICU_ROOT "$ENV{ICU_ROOT}")
    else()
      set(ICU_ROOT "/opt/homebrew/opt/icu4c")
    endif()
  endif()

  # HINTS are searched before the default system locations, so the selected
  # prefix still wins when it exists, while platforms that ship ICU in a
  # standard location can fall back to the regular search.
  find_path(ICU_INCLUDE_DIR
    NAMES unicode/unistr.h
    HINTS "${ICU_ROOT}/include")

  find_library(ICU_UC_LIBRARY
    NAMES icuuc
    HINTS "${ICU_ROOT}/lib")

  find_library(ICU_I18N_LIBRARY
    NAMES icui18n
    HINTS "${ICU_ROOT}/lib")

  find_library(ICU_DATA_LIBRARY
    NAMES icudata
    HINTS "${ICU_ROOT}/lib")

  if(ICU_INCLUDE_DIR AND ICU_UC_LIBRARY AND ICU_I18N_LIBRARY AND ICU_DATA_LIBRARY)
    set(ICU_FOUND ON)
    set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
    set(ICU_LIBRARIES ${ICU_UC_LIBRARY} ${ICU_I18N_LIBRARY} ${ICU_DATA_LIBRARY})

    # Each imported target is guarded so that including this module more than
    # once does not attempt to redefine it.
    if(NOT TARGET ICU::uc)
      add_library(ICU::uc UNKNOWN IMPORTED)
      set_target_properties(ICU::uc PROPERTIES
        IMPORTED_LOCATION "${ICU_UC_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
    endif()

    if(NOT TARGET ICU::i18n)
      add_library(ICU::i18n UNKNOWN IMPORTED)
      set_target_properties(ICU::i18n PROPERTIES
        IMPORTED_LOCATION "${ICU_I18N_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
    endif()

    if(NOT TARGET ICU::data)
      add_library(ICU::data UNKNOWN IMPORTED)
      set_target_properties(ICU::data PROPERTIES
        IMPORTED_LOCATION "${ICU_DATA_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
    endif()

    message(STATUS "ICU found")
  else()
    message(FATAL_ERROR "ICU not found. Please install ICU or set ICU_ROOT.")
  endif()
endif()

src/core/regex/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ if(SOURCEMETA_CORE_INSTALL)
44
sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME regex)
55
endif()
66

7-
target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex)
7+
target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex ICU::uc ICU::i18n ICU::data)

src/core/regex/include/sourcemeta/core/regex.h

Lines changed: 101 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,23 @@
1212
#pragma GCC diagnostic ignored "-Wconversion"
1313
#endif
1414
#include <boost/regex.hpp>
15+
#include <unicode/regex.h>
16+
#include <unicode/unistr.h>
1517
#if defined(__clang__)
1618
#pragma clang diagnostic pop
1719
#elif defined(__GNUC__)
1820
#pragma GCC diagnostic pop
1921
#endif
2022

21-
#include <cassert> // assert
22-
#include <cstdint> // std::uint8_t, std::uint64_t
23-
#include <optional> // std::optional
24-
#include <regex> // std::regex
25-
#include <string> // std::stoull
26-
#include <utility> // std::pair
27-
#include <variant> // std::variant
23+
#include <algorithm> // std::ranges::any_of
24+
#include <cassert> // assert
25+
#include <cstdint> // std::uint8_t, std::uint64_t
26+
#include <memory> // std::shared_ptr, std::unique_ptr
27+
#include <optional> // std::optional
28+
#include <regex> // std::regex
29+
#include <string> // std::stoull
30+
#include <utility> // std::pair
31+
#include <variant> // std::variant
2832

2933
/// @defgroup regex Regex
3034
/// @brief An opinionated regex ECMA 262 implementation for JSON Schema
@@ -59,12 +63,15 @@ struct RegexTypeNoop {
5963
auto operator==(const RegexTypeNoop &) const noexcept -> bool = default;
6064
};
6165

66+
/// @ingroup regex
67+
using RegexTypeICU = std::shared_ptr<icu::RegexPattern>;
68+
6269
/// @ingroup regex
6370
template <typename T>
64-
using Regex =
65-
std::variant<RegexTypeBoost<typename T::value_type>, RegexTypePrefix<T>,
66-
RegexTypeNonEmpty, RegexTypeRange,
67-
RegexTypeStd<typename T::value_type>, RegexTypeNoop>;
71+
using Regex = std::variant<RegexTypeBoost<typename T::value_type>,
72+
RegexTypePrefix<T>, RegexTypeNonEmpty,
73+
RegexTypeRange, RegexTypeStd<typename T::value_type>,
74+
RegexTypeICU, RegexTypeNoop>;
6875
#if !defined(DOXYGEN)
6976
// For fast internal dispatching. It must stay in sync with the variant above
7077
enum class RegexIndex : std::uint8_t {
@@ -73,10 +80,41 @@ enum class RegexIndex : std::uint8_t {
7380
NonEmpty,
7481
Range,
7582
Std,
83+
ICU,
7684
Noop
7785
};
7886
#endif
7987

88+
/// @ingroup regex
///
/// Heuristically detect if a regular expression pattern seems to require
/// Unicode support. The pattern is scanned once, left to right, and is
/// reported as Unicode-dependent as soon as any of the following is seen:
///
/// - A Unicode property escape, either positive or negated (`\p{` or `\P{`)
/// - A Unicode codepoint escape (a backslash followed by `u`, which covers
///   both the four-digit and the braced form)
/// - A code unit whose value is above 127 (a non-ASCII character)
/// - An unescaped dot (which should match Unicode codepoints)
///
/// Backslash escapes are consumed as pairs, so an escaped dot (a literal dot)
/// or an escaped backslash followed by `u` (a literal backslash and a plain
/// letter) do not count. This is still a best-effort heuristic and may have
/// false positives: for example, a dot inside a character class is reported
/// even though it is a literal there. For example:
///
/// ```cpp
/// #include <sourcemeta/core/regex.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::seems_unicode("\\p{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\P{Letter}"));
/// assert(sourcemeta::core::seems_unicode("\\u0041"));
/// assert(sourcemeta::core::seems_unicode("café"));
/// assert(sourcemeta::core::seems_unicode(".+"));
/// assert(!sourcemeta::core::seems_unicode("^[a-z]+$"));
/// assert(!sourcemeta::core::seems_unicode("^\\d+\\.\\d+$"));
/// ```
template <typename T> auto seems_unicode(const T &pattern) -> bool {
  const auto is_non_ascii{[](const auto character) -> bool {
    return static_cast<unsigned char>(character) > 127;
  }};

  const auto size{pattern.size()};
  for (typename T::size_type index{0}; index < size; ++index) {
    const auto character{pattern[index]};
    if (is_non_ascii(character) || character == '.') {
      return true;
    }

    // Only a backslash that has something after it starts an escape
    if (character != '\\' || index + 1 >= size) {
      continue;
    }

    const auto escaped{pattern[index + 1]};
    if (escaped == 'u' || is_non_ascii(escaped)) {
      return true;
    }

    if ((escaped == 'p' || escaped == 'P') && index + 2 < size &&
        pattern[index + 2] == '{') {
      return true;
    }

    // Skip the escaped character so that it is not inspected again as if it
    // were unescaped (i.e. the dot in an escaped dot, or the second backslash
    // of an escaped backslash)
    ++index;
  }

  return false;
}
117+
80118
/// @ingroup regex
81119
///
82120
/// Compile a regular expression from a string. If the regular expression is
@@ -119,6 +157,35 @@ auto to_regex(const T &pattern) -> std::optional<Regex<T>> {
119157
return RegexTypeRange{minimum, maximum};
120158
}
121159

160+
if (seems_unicode(pattern)) {
161+
T icu_compatible_pattern{pattern};
162+
std::size_t position{0};
163+
// ICU uses \x{} syntax for Unicode codepoint escapes, while ECMAScript
164+
// uses \u{}. Convert the pattern to ICU-compatible syntax.
165+
while ((position = icu_compatible_pattern.find("\\u{", position)) !=
166+
T::npos) {
167+
icu_compatible_pattern.replace(position, 3, "\\x{");
168+
position += 3;
169+
}
170+
171+
UErrorCode status{U_ZERO_ERROR};
172+
UParseError parse_error;
173+
icu::UnicodeString icu_pattern{
174+
icu::UnicodeString::fromUTF8(icu_compatible_pattern)};
175+
176+
auto *regex_pattern{icu::RegexPattern::compile(icu_pattern, UREGEX_DOTALL,
177+
parse_error, status)};
178+
179+
if (U_FAILURE(status) || regex_pattern == nullptr) {
180+
if (regex_pattern != nullptr) {
181+
delete regex_pattern;
182+
}
183+
return std::nullopt;
184+
}
185+
186+
return std::shared_ptr<icu::RegexPattern>(regex_pattern);
187+
}
188+
122189
RegexTypeBoost<typename T::value_type> result{
123190
pattern,
124191
boost::regex::no_except |
@@ -192,6 +259,29 @@ auto matches(const Regex<T> &regex, const T &value) -> bool {
192259
case RegexIndex::Std:
193260
return std::regex_search(
194261
value, *std::get_if<RegexTypeStd<typename T::value_type>>(&regex));
262+
case RegexIndex::ICU: {
263+
const auto *icu_regex{std::get_if<RegexTypeICU>(&regex)};
264+
if (!icu_regex || !(*icu_regex)) {
265+
return false;
266+
}
267+
268+
icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8(value)};
269+
UErrorCode status{U_ZERO_ERROR};
270+
std::unique_ptr<icu::RegexMatcher> matcher{
271+
(*icu_regex)->matcher(icu_input, status)};
272+
273+
if (U_FAILURE(status) || !matcher) {
274+
return false;
275+
}
276+
277+
auto result{matcher->find(status)};
278+
279+
if (U_FAILURE(status)) {
280+
return false;
281+
}
282+
283+
return result;
284+
}
195285
case RegexIndex::Noop:
196286
return true;
197287
}

test/regex/regex_matches_test.cc

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,6 @@ TEST(Regex_matches, match_false_6) {
161161
}
162162

163163
TEST(Regex_matches, unicode_range_arabic_indic_digit) {
164-
// U+0660-U+0669 are Arabic-Indic digits
165-
// With /u flag: [\u0660-\u0669] would match any Arabic-Indic digit
166-
// Without /u flag: treated as UTF-8 bytes, requires complex pattern
167164
const auto regex{
168165
sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
169166
EXPECT_TRUE(regex.has_value());
@@ -173,9 +170,6 @@ TEST(Regex_matches, unicode_range_arabic_indic_digit) {
173170
}
174171

175172
TEST(Regex_matches, unicode_range_4byte_deseret) {
176-
// U+10400-U+1044F are Deseret letters (4-byte UTF-8)
177-
// With /u flag: [\u{10400}-\u{1044F}] would work
178-
// Without /u flag: cannot express this range simply
179173
const auto regex{
180174
sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
181175
EXPECT_TRUE(regex.has_value());
@@ -187,8 +181,6 @@ TEST(Regex_matches, unicode_range_4byte_deseret) {
187181
}
188182

189183
TEST(Regex_matches, unicode_property_letter) {
190-
// With /u flag and Unicode property escapes: \p{Letter} matches any letter
191-
// This would drastically simplify XML Name validation
192184
const auto regex{sourcemeta::core::to_regex<std::string>(
193185
"^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
194186
EXPECT_TRUE(regex.has_value());
@@ -204,8 +196,6 @@ TEST(Regex_matches, unicode_property_letter) {
204196
}
205197

206198
TEST(Regex_matches, unicode_property_exclude_digit) {
207-
// With Unicode properties, we could exclude digits from start position
208-
// while allowing them in subsequent positions - exactly what XML NCName needs
209199
const auto regex{sourcemeta::core::to_regex<std::string>(
210200
"^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}-_.]*$")};
211201
EXPECT_TRUE(regex.has_value());
@@ -220,8 +210,6 @@ TEST(Regex_matches, unicode_property_exclude_digit) {
220210
}
221211

222212
TEST(Regex_matches, unicode_dot_matches_codepoint) {
223-
// With /u flag, . matches one Unicode codepoint (including 4-byte chars)
224-
// Without /u flag, . matches one byte
225213
const auto regex{sourcemeta::core::to_regex<std::string>("^.$")};
226214
EXPECT_TRUE(regex.has_value());
227215
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "A"));
@@ -232,8 +220,6 @@ TEST(Regex_matches, unicode_dot_matches_codepoint) {
232220
}
233221

234222
TEST(Regex_matches, unicode_quantifier_on_codepoints) {
235-
// With /u flag, quantifiers work on Unicode codepoints
236-
// Without /u flag, quantifiers work on bytes
237223
const auto regex{sourcemeta::core::to_regex<std::string>("^.{3}$")};
238224
EXPECT_TRUE(regex.has_value());
239225
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "ABC"));
@@ -245,7 +231,6 @@ TEST(Regex_matches, unicode_quantifier_on_codepoints) {
245231

246232
TEST(Regex_matches, digit_ascii_only) {
247233
// \d should only match ASCII digits 0-9, not Unicode digits
248-
// From: https://github.com/json-schema-org/JSON-Schema-Test-Suite
249234
const auto regex{sourcemeta::core::to_regex<std::string>("^\\d$")};
250235
EXPECT_TRUE(regex.has_value());
251236
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "0"));
@@ -289,14 +274,10 @@ TEST(Regex_matches, nonbmp_literal_match) {
289274
}
290275

291276
TEST(Regex_matches, xml_ncname_simplified) {
292-
// Current: 722KB exhaustive byte pattern
293-
// With Unicode: ~50 bytes using property escapes
294-
// NCName = letter (not colon) followed by letters/digits/punctuation
295277
const auto regex{sourcemeta::core::to_regex<std::string>(
296278
"^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
297279
EXPECT_TRUE(regex.has_value());
298280

299-
// Valid NCNames
300281
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "element"));
301282
EXPECT_TRUE(
302283
sourcemeta::core::matches<std::string>(regex.value(), "_element"));
@@ -307,8 +288,6 @@ TEST(Regex_matches, xml_ncname_simplified) {
307288
sourcemeta::core::matches<std::string>(regex.value(), "element٠"));
308289
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
309290
"\U00010400element"));
310-
311-
// Invalid NCNames
312291
EXPECT_FALSE(
313292
sourcemeta::core::matches<std::string>(regex.value(), ":element"));
314293
EXPECT_FALSE(

0 commit comments

Comments
 (0)