Skip to content

Commit ceac31b

Browse files
committed
WIP
Signed-off-by: Juan Cruz Viotti <[email protected]>
1 parent 529d4a6 commit ceac31b

File tree

5 files changed

+141
-27
lines changed

5 files changed

+141
-27
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ endif()
8282

8383
if(SOURCEMETA_CORE_REGEX)
8484
find_package(BoostRegex REQUIRED)
85+
find_package(ICU REQUIRED)
8586
add_subdirectory(src/core/regex)
8687
endif()
8788

cmake/FindICU.cmake

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Locate ICU (International Components for Unicode).
#
# Search hints, in order of precedence:
#   ICU_ROOT       CMake variable pointing at the installation prefix
#   ENV{ICU_ROOT}  Environment variable with the same meaning
#   Homebrew's keg-only icu4c prefixes, followed by the default search paths
#
# On success this module defines:
#   ICU_FOUND, ICU_INCLUDE_DIRS and ICU_LIBRARIES
#   The imported targets ICU::uc, ICU::i18n and ICU::data
if(NOT ICU_FOUND)
  # Honour a caller-provided ICU_ROOT instead of overwriting it
  set(_icu_search_hints "")
  if(ICU_ROOT)
    list(APPEND _icu_search_hints "${ICU_ROOT}")
  endif()
  if(DEFINED ENV{ICU_ROOT})
    list(APPEND _icu_search_hints "$ENV{ICU_ROOT}")
  endif()
  # Homebrew does not link icu4c into its main prefix, so these locations
  # are not covered by the default search paths
  list(APPEND _icu_search_hints
    "/opt/homebrew/opt/icu4c"
    "/usr/local/opt/icu4c")

  find_path(ICU_INCLUDE_DIR
    NAMES unicode/unistr.h
    HINTS ${_icu_search_hints}
    PATH_SUFFIXES include)

  find_library(ICU_UC_LIBRARY
    NAMES icuuc
    HINTS ${_icu_search_hints}
    PATH_SUFFIXES lib)

  # The Windows builds of ICU name these two libraries differently
  find_library(ICU_I18N_LIBRARY
    NAMES icui18n icuin
    HINTS ${_icu_search_hints}
    PATH_SUFFIXES lib)

  find_library(ICU_DATA_LIBRARY
    NAMES icudata icudt
    HINTS ${_icu_search_hints}
    PATH_SUFFIXES lib)

  unset(_icu_search_hints)
  mark_as_advanced(ICU_INCLUDE_DIR ICU_UC_LIBRARY
    ICU_I18N_LIBRARY ICU_DATA_LIBRARY)

  if(ICU_INCLUDE_DIR AND ICU_UC_LIBRARY AND ICU_I18N_LIBRARY AND ICU_DATA_LIBRARY)
    set(ICU_FOUND ON)
    set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
    # Dependents come first so that static linking resolves symbols in order
    set(ICU_LIBRARIES ${ICU_I18N_LIBRARY} ${ICU_UC_LIBRARY} ${ICU_DATA_LIBRARY})

    if(NOT TARGET ICU::data)
      add_library(ICU::data UNKNOWN IMPORTED)
      set_target_properties(ICU::data PROPERTIES
        IMPORTED_LOCATION "${ICU_DATA_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}")
    endif()

    if(NOT TARGET ICU::uc)
      add_library(ICU::uc UNKNOWN IMPORTED)
      set_target_properties(ICU::uc PROPERTIES
        IMPORTED_LOCATION "${ICU_UC_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
        # The common library needs the data library
        INTERFACE_LINK_LIBRARIES "ICU::data")
    endif()

    if(NOT TARGET ICU::i18n)
      add_library(ICU::i18n UNKNOWN IMPORTED)
      set_target_properties(ICU::i18n PROPERTIES
        IMPORTED_LOCATION "${ICU_I18N_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}"
        # The internationalization library builds on the common library
        INTERFACE_LINK_LIBRARIES "ICU::uc")
    endif()

    if(NOT ICU_FIND_QUIETLY)
      message(STATUS "ICU found")
    endif()
  else()
    set(ICU_FOUND OFF)
    # Only abort the configuration when the caller asked for REQUIRED
    if(ICU_FIND_REQUIRED)
      message(FATAL_ERROR "ICU not found. Please install ICU or set ICU_ROOT.")
    elseif(NOT ICU_FIND_QUIETLY)
      message(STATUS "ICU not found")
    endif()
  endif()
endif()

src/core/regex/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ if(SOURCEMETA_CORE_INSTALL)
44
sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME regex)
55
endif()
66

7-
target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex)
7+
target_link_libraries(sourcemeta_core_regex INTERFACE Boost::regex ICU::uc ICU::i18n ICU::data)

src/core/regex/include/sourcemeta/core/regex.h

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
#pragma GCC diagnostic ignored "-Wconversion"
1313
#endif
1414
#include <boost/regex.hpp>
15+
#include <memory> // std::unique_ptr
16+
#include <unicode/regex.h>
17+
#include <unicode/unistr.h>
1518
#if defined(__clang__)
1619
#pragma clang diagnostic pop
1720
#elif defined(__GNUC__)
@@ -20,6 +23,7 @@
2023

2124
#include <cassert> // assert
2225
#include <cstdint> // std::uint8_t, std::uint64_t
26+
#include <iostream> // std::cerr
2327
#include <optional> // std::optional
2428
#include <regex> // std::regex
2529
#include <string> // std::stoull
@@ -59,12 +63,15 @@ struct RegexTypeNoop {
5963
auto operator==(const RegexTypeNoop &) const noexcept -> bool = default;
6064
};
6165

66+
/// @ingroup regex
67+
using RegexTypeICU = std::shared_ptr<icu::RegexPattern>;
68+
6269
/// @ingroup regex
6370
template <typename T>
64-
using Regex =
65-
std::variant<RegexTypeBoost<typename T::value_type>, RegexTypePrefix<T>,
66-
RegexTypeNonEmpty, RegexTypeRange,
67-
RegexTypeStd<typename T::value_type>, RegexTypeNoop>;
71+
using Regex = std::variant<RegexTypeBoost<typename T::value_type>,
72+
RegexTypePrefix<T>, RegexTypeNonEmpty,
73+
RegexTypeRange, RegexTypeStd<typename T::value_type>,
74+
RegexTypeNoop, RegexTypeICU>;
6875
#if !defined(DOXYGEN)
6976
// For fast internal dispatching. It must stay in sync with the variant above
7077
enum class RegexIndex : std::uint8_t {
@@ -73,10 +80,31 @@ enum class RegexIndex : std::uint8_t {
7380
NonEmpty,
7481
Range,
7582
Std,
76-
Noop
83+
Noop,
84+
ICU
7785
};
7886
#endif
7987

88+
// Helper to detect if a pattern requires Unicode support.
//
// Returns true when the pattern contains any of the following:
// - A `\u` escape, which also covers the braced `\u{...}` form
// - A `\p{` property escape
// - A byte outside of the 7-bit ASCII range
// - A `.` character. NOTE(review): presumably so that the dot matches a
//   whole code point rather than a single byte. Confirm that routing every
//   pattern with a dot (even an escaped one) is intended
template <typename T> auto requires_unicode(const T &pattern) -> bool {
  if (pattern.find("\\u") != T::npos || pattern.find("\\p{") != T::npos) {
    return true;
  }

  // A single pass covers both the non-ASCII and the dot checks
  for (const auto &character : pattern) {
    if (character == '.' || static_cast<unsigned char>(character) > 127) {
      return true;
    }
  }

  return false;
}
107+
80108
/// @ingroup regex
81109
///
82110
/// Compile a regular expression from a string. If the regular expression is
@@ -119,6 +147,32 @@ auto to_regex(const T &pattern) -> std::optional<Regex<T>> {
119147
return RegexTypeRange{minimum, maximum};
120148
}
121149

150+
if (requires_unicode(pattern)) {
  // Rewrite every braced `\u{...}` escape into the `\x{...}` form before
  // handing the pattern over to ICU. Both prefixes have the same length,
  // so the replacement never shifts the rest of the pattern.
  // NOTE(review): this also rewrites an escaped backslash followed by
  // `u{` (i.e. `\\u{`). Confirm whether that matters in practice
  T icu_compatible_pattern{pattern};
  std::size_t pos{0};
  while ((pos = icu_compatible_pattern.find("\\u{", pos)) != T::npos) {
    icu_compatible_pattern.replace(pos, 3, "\\x{");
    pos += 3;
  }

  UErrorCode status{U_ZERO_ERROR};
  UParseError parse_error;
  const icu::UnicodeString icu_pattern{
      icu::UnicodeString::fromUTF8(icu_compatible_pattern)};

  // Take ownership immediately so that the compiled pattern is released on
  // every exit path without a manual `delete`
  RegexTypeICU regex_pattern{icu::RegexPattern::compile(
      icu_pattern, UREGEX_DOTALL, parse_error, status)};
  if (U_FAILURE(status) || !regex_pattern) {
    return std::nullopt;
  }

  return regex_pattern;
}
175+
122176
RegexTypeBoost<typename T::value_type> result{
123177
pattern,
124178
boost::regex::no_except |
@@ -194,6 +248,32 @@ auto matches(const Regex<T> &regex, const T &value) -> bool {
194248
value, *std::get_if<RegexTypeStd<typename T::value_type>>(&regex));
195249
case RegexIndex::Noop:
196250
return true;
251+
case RegexIndex::ICU: {
  const auto *icu_regex{std::get_if<RegexTypeICU>(&regex)};
  if (!icu_regex || !(*icu_regex)) {
    return false;
  }

  // The input must outlive the matcher, so it is declared first
  const icu::UnicodeString icu_input{icu::UnicodeString::fromUTF8(value)};
  UErrorCode status{U_ZERO_ERROR};
  // The caller owns the matcher returned by ICU. Wrapping it guarantees
  // that it is released on every exit path
  const std::unique_ptr<icu::RegexMatcher> matcher{
      (*icu_regex)->matcher(icu_input, status)};
  if (U_FAILURE(status) || !matcher) {
    return false;
  }

  // Search anywhere in the input rather than requiring a full match
  const auto result{matcher->find(status)};
  return U_SUCCESS(status) && result;
}
197277
}
198278

199279
// See https://en.cppreference.com/w/cpp/utility/unreachable

test/regex/regex_matches_test.cc

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,6 @@ TEST(Regex_matches, match_false_6) {
161161
}
162162

163163
TEST(Regex_matches, unicode_range_arabic_indic_digit) {
164-
// U+0660-U+0669 are Arabic-Indic digits
165-
// With /u flag: [\u0660-\u0669] would match any Arabic-Indic digit
166-
// Without /u flag: treated as UTF-8 bytes, requires complex pattern
167164
const auto regex{
168165
sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
169166
EXPECT_TRUE(regex.has_value());
@@ -173,9 +170,6 @@ TEST(Regex_matches, unicode_range_arabic_indic_digit) {
173170
}
174171

175172
TEST(Regex_matches, unicode_range_4byte_deseret) {
176-
// U+10400-U+1044F are Deseret letters (4-byte UTF-8)
177-
// With /u flag: [\u{10400}-\u{1044F}] would work
178-
// Without /u flag: cannot express this range simply
179173
const auto regex{
180174
sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
181175
EXPECT_TRUE(regex.has_value());
@@ -187,8 +181,6 @@ TEST(Regex_matches, unicode_range_4byte_deseret) {
187181
}
188182

189183
TEST(Regex_matches, unicode_property_letter) {
190-
// With /u flag and Unicode property escapes: \p{Letter} matches any letter
191-
// This would drastically simplify XML Name validation
192184
const auto regex{sourcemeta::core::to_regex<std::string>(
193185
"^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
194186
EXPECT_TRUE(regex.has_value());
@@ -204,8 +196,6 @@ TEST(Regex_matches, unicode_property_letter) {
204196
}
205197

206198
TEST(Regex_matches, unicode_property_exclude_digit) {
207-
// With Unicode properties, we could exclude digits from start position
208-
// while allowing them in subsequent positions - exactly what XML NCName needs
209199
const auto regex{sourcemeta::core::to_regex<std::string>(
210200
"^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}-_.]*$")};
211201
EXPECT_TRUE(regex.has_value());
@@ -220,8 +210,6 @@ TEST(Regex_matches, unicode_property_exclude_digit) {
220210
}
221211

222212
TEST(Regex_matches, unicode_dot_matches_codepoint) {
223-
// With /u flag, . matches one Unicode codepoint (including 4-byte chars)
224-
// Without /u flag, . matches one byte
225213
const auto regex{sourcemeta::core::to_regex<std::string>("^.$")};
226214
EXPECT_TRUE(regex.has_value());
227215
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "A"));
@@ -232,8 +220,6 @@ TEST(Regex_matches, unicode_dot_matches_codepoint) {
232220
}
233221

234222
TEST(Regex_matches, unicode_quantifier_on_codepoints) {
235-
// With /u flag, quantifiers work on Unicode codepoints
236-
// Without /u flag, quantifiers work on bytes
237223
const auto regex{sourcemeta::core::to_regex<std::string>("^.{3}$")};
238224
EXPECT_TRUE(regex.has_value());
239225
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "ABC"));
@@ -245,7 +231,6 @@ TEST(Regex_matches, unicode_quantifier_on_codepoints) {
245231

246232
TEST(Regex_matches, digit_ascii_only) {
247233
// \d should only match ASCII digits 0-9, not Unicode digits
248-
// From: https://github.com/json-schema-org/JSON-Schema-Test-Suite
249234
const auto regex{sourcemeta::core::to_regex<std::string>("^\\d$")};
250235
EXPECT_TRUE(regex.has_value());
251236
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "0"));
@@ -289,14 +274,10 @@ TEST(Regex_matches, nonbmp_literal_match) {
289274
}
290275

291276
TEST(Regex_matches, xml_ncname_simplified) {
292-
// Current: 722KB exhaustive byte pattern
293-
// With Unicode: ~50 bytes using property escapes
294-
// NCName = letter (not colon) followed by letters/digits/punctuation
295277
const auto regex{sourcemeta::core::to_regex<std::string>(
296278
"^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
297279
EXPECT_TRUE(regex.has_value());
298280

299-
// Valid NCNames
300281
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "element"));
301282
EXPECT_TRUE(
302283
sourcemeta::core::matches<std::string>(regex.value(), "_element"));
@@ -307,8 +288,6 @@ TEST(Regex_matches, xml_ncname_simplified) {
307288
sourcemeta::core::matches<std::string>(regex.value(), "element٠"));
308289
EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
309290
"\U00010400element"));
310-
311-
// Invalid NCNames
312291
EXPECT_FALSE(
313292
sourcemeta::core::matches<std::string>(regex.value(), ":element"));
314293
EXPECT_FALSE(

0 commit comments

Comments
 (0)