Skip to content

Commit 529d4a6

Browse files
committed
[WIP] Improve Unicode handling for regexes
Signed-off-by: Juan Cruz Viotti <[email protected]>
1 parent e15cf6b commit 529d4a6

File tree

1 file changed

+159
-0
lines changed

1 file changed

+159
-0
lines changed

test/regex/regex_matches_test.cc

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,162 @@ TEST(Regex_matches, match_false_6) {
159159
EXPECT_TRUE(regex.has_value());
160160
EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "bar"));
161161
}
162+
163+
TEST(Regex_matches, unicode_range_arabic_indic_digit) {
  // The character class spans the Arabic-Indic digits, U+0660 through U+0669.
  // Under /u semantics the escaped endpoints denote code points, so the class
  // matches any Arabic-Indic digit. Without /u the subject is treated as
  // UTF-8 bytes and expressing the same range needs a complex pattern.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // Both endpoints of the range are accepted
  EXPECT_TRUE(accepts("\u0660"));
  EXPECT_TRUE(accepts("\u0669"));
  // An ASCII digit is outside the range
  EXPECT_FALSE(accepts("0"));
}
174+
175+
TEST(Regex_matches, unicode_range_4byte_deseret) {
  // Deseret letters live at U+10400 through U+1044F and need four bytes each
  // in UTF-8. The range relies on /u-style braced code point escapes, as it
  // cannot be expressed simply without the /u flag.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // First and last code points of the range
  EXPECT_TRUE(accepts("\U00010400"));
  EXPECT_TRUE(accepts("\U0001044F"));
  // A plain ASCII letter is not part of the range
  EXPECT_FALSE(accepts("A"));
}
188+
189+
TEST(Regex_matches, unicode_property_letter) {
  // Unicode property escapes (available with the /u flag) let a single
  // \p{Letter} stand for any letter. The pattern requires a leading letter
  // followed by any mix of letters and numbers, which would drastically
  // simplify XML Name validation.
  const auto pattern{sourcemeta::core::to_regex<std::string>(
      "^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // Subjects that start with a letter
  EXPECT_TRUE(accepts("hello"));
  EXPECT_TRUE(accepts("hello123"));
  EXPECT_TRUE(accepts("Àlement"));
  EXPECT_TRUE(accepts("中文"));
  EXPECT_TRUE(accepts("\U00010400test"));
  // A subject that starts with digits is rejected
  EXPECT_FALSE(accepts("123hello"));
}
205+
206+
TEST(Regex_matches, unicode_property_exclude_digit) {
  // With Unicode properties, we can exclude digits from the start position
  // while allowing them in subsequent positions - exactly what XML NCName
  // needs.
  //
  // The hyphen inside the trailing character class is escaped on purpose. An
  // unescaped `-` placed between a property escape and `_` reads as a range
  // whose endpoint is a character class, which is a syntax error in
  // ECMAScript Unicode mode and is handled inconsistently by other engines.
  // Escaping it makes the hyphen an unambiguous literal.
  const auto regex{sourcemeta::core::to_regex<std::string>(
      "^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}\\-_.]*$")};
  EXPECT_TRUE(regex.has_value());

  // Digits (ASCII or not) are fine after the first position
  EXPECT_TRUE(
      sourcemeta::core::matches<std::string>(regex.value(), "element123"));
  EXPECT_TRUE(
      sourcemeta::core::matches<std::string>(regex.value(), "element٠"));

  // Digits (ASCII or not) are rejected at the first position
  EXPECT_FALSE(
      sourcemeta::core::matches<std::string>(regex.value(), "٠element"));
  EXPECT_FALSE(
      sourcemeta::core::matches<std::string>(regex.value(), "0element"));
}
221+
222+
TEST(Regex_matches, unicode_dot_matches_codepoint) {
  // With the /u flag, `.` matches one Unicode code point (including 4-byte
  // characters). Without the /u flag, `.` matches one byte.
  //
  // The subjects below cover every UTF-8 encoded length (1 to 4 bytes), each
  // being exactly one code point.
  const auto regex{sourcemeta::core::to_regex<std::string>("^.$")};
  EXPECT_TRUE(regex.has_value());

  // 1 byte
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "A"));
  // 2 bytes
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "À"));
  // 3 bytes (U+4E2D), written as an escape so the literal cannot be silently
  // lost and turn into an empty string
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(), "\u4E2D"));
  // 4 bytes
  EXPECT_TRUE(
      sourcemeta::core::matches<std::string>(regex.value(), "\U00010400"));

  // `.` must consume exactly one character, so the empty string cannot match
  EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), ""));
}
233+
234+
TEST(Regex_matches, unicode_quantifier_on_codepoints) {
  // A counted quantifier is expected to count Unicode code points (the /u
  // behaviour) rather than bytes (the behaviour without /u). Every subject
  // below consists of exactly three code points.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^.{3}$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  EXPECT_TRUE(accepts("ABC"));
  EXPECT_TRUE(accepts("ÀÁÂ"));
  EXPECT_TRUE(accepts("中文字"));
  EXPECT_TRUE(accepts("\U00010400\U00010401\U00010402"));
}
245+
246+
TEST(Regex_matches, digit_ascii_only) {
  // The digit shorthand class is restricted to the ASCII digits 0-9 and must
  // not accept Unicode digits.
  // Case taken from https://github.com/json-schema-org/JSON-Schema-Test-Suite
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\\d$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // ASCII digit
  EXPECT_TRUE(accepts("0"));
  // Non-ASCII Unicode digit
  EXPECT_FALSE(accepts("\u07C0"));
}
254+
255+
TEST(Regex_matches, word_ascii_only) {
  // The word shorthand class is restricted to ASCII [a-zA-Z0-9_] and must not
  // accept Unicode letters.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\\w$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // ASCII letter
  EXPECT_TRUE(accepts("a"));
  // Non-ASCII letter
  EXPECT_FALSE(accepts("é"));
}
262+
263+
TEST(Regex_matches, nonbmp_emoji_quantifier) {
  // The dragon emoji (U+1F432) takes four bytes in UTF-8. The pattern anchors
  // zero or more repetitions of it, which only works if the `*` quantifier
  // applies to the whole code point rather than to its last byte.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("^\U0001F432*$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // Zero, one and two repetitions
  EXPECT_TRUE(accepts(""));
  EXPECT_TRUE(accepts("\U0001F432"));
  EXPECT_TRUE(accepts("\U0001F432\U0001F432"));
  // A different 4-byte code point and an ASCII letter are rejected
  EXPECT_FALSE(accepts("\U0001F409"));
  EXPECT_FALSE(accepts("D"));
}
278+
279+
TEST(Regex_matches, nonbmp_literal_match) {
  // Literal matching of a single 4-byte UTF-8 character, anchored on both
  // sides.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\U0001F432$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());
  // The exact character
  EXPECT_TRUE(accepts("\U0001F432"));
  // A different 4-byte character
  EXPECT_FALSE(accepts("\U0001F409"));
  // The character repeated twice
  EXPECT_FALSE(accepts("\U0001F432\U0001F432"));
}
290+
291+
TEST(Regex_matches, xml_ncname_simplified) {
  // The current NCName pattern is a 722KB exhaustive byte-level expression.
  // With Unicode property escapes the same idea fits in roughly 50 bytes: a
  // leading letter or underscore (never a colon or a decimal digit), followed
  // by letters, decimal digits and a few punctuation characters.
  const auto pattern{sourcemeta::core::to_regex<std::string>(
      "^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
  // Shorthand for matching a subject against the compiled pattern
  const auto accepts{[&pattern](const std::string &input) -> bool {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  }};

  EXPECT_TRUE(pattern.has_value());

  // Valid NCNames
  EXPECT_TRUE(accepts("element"));
  EXPECT_TRUE(accepts("_element"));
  EXPECT_TRUE(accepts("element123"));
  EXPECT_TRUE(accepts("élément"));
  EXPECT_TRUE(accepts("element٠"));
  EXPECT_TRUE(accepts("\U00010400element"));

  // Invalid NCNames
  EXPECT_FALSE(accepts(":element"));
  EXPECT_FALSE(accepts("0element"));
  EXPECT_FALSE(accepts("٠element"));
  EXPECT_FALSE(accepts("\U000104A0element"));
}

0 commit comments

Comments
 (0)