Skip to content

Commit 43e7d1b

Browse files
committed
[WIP] Improve Unicode handling for regexes
Signed-off-by: Juan Cruz Viotti <[email protected]>
1 parent e15cf6b commit 43e7d1b

File tree

1 file changed

+173
-0
lines changed

1 file changed

+173
-0
lines changed

test/regex/regex_matches_test.cc

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,176 @@ TEST(Regex_matches, match_false_6) {
159159
EXPECT_TRUE(regex.has_value());
160160
EXPECT_FALSE(sourcemeta::core::matches<std::string>(regex.value(), "bar"));
161161
}
162+
163+
TEST(Regex_matches, unicode_range_arabic_indic_digit) {
  // The Arabic-Indic digits occupy U+0660 through U+0669.
  // In Unicode (/u) mode the class [\u0660-\u0669] would cover all of them;
  // without it the input is seen as UTF-8 bytes and a far more complex
  // pattern would be needed.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("[\\u0660-\\u0669]")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("\u0660")); // ٠
  EXPECT_TRUE(accepts("\u0669")); // ٩
  EXPECT_FALSE(accepts("0"));
}
176+
177+
TEST(Regex_matches, unicode_range_4byte_deseret) {
  // Deseret letters live at U+10400 through U+1044F and take 4 bytes each in
  // UTF-8. In Unicode (/u) mode [\u{10400}-\u{1044F}] would express the range
  // directly; without it there is no simple way to write this range.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("[\\u{10400}-\\u{1044F}]")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("\U00010400")); // 𐐀
  EXPECT_TRUE(accepts("\U0001044F")); // 𐑏
  EXPECT_FALSE(accepts("A"));
}
190+
191+
TEST(Regex_matches, unicode_property_letter) {
  // In Unicode (/u) mode with property escapes, \p{Letter} matches any
  // letter. Having this available would drastically simplify XML Name
  // validation.
  const auto pattern{sourcemeta::core::to_regex<std::string>(
      "^\\p{Letter}[\\p{Letter}\\p{Number}]*$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("hello"));
  EXPECT_TRUE(accepts("hello123"));
  EXPECT_TRUE(accepts("Àlement"));          // U+00C0
  EXPECT_TRUE(accepts("中文"));             // Chinese
  EXPECT_TRUE(accepts("\U00010400test"));   // Deseret
  EXPECT_FALSE(accepts("123hello"));        // starts with digit
}
209+
210+
TEST(Regex_matches, unicode_property_exclude_digit) {
  // With Unicode properties, a digit can be rejected in the first position
  // while still being allowed afterwards - exactly what XML NCName needs.
  //
  // The hyphen in the trailing character class is escaped so that it is
  // always a literal "-". Written bare right after \p{Number} it sits in
  // range-operator position, which regex engines either reject or interpret
  // differently (potentially as a "-" to "_" range that would admit ":",
  // ASCII digits, uppercase letters, and more).
  const auto regex{sourcemeta::core::to_regex<std::string>(
      "^(?!\\p{Number})\\p{Letter}[\\p{Letter}\\p{Number}\\-_.]*$")};
  EXPECT_TRUE(regex.has_value());

  // Accepted: a letter followed by letters, digits, or "-", "_", "."
  EXPECT_TRUE(
      sourcemeta::core::matches<std::string>(regex.value(), "element123"));
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(
      regex.value(), "element٠")); // Arabic-Indic digit after the first letter
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(
      regex.value(), "my-element")); // hyphen is a literal class member

  // Rejected: a digit in the first position
  EXPECT_FALSE(sourcemeta::core::matches<std::string>(
      regex.value(), "٠element")); // Arabic-Indic digit at start
  EXPECT_FALSE(sourcemeta::core::matches<std::string>(
      regex.value(), "0element")); // ASCII digit at start

  // Rejected: ":" would only be admitted if "-_" were read as a range
  EXPECT_FALSE(
      sourcemeta::core::matches<std::string>(regex.value(), "ns:element"));
}
225+
226+
TEST(Regex_matches, unicode_dot_matches_codepoint) {
  // With the /u flag, "." matches one Unicode codepoint (including 4-byte
  // characters). Without the /u flag, "." matches one byte.
  //
  // Every subject below is a single codepoint, one per UTF-8 encoded length
  // from 1 to 4 bytes. The non-ASCII subjects that are easy to lose or
  // mis-render in an editor are written as universal character names so the
  // intended codepoint is unambiguous.
  const auto regex{sourcemeta::core::to_regex<std::string>("^.$")};
  EXPECT_TRUE(regex.has_value());
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
                                                     "A")); // 1-byte UTF-8
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(regex.value(),
                                                     "À")); // 2-byte UTF-8
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(
      regex.value(), "\u20AC")); // 3-byte UTF-8 (U+20AC EURO SIGN)
  EXPECT_TRUE(sourcemeta::core::matches<std::string>(
      regex.value(), "\U00010400")); // 4-byte UTF-8
}
239+
240+
TEST(Regex_matches, unicode_quantifier_on_codepoints) {
  // In Unicode (/u) mode a quantifier counts codepoints; without it a
  // quantifier counts bytes. Each subject below is exactly three codepoints
  // long, with a growing UTF-8 byte length.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^.{3}$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("ABC"));
  EXPECT_TRUE(accepts("ÀÁÂ"));    // 3 codepoints, 6 bytes
  EXPECT_TRUE(accepts("中文字")); // 3 codepoints, 9 bytes
  EXPECT_TRUE(
      accepts("\U00010400\U00010401\U00010402")); // 3 codepoints, 12 bytes
}
254+
255+
TEST(Regex_matches, digit_ascii_only) {
  // \d is expected to accept only the ASCII digits 0-9 and never a Unicode
  // digit from another script.
  // From: https://github.com/json-schema-org/JSON-Schema-Test-Suite
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\\d$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("0"));
  EXPECT_FALSE(accepts("\u07C0")); // NKO DIGIT ZERO
}
264+
265+
TEST(Regex_matches, word_ascii_only) {
  // \w is expected to accept only ASCII [a-zA-Z0-9_] and never a Unicode
  // letter outside of that set.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\\w$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("a"));
  EXPECT_FALSE(accepts("é")); // e-acute
}
273+
274+
TEST(Regex_matches, nonbmp_emoji_quantifier) {
  // The dragon emoji (U+1F432) takes 4 bytes in UTF-8. The pattern ^🐲*$
  // should accept zero or more of them, which only works if the quantifier
  // applies to the whole codepoint rather than to its last byte.
  const auto pattern{
      sourcemeta::core::to_regex<std::string>("^\U0001F432*$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts(""));                     // empty
  EXPECT_TRUE(accepts("\U0001F432"));           // one dragon
  EXPECT_TRUE(accepts("\U0001F432\U0001F432")); // two dragons
  EXPECT_FALSE(accepts("\U0001F409"));          // different emoji
  EXPECT_FALSE(accepts("D"));                   // ASCII
}
291+
292+
TEST(Regex_matches, nonbmp_literal_match) {
  // A 4-byte UTF-8 character used as a literal must match itself exactly
  // once: not a different character, and not a repetition of itself.
  const auto pattern{sourcemeta::core::to_regex<std::string>("^\U0001F432$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  EXPECT_TRUE(accepts("\U0001F432"));            // dragon
  EXPECT_FALSE(accepts("\U0001F409"));           // different dragon
  EXPECT_FALSE(accepts("\U0001F432\U0001F432")); // two dragons
}
303+
304+
TEST(Regex_matches, xml_ncname_simplified) {
  // Current approach: a 722KB exhaustive byte-level pattern.
  // With Unicode support: roughly 50 bytes using property escapes.
  // NCName = a letter (never a colon) followed by letters, digits, or
  // punctuation.
  const auto pattern{sourcemeta::core::to_regex<std::string>(
      "^(?![:\\p{Nd}])[\\p{L}_][\\p{L}\\p{Nd}\\-._·]*$")};
  EXPECT_TRUE(pattern.has_value());

  // Small helper so each expectation only spells out the subject string
  const auto accepts = [&pattern](const std::string &input) {
    return sourcemeta::core::matches<std::string>(pattern.value(), input);
  };

  // Valid NCNames
  EXPECT_TRUE(accepts("element"));
  EXPECT_TRUE(accepts("_element"));
  EXPECT_TRUE(accepts("element123"));
  EXPECT_TRUE(accepts("élément"));            // French
  EXPECT_TRUE(accepts("element٠"));           // Arabic-Indic digit in middle
  EXPECT_TRUE(accepts("\U00010400element"));  // Deseret at start

  // Invalid NCNames
  EXPECT_FALSE(accepts(":element"));          // starts with colon
  EXPECT_FALSE(accepts("0element"));          // starts with ASCII digit
  EXPECT_FALSE(accepts("٠element"));          // starts with Unicode digit
  EXPECT_FALSE(accepts("\U000104A0element")); // starts with Osmanya digit
}

0 commit comments

Comments
 (0)