@@ -159,3 +159,162 @@ TEST(Regex_matches, match_false_6) {
159159 EXPECT_TRUE (regex.has_value ());
160160 EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " bar" ));
161161}
162+
163+ TEST (Regex_matches, unicode_range_arabic_indic_digit) {
164+ // U+0660-U+0669 are Arabic-Indic digits
165+ // With /u flag: [\u0660-\u0669] would match any Arabic-Indic digit
166+ // Without /u flag: treated as UTF-8 bytes, requires complex pattern
167+ const auto regex{
168+ sourcemeta::core::to_regex<std::string>(" [\\ u0660-\\ u0669]" )};
169+ EXPECT_TRUE (regex.has_value ());
170+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " \u0660 " ));
171+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " \u0669 " ));
172+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " 0" ));
173+ }
174+
175+ TEST (Regex_matches, unicode_range_4byte_deseret) {
176+ // U+10400-U+1044F are Deseret letters (4-byte UTF-8)
177+ // With /u flag: [\u{10400}-\u{1044F}] would work
178+ // Without /u flag: cannot express this range simply
179+ const auto regex{
180+ sourcemeta::core::to_regex<std::string>(" [\\ u{10400}-\\ u{1044F}]" )};
181+ EXPECT_TRUE (regex.has_value ());
182+ EXPECT_TRUE (
183+ sourcemeta::core::matches<std::string>(regex.value (), " \U00010400 " ));
184+ EXPECT_TRUE (
185+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001044F " ));
186+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " A" ));
187+ }
188+
189+ TEST (Regex_matches, unicode_property_letter) {
190+ // With /u flag and Unicode property escapes: \p{Letter} matches any letter
191+ // This would drastically simplify XML Name validation
192+ const auto regex{sourcemeta::core::to_regex<std::string>(
193+ " ^\\ p{Letter}[\\ p{Letter}\\ p{Number}]*$" )};
194+ EXPECT_TRUE (regex.has_value ());
195+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " hello" ));
196+ EXPECT_TRUE (
197+ sourcemeta::core::matches<std::string>(regex.value (), " hello123" ));
198+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " Àlement" ));
199+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " 中文" ));
200+ EXPECT_TRUE (
201+ sourcemeta::core::matches<std::string>(regex.value (), " \U00010400 test" ));
202+ EXPECT_FALSE (
203+ sourcemeta::core::matches<std::string>(regex.value (), " 123hello" ));
204+ }
205+
206+ TEST (Regex_matches, unicode_property_exclude_digit) {
207+ // With Unicode properties, we could exclude digits from start position
208+ // while allowing them in subsequent positions - exactly what XML NCName needs
209+ const auto regex{sourcemeta::core::to_regex<std::string>(
210+ " ^(?!\\ p{Number})\\ p{Letter}[\\ p{Letter}\\ p{Number}-_.]*$" )};
211+ EXPECT_TRUE (regex.has_value ());
212+ EXPECT_TRUE (
213+ sourcemeta::core::matches<std::string>(regex.value (), " element123" ));
214+ EXPECT_TRUE (
215+ sourcemeta::core::matches<std::string>(regex.value (), " element٠" ));
216+ EXPECT_FALSE (
217+ sourcemeta::core::matches<std::string>(regex.value (), " ٠element" ));
218+ EXPECT_FALSE (
219+ sourcemeta::core::matches<std::string>(regex.value (), " 0element" ));
220+ }
221+
222+ TEST (Regex_matches, unicode_dot_matches_codepoint) {
223+ // With /u flag, . matches one Unicode codepoint (including 4-byte chars)
224+ // Without /u flag, . matches one byte
225+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^.$" )};
226+ EXPECT_TRUE (regex.has_value ());
227+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " A" ));
228+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " À" ));
229+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " 中" ));
230+ EXPECT_TRUE (
231+ sourcemeta::core::matches<std::string>(regex.value (), " \U00010400 " ));
232+ }
233+
234+ TEST (Regex_matches, unicode_quantifier_on_codepoints) {
235+ // With /u flag, quantifiers work on Unicode codepoints
236+ // Without /u flag, quantifiers work on bytes
237+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^.{3}$" )};
238+ EXPECT_TRUE (regex.has_value ());
239+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " ABC" ));
240+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " ÀÁÂ" ));
241+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " 中文字" ));
242+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
243+ regex.value (), " \U00010400\U00010401\U00010402 " ));
244+ }
245+
246+ TEST (Regex_matches, digit_ascii_only) {
247+ // \d should only match ASCII digits 0-9, not Unicode digits
248+ // From: https://github.com/json-schema-org/JSON-Schema-Test-Suite
249+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\\ d$" )};
250+ EXPECT_TRUE (regex.has_value ());
251+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " 0" ));
252+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " \u07C0 " ));
253+ }
254+
255+ TEST (Regex_matches, word_ascii_only) {
256+ // \w should only match ASCII [a-zA-Z0-9_], not Unicode letters
257+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\\ w$" )};
258+ EXPECT_TRUE (regex.has_value ());
259+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " a" ));
260+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " é" ));
261+ }
262+
263+ TEST (Regex_matches, nonbmp_emoji_quantifier) {
264+ // Dragon emoji (U+1F432) is 4-byte UTF-8
265+ // Pattern ^🐲*$ should match zero or more dragon emojis
266+ // This tests that quantifiers work on codepoints, not bytes
267+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\U0001F432 *$" )};
268+ EXPECT_TRUE (regex.has_value ());
269+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " " ));
270+ EXPECT_TRUE (
271+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001F432 " ));
272+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
273+ " \U0001F432\U0001F432 " ));
274+ EXPECT_FALSE (
275+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001F409 " ));
276+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " D" ));
277+ }
278+
279+ TEST (Regex_matches, nonbmp_literal_match) {
280+ // Test literal matching of 4-byte UTF-8 characters
281+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\U0001F432 $" )};
282+ EXPECT_TRUE (regex.has_value ());
283+ EXPECT_TRUE (
284+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001F432 " ));
285+ EXPECT_FALSE (
286+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001F409 " ));
287+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (),
288+ " \U0001F432\U0001F432 " ));
289+ }
290+
291+ TEST (Regex_matches, xml_ncname_simplified) {
292+ // Current: 722KB exhaustive byte pattern
293+ // With Unicode: ~50 bytes using property escapes
294+ // NCName = letter (not colon) followed by letters/digits/punctuation
295+ const auto regex{sourcemeta::core::to_regex<std::string>(
296+ " ^(?![:\\ p{Nd}])[\\ p{L}_][\\ p{L}\\ p{Nd}\\ -._·]*$" )};
297+ EXPECT_TRUE (regex.has_value ());
298+
299+ // Valid NCNames
300+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " element" ));
301+ EXPECT_TRUE (
302+ sourcemeta::core::matches<std::string>(regex.value (), " _element" ));
303+ EXPECT_TRUE (
304+ sourcemeta::core::matches<std::string>(regex.value (), " element123" ));
305+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " élément" ));
306+ EXPECT_TRUE (
307+ sourcemeta::core::matches<std::string>(regex.value (), " element٠" ));
308+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
309+ " \U00010400 element" ));
310+
311+ // Invalid NCNames
312+ EXPECT_FALSE (
313+ sourcemeta::core::matches<std::string>(regex.value (), " :element" ));
314+ EXPECT_FALSE (
315+ sourcemeta::core::matches<std::string>(regex.value (), " 0element" ));
316+ EXPECT_FALSE (
317+ sourcemeta::core::matches<std::string>(regex.value (), " ٠element" ));
318+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (),
319+ " \U000104A0 element" ));
320+ }
0 commit comments