@@ -159,3 +159,176 @@ TEST(Regex_matches, match_false_6) {
159159 EXPECT_TRUE (regex.has_value ());
160160 EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " bar" ));
161161}
162+
163+ TEST (Regex_matches, unicode_range_arabic_indic_digit) {
164+ // U+0660-U+0669 are Arabic-Indic digits
165+ // With /u flag: [\u0660-\u0669] would match any Arabic-Indic digit
166+ // Without /u flag: treated as UTF-8 bytes, requires complex pattern
167+ const auto regex{
168+ sourcemeta::core::to_regex<std::string>(" [\\ u0660-\\ u0669]" )};
169+ EXPECT_TRUE (regex.has_value ());
170+ EXPECT_TRUE (
171+ sourcemeta::core::matches<std::string>(regex.value (), " \u0660 " )); // ٠
172+ EXPECT_TRUE (
173+ sourcemeta::core::matches<std::string>(regex.value (), " \u0669 " )); // ٩
174+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " 0" ));
175+ }
176+
177+ TEST (Regex_matches, unicode_range_4byte_deseret) {
178+ // U+10400-U+1044F are Deseret letters (4-byte UTF-8)
179+ // With /u flag: [\u{10400}-\u{1044F}] would work
180+ // Without /u flag: cannot express this range simply
181+ const auto regex{
182+ sourcemeta::core::to_regex<std::string>(" [\\ u{10400}-\\ u{1044F}]" )};
183+ EXPECT_TRUE (regex.has_value ());
184+ EXPECT_TRUE (
185+ sourcemeta::core::matches<std::string>(regex.value (), " \U00010400 " )); // 𐐀
186+ EXPECT_TRUE (
187+ sourcemeta::core::matches<std::string>(regex.value (), " \U0001044F " )); // 𐑏
188+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(regex.value (), " A" ));
189+ }
190+
191+ TEST (Regex_matches, unicode_property_letter) {
192+ // With /u flag and Unicode property escapes: \p{Letter} matches any letter
193+ // This would drastically simplify XML Name validation
194+ const auto regex{sourcemeta::core::to_regex<std::string>(
195+ " ^\\ p{Letter}[\\ p{Letter}\\ p{Number}]*$" )};
196+ EXPECT_TRUE (regex.has_value ());
197+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " hello" ));
198+ EXPECT_TRUE (
199+ sourcemeta::core::matches<std::string>(regex.value (), " hello123" ));
200+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
201+ " Àlement" )); // U+00C0
202+ EXPECT_TRUE (
203+ sourcemeta::core::matches<std::string>(regex.value (), " 中文" )); // Chinese
204+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
205+ regex.value (), " \U00010400 test" )); // Deseret
206+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
207+ regex.value (), " 123hello" )); // starts with digit
208+ }
209+
210+ TEST (Regex_matches, unicode_property_exclude_digit) {
211+ // With Unicode properties, we could exclude digits from start position
212+ // while allowing them in subsequent positions - exactly what XML NCName needs
213+ const auto regex{sourcemeta::core::to_regex<std::string>(
214+ " ^(?!\\ p{Number})\\ p{Letter}[\\ p{Letter}\\ p{Number}-_.]*$" )};
215+ EXPECT_TRUE (regex.has_value ());
216+ EXPECT_TRUE (
217+ sourcemeta::core::matches<std::string>(regex.value (), " element123" ));
218+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
219+ regex.value (), " element٠" )); // Arabic-Indic digit in middle
220+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
221+ regex.value (), " ٠element" )); // Arabic-Indic digit at start
222+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
223+ regex.value (), " 0element" )); // ASCII digit at start
224+ }
225+
226+ TEST (Regex_matches, unicode_dot_matches_codepoint) {
227+ // With /u flag, . matches one Unicode codepoint (including 4-byte chars)
228+ // Without /u flag, . matches one byte
229+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^.$" )};
230+ EXPECT_TRUE (regex.has_value ());
231+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " A" ));
232+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
233+ " À" )); // 2-byte UTF-8
234+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
235+ " 中" )); // 3-byte UTF-8
236+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
237+ regex.value (), " \U00010400 " )); // 4-byte UTF-8
238+ }
239+
240+ TEST (Regex_matches, unicode_quantifier_on_codepoints) {
241+ // With /u flag, quantifiers work on Unicode codepoints
242+ // Without /u flag, quantifiers work on bytes
243+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^.{3}$" )};
244+ EXPECT_TRUE (regex.has_value ());
245+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " ABC" ));
246+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
247+ regex.value (), " ÀÁÂ" )); // 3 codepoints, 6 bytes
248+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
249+ regex.value (), " 中文字" )); // 3 codepoints, 9 bytes
250+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
251+ regex.value (),
252+ " \U00010400\U00010401\U00010402 " )); // 3 codepoints, 12 bytes
253+ }
254+
255+ TEST (Regex_matches, digit_ascii_only) {
256+ // \d should only match ASCII digits 0-9, not Unicode digits
257+ // From: https://github.com/json-schema-org/JSON-Schema-Test-Suite
258+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\\ d$" )};
259+ EXPECT_TRUE (regex.has_value ());
260+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " 0" ));
261+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
262+ regex.value (), " \u07C0 " )); // NKO DIGIT ZERO
263+ }
264+
265+ TEST (Regex_matches, word_ascii_only) {
266+ // \w should only match ASCII [a-zA-Z0-9_], not Unicode letters
267+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\\ w$" )};
268+ EXPECT_TRUE (regex.has_value ());
269+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " a" ));
270+ EXPECT_FALSE (
271+ sourcemeta::core::matches<std::string>(regex.value (), " é" )); // e-acute
272+ }
273+
274+ TEST (Regex_matches, nonbmp_emoji_quantifier) {
275+ // Dragon emoji (U+1F432) is 4-byte UTF-8
276+ // Pattern ^🐲*$ should match zero or more dragon emojis
277+ // This tests that quantifiers work on codepoints, not bytes
278+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\U0001F432 *$" )};
279+ EXPECT_TRUE (regex.has_value ());
280+ EXPECT_TRUE (
281+ sourcemeta::core::matches<std::string>(regex.value (), " " )); // empty
282+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
283+ regex.value (), " \U0001F432 " )); // one dragon
284+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
285+ regex.value (), " \U0001F432\U0001F432 " )); // two dragons
286+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
287+ regex.value (), " \U0001F409 " )); // different emoji
288+ EXPECT_FALSE (
289+ sourcemeta::core::matches<std::string>(regex.value (), " D" )); // ASCII
290+ }
291+
292+ TEST (Regex_matches, nonbmp_literal_match) {
293+ // Test literal matching of 4-byte UTF-8 characters
294+ const auto regex{sourcemeta::core::to_regex<std::string>(" ^\U0001F432 $" )};
295+ EXPECT_TRUE (regex.has_value ());
296+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
297+ " \U0001F432 " )); // dragon
298+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
299+ regex.value (), " \U0001F409 " )); // different dragon
300+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
301+ regex.value (), " \U0001F432\U0001F432 " )); // two dragons
302+ }
303+
304+ TEST (Regex_matches, xml_ncname_simplified) {
305+ // Current: 722KB exhaustive byte pattern
306+ // With Unicode: ~50 bytes using property escapes
307+ // NCName = letter (not colon) followed by letters/digits/punctuation
308+ const auto regex{sourcemeta::core::to_regex<std::string>(
309+ " ^(?![:\\ p{Nd}])[\\ p{L}_][\\ p{L}\\ p{Nd}\\ -._·]*$" )};
310+ EXPECT_TRUE (regex.has_value ());
311+
312+ // Valid NCNames
313+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (), " element" ));
314+ EXPECT_TRUE (
315+ sourcemeta::core::matches<std::string>(regex.value (), " _element" ));
316+ EXPECT_TRUE (
317+ sourcemeta::core::matches<std::string>(regex.value (), " element123" ));
318+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(regex.value (),
319+ " élément" )); // French
320+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
321+ regex.value (), " element٠" )); // Arabic-Indic digit in middle
322+ EXPECT_TRUE (sourcemeta::core::matches<std::string>(
323+ regex.value (), " \U00010400 element" )); // Deseret at start
324+
325+ // Invalid NCNames
326+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
327+ regex.value (), " :element" )); // starts with colon
328+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
329+ regex.value (), " 0element" )); // starts with ASCII digit
330+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
331+ regex.value (), " ٠element" )); // starts with Unicode digit
332+ EXPECT_FALSE (sourcemeta::core::matches<std::string>(
333+ regex.value (), " \U000104A0 element" )); // starts with Osmanya digit
334+ }
0 commit comments