@@ -1405,29 +1405,38 @@ test "ArrayList functions on a re-used list" {
1405
1405
}
1406
1406
}
1407
1407
1408
- /// Converts a UTF-8 string literal into a UTF-16LE string literal.
1409
- pub fn utf8ToUtf16LeStringLiteral (comptime utf8 : []const u8 ) * const [calcUtf16LeLen (utf8 ) catch | err | @compileError (err ):0 ]u16 {
1408
/// Shared comptime implementation behind `utf8ToUtf16LeStringLiteral` and
/// `wtf8ToWtf16LeStringLiteral`.
/// Converts the comptime-known `utf8` into a zero-terminated `u16` array via
/// `utf8ToUtf16LeImpl` and returns a pointer to it. `surrogates` is forwarded
/// unchanged to `calcUtf16LeLenImpl` and `utf8ToUtf16LeImpl`.
/// Any error from the length calculation or the conversion is reported as a
/// compile error whose message is the error's name.
fn utf8ToUtf16LeStringLiteralImpl(comptime utf8: []const u8, comptime surrogates: Surrogates) *const [calcUtf16LeLenImpl(utf8, surrogates) catch |err| @compileError(@errorName(err)):0]u16 {
    return comptime blk: {
        // Cannot fail here: the identical call in the return type has already
        // turned any error into a compile error.
        const len: usize = calcUtf16LeLenImpl(utf8, surrogates) catch unreachable;
        var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
        // `@compileError` takes a string, so pass the error's name rather than the error value.
        const utf16le_len = utf8ToUtf16LeImpl(&utf16le, utf8[0..], surrogates) catch |err| @compileError(@errorName(err));
        assert(len == utf16le_len);
        // Hand out a pointer to a `const` copy instead of the mutable comptime buffer.
        const final = utf16le;
        break :blk &final;
    };
}
1419
1418
1420
- const CalcUtf16LeLenError = Utf8DecodeError || error {Utf8InvalidStartByte };
1419
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
/// `utf8` must be comptime-known. The result points to a zero-terminated `u16`
/// array whose length is `calcUtf16LeLen(utf8)`.
/// If `calcUtf16LeLen` fails, compilation stops with an error message that is
/// the name of the returned error.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(utf8, .cannot_encode_surrogate_half);
}
1421
1423
1422
- /// Returns length in UTF-16 of UTF-8 slice as length of []u16.
1423
- /// Length in []u8 is 2*len16.
1424
- pub fn calcUtf16LeLen (utf8 : []const u8 ) CalcUtf16LeLenError ! usize {
1424
/// Converts a WTF-8 string literal into a WTF-16LE string literal.
/// `wtf8` must be comptime-known. The result points to a zero-terminated `u16`
/// array whose length is `calcWtf16LeLen(wtf8)`.
/// If `calcWtf16LeLen` fails, compilation stops with an error message that is
/// the name of the returned error.
pub fn wtf8ToWtf16LeStringLiteral(comptime wtf8: []const u8) *const [calcWtf16LeLen(wtf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(wtf8, .can_encode_surrogate_half);
}
1428
+
1429
+ pub fn calcUtf16LeLenImpl (utf8 : []const u8 , comptime surrogates : Surrogates ) ! usize {
1430
+ const utf8DecodeImpl = switch (surrogates ) {
1431
+ .cannot_encode_surrogate_half = > utf8Decode ,
1432
+ .can_encode_surrogate_half = > wtf8Decode ,
1433
+ };
1425
1434
var src_i : usize = 0 ;
1426
1435
var dest_len : usize = 0 ;
1427
1436
while (src_i < utf8 .len ) {
1428
1437
const n = try utf8ByteSequenceLength (utf8 [src_i ]);
1429
1438
const next_src_i = src_i + n ;
1430
- const codepoint = try utf8Decode (utf8 [src_i .. next_src_i ]);
1439
+ const codepoint = try utf8DecodeImpl (utf8 [src_i .. next_src_i ]);
1431
1440
if (codepoint < 0x10000 ) {
1432
1441
dest_len += 1 ;
1433
1442
} else {
@@ -1438,16 +1447,37 @@ pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
1438
1447
return dest_len ;
1439
1448
}
1440
1449
1441
- fn testCalcUtf16LeLen () ! void {
1442
- try testing .expectEqual (@as (usize , 1 ), try calcUtf16LeLen ("a" ));
1443
- try testing .expectEqual (@as (usize , 10 ), try calcUtf16LeLen ("abcdefghij" ));
1444
- try testing .expectEqual (@as (usize , 10 ), try calcUtf16LeLen ("äåéëþüúíóö" ));
1445
- try testing .expectEqual (@as (usize , 5 ), try calcUtf16LeLen ("こんにちは" ));
1450
+ const CalcUtf16LeLenError = Utf8DecodeError || error {Utf8InvalidStartByte };
1451
+
1452
/// Computes how many `u16` code units the UTF-16LE encoding of the UTF-8
/// slice `utf8` occupies. The size in bytes is twice the returned value.
/// Returns a `CalcUtf16LeLenError` when the length calculation fails.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
    const len16 = try calcUtf16LeLenImpl(utf8, .cannot_encode_surrogate_half);
    return len16;
}
1457
+
1458
+ const CalcWtf16LeLenError = Wtf8DecodeError || error {Utf8InvalidStartByte };
1459
+
1460
/// Computes how many `u16` code units the WTF-16LE encoding of the WTF-8
/// slice `wtf8` occupies. The size in bytes is twice the returned value.
/// Returns a `CalcWtf16LeLenError` when the length calculation fails.
pub fn calcWtf16LeLen(wtf8: []const u8) CalcWtf16LeLenError!usize {
    const len16 = try calcUtf16LeLenImpl(wtf8, .can_encode_surrogate_half);
    return len16;
}
1447
1465
1448
- test "calculate utf16 string length of given utf8 string in u16" {
1449
- try testCalcUtf16LeLen ();
1450
- try comptime testCalcUtf16LeLen ();
1466
/// Shared expectations for a length-calculating function.
/// `calcUtf16LeLenImpl_` is called with a byte string and must return (as an
/// error union) the number of `u16` code units listed for that string below.
fn testCalcUtf16LeLenImpl(calcUtf16LeLenImpl_: anytype) !void {
    const Case = struct { input: []const u8, expected: usize };
    const cases = [_]Case{
        .{ .input = "a", .expected = 1 },
        .{ .input = "abcdefghij", .expected = 10 },
        .{ .input = "äåéëþüúíóö", .expected = 10 },
        .{ .input = "こんにちは", .expected = 5 },
    };
    // Unrolled so every call still receives a comptime-known literal.
    inline for (cases) |case| {
        try testing.expectEqual(case.expected, try calcUtf16LeLenImpl_(case.input));
    }
}
1472
+
1473
// Runs the shared length expectations against `calcUtf16LeLen`, first at
// runtime and then at compile time.
test calcUtf16LeLen {
    const lenFn = calcUtf16LeLen;
    try testCalcUtf16LeLenImpl(lenFn);
    try comptime testCalcUtf16LeLenImpl(lenFn);
}
1477
+
1478
// Runs the shared length expectations against `calcWtf16LeLen`, first at
// runtime and then at compile time.
test calcWtf16LeLen {
    const lenFn = calcWtf16LeLen;
    try testCalcUtf16LeLenImpl(lenFn);
    try comptime testCalcUtf16LeLenImpl(lenFn);
}
1452
1482
1453
1483
/// Print the given `utf16le` string, encoded as UTF-8 bytes.
@@ -1487,8 +1517,10 @@ pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {
1487
1517
test fmtUtf16Le {
1488
1518
const expectFmt = testing .expectFmt ;
1489
1519
try expectFmt ("" , "{}" , .{fmtUtf16Le (utf8ToUtf16LeStringLiteral ("" ))});
1520
+ try expectFmt ("" , "{}" , .{fmtUtf16Le (wtf8ToWtf16LeStringLiteral ("" ))});
1490
1521
try expectFmt ("foo" , "{}" , .{fmtUtf16Le (utf8ToUtf16LeStringLiteral ("foo" ))});
1491
- try expectFmt ("𐐷" , "{}" , .{fmtUtf16Le (utf8ToUtf16LeStringLiteral ("𐐷" ))});
1522
+ try expectFmt ("foo" , "{}" , .{fmtUtf16Le (wtf8ToWtf16LeStringLiteral ("foo" ))});
1523
+ try expectFmt ("𐐷" , "{}" , .{fmtUtf16Le (wtf8ToWtf16LeStringLiteral ("𐐷" ))});
1492
1524
try expectFmt ("" , "{}" , .{fmtUtf16Le (&[_ ]u16 {mem .readInt (u16 , "\xff\xd7 " , native_endian )})});
1493
1525
try expectFmt ("�" , "{}" , .{fmtUtf16Le (&[_ ]u16 {mem .readInt (u16 , "\x00\xd8 " , native_endian )})});
1494
1526
try expectFmt ("�" , "{}" , .{fmtUtf16Le (&[_ ]u16 {mem .readInt (u16 , "\xff\xdb " , native_endian )})});
@@ -1497,12 +1529,12 @@ test fmtUtf16Le {
1497
1529
try expectFmt ("" , "{}" , .{fmtUtf16Le (&[_ ]u16 {mem .readInt (u16 , "\x00\xe0 " , native_endian )})});
1498
1530
}
1499
1531
1500
- test utf8ToUtf16LeStringLiteral {
1532
+ fn testUtf8ToUtf16LeStringLiteral ( utf8ToUtf16LeStringLiteral_ : anytype ) ! void {
1501
1533
{
1502
1534
const bytes = [_ :0 ]u16 {
1503
1535
mem .nativeToLittle (u16 , 0x41 ),
1504
1536
};
1505
- const utf16 = utf8ToUtf16LeStringLiteral ("A" );
1537
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("A" );
1506
1538
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1507
1539
try testing .expect (utf16 [1 ] == 0 );
1508
1540
}
@@ -1511,31 +1543,31 @@ test utf8ToUtf16LeStringLiteral {
1511
1543
mem .nativeToLittle (u16 , 0xD801 ),
1512
1544
mem .nativeToLittle (u16 , 0xDC37 ),
1513
1545
};
1514
- const utf16 = utf8ToUtf16LeStringLiteral ("𐐷" );
1546
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("𐐷" );
1515
1547
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1516
1548
try testing .expect (utf16 [2 ] == 0 );
1517
1549
}
1518
1550
{
1519
1551
const bytes = [_ :0 ]u16 {
1520
1552
mem .nativeToLittle (u16 , 0x02FF ),
1521
1553
};
1522
- const utf16 = utf8ToUtf16LeStringLiteral ("\u{02FF} " );
1554
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("\u{02FF} " );
1523
1555
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1524
1556
try testing .expect (utf16 [1 ] == 0 );
1525
1557
}
1526
1558
{
1527
1559
const bytes = [_ :0 ]u16 {
1528
1560
mem .nativeToLittle (u16 , 0x7FF ),
1529
1561
};
1530
- const utf16 = utf8ToUtf16LeStringLiteral ("\u{7FF} " );
1562
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("\u{7FF} " );
1531
1563
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1532
1564
try testing .expect (utf16 [1 ] == 0 );
1533
1565
}
1534
1566
{
1535
1567
const bytes = [_ :0 ]u16 {
1536
1568
mem .nativeToLittle (u16 , 0x801 ),
1537
1569
};
1538
- const utf16 = utf8ToUtf16LeStringLiteral ("\u{801} " );
1570
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("\u{801} " );
1539
1571
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1540
1572
try testing .expect (utf16 [1 ] == 0 );
1541
1573
}
@@ -1544,12 +1576,20 @@ test utf8ToUtf16LeStringLiteral {
1544
1576
mem .nativeToLittle (u16 , 0xDBFF ),
1545
1577
mem .nativeToLittle (u16 , 0xDFFF ),
1546
1578
};
1547
- const utf16 = utf8ToUtf16LeStringLiteral ("\u{10FFFF} " );
1579
+ const utf16 = utf8ToUtf16LeStringLiteral_ ("\u{10FFFF} " );
1548
1580
try testing .expectEqualSlices (u16 , & bytes , utf16 );
1549
1581
try testing .expect (utf16 [2 ] == 0 );
1550
1582
}
1551
1583
}
1552
1584
1585
// Runs the shared string-literal conversion expectations against
// `utf8ToUtf16LeStringLiteral`.
test utf8ToUtf16LeStringLiteral {
    const convert = utf8ToUtf16LeStringLiteral;
    try testUtf8ToUtf16LeStringLiteral(convert);
}
1588
+
1589
// Runs the shared string-literal conversion expectations against
// `wtf8ToWtf16LeStringLiteral`.
test wtf8ToWtf16LeStringLiteral {
    const convert = wtf8ToWtf16LeStringLiteral;
    try testUtf8ToUtf16LeStringLiteral(convert);
}
1592
+
1553
1593
fn testUtf8CountCodepoints () ! void {
1554
1594
try testing .expectEqual (@as (usize , 10 ), try utf8CountCodepoints ("abcdefghij" ));
1555
1595
try testing .expectEqual (@as (usize , 10 ), try utf8CountCodepoints ("äåéëþüúíóö" ));
@@ -1795,6 +1835,30 @@ pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) error{InvalidWtf8}!usize
1795
1835
return utf8ToUtf16LeImpl (wtf16le , wtf8 , .can_encode_surrogate_half );
1796
1836
}
1797
1837
1838
/// Reports whether converting `utf8` would need more `u16` code units than
/// `utf16le` can hold.
/// Returns `false` without inspecting the input when `utf16le` is at least as
/// long as `utf8`. Otherwise the required length comes from
/// `calcUtf16LeLenImpl`, and any failure there is reported as
/// `error.InvalidUtf8` or `error.InvalidWtf8` depending on `surrogates`.
fn checkUtf8ToUtf16LeOverflowImpl(utf8: []const u8, utf16le: []const u16, comptime surrogates: Surrogates) !bool {
    const capacity = utf16le.len;
    // One source byte never yields more than one u16 in the output, so a
    // destination at least as long as the source cannot overflow.
    if (utf8.len <= capacity) return false;

    const needed = calcUtf16LeLenImpl(utf8, surrogates) catch return switch (surrogates) {
        .cannot_encode_surrogate_half => error.InvalidUtf8,
        .can_encode_surrogate_half => error.InvalidWtf8,
    };
    return needed > capacity;
}
1849
+
1850
/// Reports whether `utf8ToUtf16Le` would overflow when converting `utf8` into
/// a buffer with the length of `utf16le`.
/// May return `error.InvalidUtf8` if `utf8` is not valid UTF-8.
pub fn checkUtf8ToUtf16LeOverflow(utf8: []const u8, utf16le: []const u16) error{InvalidUtf8}!bool {
    return checkUtf8ToUtf16LeOverflowImpl(
        utf8,
        utf16le,
        .cannot_encode_surrogate_half,
    );
}
1855
+
1856
/// Checks if calling `wtf8ToWtf16Le` would overflow, i.e. whether `wtf8`
/// needs more `u16` code units than `wtf16le` can hold.
/// Might fail with `error.InvalidWtf8` if `wtf8` is not valid WTF-8.
pub fn checkWtf8ToWtf16LeOverflow(wtf8: []const u8, wtf16le: []const u16) error{InvalidWtf8}!bool {
    return checkUtf8ToUtf16LeOverflowImpl(wtf8, wtf16le, .can_encode_surrogate_half);
}
1861
+
1798
1862
/// Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
1799
1863
/// character (U+FFFD).
1800
1864
/// All surrogate codepoints and the replacement character are encoded as three
@@ -2000,6 +2064,8 @@ fn testRoundtripWtf8(wtf8: []const u8) !void {
2000
2064
{
2001
2065
var wtf16_buf : [32 ]u16 = undefined ;
2002
2066
const wtf16_len = try wtf8ToWtf16Le (& wtf16_buf , wtf8 );
2067
+ try testing .expectEqual (wtf16_len , calcWtf16LeLen (wtf8 ));
2068
+ try testing .expectEqual (false , checkWtf8ToWtf16LeOverflow (wtf8 , & wtf16_buf ));
2003
2069
const wtf16 = wtf16_buf [0.. wtf16_len ];
2004
2070
2005
2071
var roundtripped_buf : [32 ]u8 = undefined ;
0 commit comments