Skip to content

Commit 08e83fe

Browse files
authored
Merge pull request #20297 from sno2/wtf8-conversion-buffer-overflows
std: fix buffer overflows from improper WTF encoding
2 parents 219acaa + 0b35080 commit 08e83fe

File tree

4 files changed

+112
-29
lines changed

4 files changed

+112
-29
lines changed

lib/std/fs/Dir.zig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1789,6 +1789,9 @@ pub fn symLink(
17891789
// when converting to an NT namespaced path. CreateSymbolicLink in
17901790
// symLinkW will handle the necessary conversion.
17911791
var target_path_w: windows.PathSpace = undefined;
1792+
if (try std.unicode.checkWtf8ToWtf16LeOverflow(target_path, &target_path_w.data)) {
1793+
return error.NameTooLong;
1794+
}
17921795
target_path_w.len = try std.unicode.wtf8ToWtf16Le(&target_path_w.data, target_path);
17931796
target_path_w.data[target_path_w.len] = 0;
17941797
// However, we need to canonicalize any path separators to `\`, since if

lib/std/posix.zig

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3124,8 +3124,10 @@ pub fn chdir(dir_path: []const u8) ChangeCurDirError!void {
31243124
@compileError("WASI does not support os.chdir");
31253125
} else if (native_os == .windows) {
31263126
var wtf16_dir_path: [windows.PATH_MAX_WIDE]u16 = undefined;
3127-
const len = try std.unicode.wtf8ToWtf16Le(wtf16_dir_path[0..], dir_path);
3128-
if (len > wtf16_dir_path.len) return error.NameTooLong;
3127+
if (try std.unicode.checkWtf8ToWtf16LeOverflow(dir_path, &wtf16_dir_path)) {
3128+
return error.NameTooLong;
3129+
}
3130+
const len = try std.unicode.wtf8ToWtf16Le(&wtf16_dir_path, dir_path);
31293131
return chdirW(wtf16_dir_path[0..len]);
31303132
} else {
31313133
const dir_path_c = try toPosixPath(dir_path);
@@ -3139,9 +3141,12 @@ pub fn chdir(dir_path: []const u8) ChangeCurDirError!void {
31393141
/// On other platforms, `dir_path` is an opaque sequence of bytes with no particular encoding.
31403142
pub fn chdirZ(dir_path: [*:0]const u8) ChangeCurDirError!void {
31413143
if (native_os == .windows) {
3144+
const dir_path_span = mem.span(dir_path);
31423145
var wtf16_dir_path: [windows.PATH_MAX_WIDE]u16 = undefined;
3143-
const len = try std.unicode.wtf8ToWtf16Le(wtf16_dir_path[0..], mem.span(dir_path));
3144-
if (len > wtf16_dir_path.len) return error.NameTooLong;
3146+
if (try std.unicode.checkWtf8ToWtf16LeOverflow(dir_path_span, &wtf16_dir_path)) {
3147+
return error.NameTooLong;
3148+
}
3149+
const len = try std.unicode.wtf8ToWtf16Le(&wtf16_dir_path, dir_path_span);
31453150
return chdirW(wtf16_dir_path[0..len]);
31463151
} else if (native_os == .wasi and !builtin.link_libc) {
31473152
return chdir(mem.span(dir_path));

lib/std/posix/test.zig

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@ const tmpDir = std.testing.tmpDir;
2222
const Dir = std.fs.Dir;
2323
const ArenaAllocator = std.heap.ArenaAllocator;
2424

25+
// https://github.com/ziglang/zig/issues/20288
26+
test "WTF-8 to WTF-16 conversion buffer overflows" {
27+
if (native_os != .windows) return error.SkipZigTest;
28+
29+
const input_wtf8 = "\u{10FFFF}" ** 16385;
30+
try expectError(error.NameTooLong, posix.chdir(input_wtf8));
31+
try expectError(error.NameTooLong, posix.chdirZ(input_wtf8));
32+
}
33+
2534
test "chdir smoke test" {
2635
if (native_os == .wasi) return error.SkipZigTest;
2736

lib/std/unicode.zig

Lines changed: 91 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,29 +1405,38 @@ test "ArrayList functions on a re-used list" {
14051405
}
14061406
}
14071407

1408-
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
1409-
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
1408+
fn utf8ToUtf16LeStringLiteralImpl(comptime utf8: []const u8, comptime surrogates: Surrogates) *const [calcUtf16LeLenImpl(utf8, surrogates) catch |err| @compileError(err):0]u16 {
14101409
return comptime blk: {
1411-
const len: usize = calcUtf16LeLen(utf8) catch unreachable;
1410+
const len: usize = calcUtf16LeLenImpl(utf8, surrogates) catch unreachable;
14121411
var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
1413-
const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
1412+
const utf16le_len = utf8ToUtf16LeImpl(&utf16le, utf8[0..], surrogates) catch |err| @compileError(err);
14141413
assert(len == utf16le_len);
14151414
const final = utf16le;
14161415
break :blk &final;
14171416
};
14181417
}
14191418

1420-
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
1419+
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
1420+
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
1421+
return utf8ToUtf16LeStringLiteralImpl(utf8, .cannot_encode_surrogate_half);
1422+
}
14211423

1422-
/// Returns length in UTF-16 of UTF-8 slice as length of []u16.
1423-
/// Length in []u8 is 2*len16.
1424-
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
1424+
/// Converts a WTF-8 string literal into a WTF-16LE string literal.
1425+
pub fn wtf8ToWtf16LeStringLiteral(comptime wtf8: []const u8) *const [calcWtf16LeLen(wtf8) catch |err| @compileError(err):0]u16 {
1426+
return utf8ToUtf16LeStringLiteralImpl(wtf8, .can_encode_surrogate_half);
1427+
}
1428+
1429+
pub fn calcUtf16LeLenImpl(utf8: []const u8, comptime surrogates: Surrogates) !usize {
1430+
const utf8DecodeImpl = switch (surrogates) {
1431+
.cannot_encode_surrogate_half => utf8Decode,
1432+
.can_encode_surrogate_half => wtf8Decode,
1433+
};
14251434
var src_i: usize = 0;
14261435
var dest_len: usize = 0;
14271436
while (src_i < utf8.len) {
14281437
const n = try utf8ByteSequenceLength(utf8[src_i]);
14291438
const next_src_i = src_i + n;
1430-
const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
1439+
const codepoint = try utf8DecodeImpl(utf8[src_i..next_src_i]);
14311440
if (codepoint < 0x10000) {
14321441
dest_len += 1;
14331442
} else {
@@ -1438,16 +1447,37 @@ pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
14381447
return dest_len;
14391448
}
14401449

1441-
fn testCalcUtf16LeLen() !void {
1442-
try testing.expectEqual(@as(usize, 1), try calcUtf16LeLen("a"));
1443-
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("abcdefghij"));
1444-
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("äåéëþüúíóö"));
1445-
try testing.expectEqual(@as(usize, 5), try calcUtf16LeLen("こんにちは"));
1450+
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
1451+
1452+
/// Returns length in UTF-16LE of UTF-8 slice as length of []u16.
1453+
/// Length in []u8 is 2*len16.
1454+
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
1455+
return calcUtf16LeLenImpl(utf8, .cannot_encode_surrogate_half);
1456+
}
1457+
1458+
const CalcWtf16LeLenError = Wtf8DecodeError || error{Utf8InvalidStartByte};
1459+
1460+
/// Returns length in WTF-16LE of WTF-8 slice as length of []u16.
1461+
/// Length in []u8 is 2*len16.
1462+
pub fn calcWtf16LeLen(wtf8: []const u8) CalcWtf16LeLenError!usize {
1463+
return calcUtf16LeLenImpl(wtf8, .can_encode_surrogate_half);
14461464
}
14471465

1448-
test "calculate utf16 string length of given utf8 string in u16" {
1449-
try testCalcUtf16LeLen();
1450-
try comptime testCalcUtf16LeLen();
1466+
fn testCalcUtf16LeLenImpl(calcUtf16LeLenImpl_: anytype) !void {
1467+
try testing.expectEqual(@as(usize, 1), try calcUtf16LeLenImpl_("a"));
1468+
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLenImpl_("abcdefghij"));
1469+
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLenImpl_("äåéëþüúíóö"));
1470+
try testing.expectEqual(@as(usize, 5), try calcUtf16LeLenImpl_("こんにちは"));
1471+
}
1472+
1473+
test calcUtf16LeLen {
1474+
try testCalcUtf16LeLenImpl(calcUtf16LeLen);
1475+
try comptime testCalcUtf16LeLenImpl(calcUtf16LeLen);
1476+
}
1477+
1478+
test calcWtf16LeLen {
1479+
try testCalcUtf16LeLenImpl(calcWtf16LeLen);
1480+
try comptime testCalcUtf16LeLenImpl(calcWtf16LeLen);
14511481
}
14521482

14531483
/// Print the given `utf16le` string, encoded as UTF-8 bytes.
@@ -1487,8 +1517,10 @@ pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {
14871517
test fmtUtf16Le {
14881518
const expectFmt = testing.expectFmt;
14891519
try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
1520+
try expectFmt("", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral(""))});
14901521
try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
1491-
try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});
1522+
try expectFmt("foo", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("foo"))});
1523+
try expectFmt("𐐷", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("𐐷"))});
14921524
try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
14931525
try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
14941526
try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
@@ -1497,12 +1529,12 @@ test fmtUtf16Le {
14971529
try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
14981530
}
14991531

1500-
test utf8ToUtf16LeStringLiteral {
1532+
fn testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral_: anytype) !void {
15011533
{
15021534
const bytes = [_:0]u16{
15031535
mem.nativeToLittle(u16, 0x41),
15041536
};
1505-
const utf16 = utf8ToUtf16LeStringLiteral("A");
1537+
const utf16 = utf8ToUtf16LeStringLiteral_("A");
15061538
try testing.expectEqualSlices(u16, &bytes, utf16);
15071539
try testing.expect(utf16[1] == 0);
15081540
}
@@ -1511,31 +1543,31 @@ test utf8ToUtf16LeStringLiteral {
15111543
mem.nativeToLittle(u16, 0xD801),
15121544
mem.nativeToLittle(u16, 0xDC37),
15131545
};
1514-
const utf16 = utf8ToUtf16LeStringLiteral("𐐷");
1546+
const utf16 = utf8ToUtf16LeStringLiteral_("𐐷");
15151547
try testing.expectEqualSlices(u16, &bytes, utf16);
15161548
try testing.expect(utf16[2] == 0);
15171549
}
15181550
{
15191551
const bytes = [_:0]u16{
15201552
mem.nativeToLittle(u16, 0x02FF),
15211553
};
1522-
const utf16 = utf8ToUtf16LeStringLiteral("\u{02FF}");
1554+
const utf16 = utf8ToUtf16LeStringLiteral_("\u{02FF}");
15231555
try testing.expectEqualSlices(u16, &bytes, utf16);
15241556
try testing.expect(utf16[1] == 0);
15251557
}
15261558
{
15271559
const bytes = [_:0]u16{
15281560
mem.nativeToLittle(u16, 0x7FF),
15291561
};
1530-
const utf16 = utf8ToUtf16LeStringLiteral("\u{7FF}");
1562+
const utf16 = utf8ToUtf16LeStringLiteral_("\u{7FF}");
15311563
try testing.expectEqualSlices(u16, &bytes, utf16);
15321564
try testing.expect(utf16[1] == 0);
15331565
}
15341566
{
15351567
const bytes = [_:0]u16{
15361568
mem.nativeToLittle(u16, 0x801),
15371569
};
1538-
const utf16 = utf8ToUtf16LeStringLiteral("\u{801}");
1570+
const utf16 = utf8ToUtf16LeStringLiteral_("\u{801}");
15391571
try testing.expectEqualSlices(u16, &bytes, utf16);
15401572
try testing.expect(utf16[1] == 0);
15411573
}
@@ -1544,12 +1576,20 @@ test utf8ToUtf16LeStringLiteral {
15441576
mem.nativeToLittle(u16, 0xDBFF),
15451577
mem.nativeToLittle(u16, 0xDFFF),
15461578
};
1547-
const utf16 = utf8ToUtf16LeStringLiteral("\u{10FFFF}");
1579+
const utf16 = utf8ToUtf16LeStringLiteral_("\u{10FFFF}");
15481580
try testing.expectEqualSlices(u16, &bytes, utf16);
15491581
try testing.expect(utf16[2] == 0);
15501582
}
15511583
}
15521584

1585+
test utf8ToUtf16LeStringLiteral {
1586+
try testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral);
1587+
}
1588+
1589+
test wtf8ToWtf16LeStringLiteral {
1590+
try testUtf8ToUtf16LeStringLiteral(wtf8ToWtf16LeStringLiteral);
1591+
}
1592+
15531593
fn testUtf8CountCodepoints() !void {
15541594
try testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
15551595
try testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
@@ -1795,6 +1835,30 @@ pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) error{InvalidWtf8}!usize
17951835
return utf8ToUtf16LeImpl(wtf16le, wtf8, .can_encode_surrogate_half);
17961836
}
17971837

1838+
fn checkUtf8ToUtf16LeOverflowImpl(utf8: []const u8, utf16le: []const u16, comptime surrogates: Surrogates) !bool {
1839+
// Each u8 in UTF-8/WTF-8 correlates to at most one u16 in UTF-16LE/WTF-16LE.
1840+
if (utf16le.len >= utf8.len) return false;
1841+
const utf16_len = calcUtf16LeLenImpl(utf8, surrogates) catch {
1842+
return switch (surrogates) {
1843+
.cannot_encode_surrogate_half => error.InvalidUtf8,
1844+
.can_encode_surrogate_half => error.InvalidWtf8,
1845+
};
1846+
};
1847+
return utf16_len > utf16le.len;
1848+
}
1849+
1850+
/// Checks if calling `utf8ToUtf16Le` would overflow. Might fail if utf8 is not
1851+
/// valid UTF-8.
1852+
pub fn checkUtf8ToUtf16LeOverflow(utf8: []const u8, utf16le: []const u16) error{InvalidUtf8}!bool {
1853+
return checkUtf8ToUtf16LeOverflowImpl(utf8, utf16le, .cannot_encode_surrogate_half);
1854+
}
1855+
1856+
/// Checks if calling `wtf8ToWtf16Le` would overflow. Might fail if wtf8 is not
1857+
/// valid WTF-8.
1858+
pub fn checkWtf8ToWtf16LeOverflow(wtf8: []const u8, wtf16le: []const u16) error{InvalidWtf8}!bool {
1859+
return checkUtf8ToUtf16LeOverflowImpl(wtf8, wtf16le, .can_encode_surrogate_half);
1860+
}
1861+
17981862
/// Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
17991863
/// character (U+FFFD).
18001864
/// All surrogate codepoints and the replacement character are encoded as three
@@ -2000,6 +2064,8 @@ fn testRoundtripWtf8(wtf8: []const u8) !void {
20002064
{
20012065
var wtf16_buf: [32]u16 = undefined;
20022066
const wtf16_len = try wtf8ToWtf16Le(&wtf16_buf, wtf8);
2067+
try testing.expectEqual(wtf16_len, calcWtf16LeLen(wtf8));
2068+
try testing.expectEqual(false, checkWtf8ToWtf16LeOverflow(wtf8, &wtf16_buf));
20032069
const wtf16 = wtf16_buf[0..wtf16_len];
20042070

20052071
var roundtripped_buf: [32]u8 = undefined;

0 commit comments

Comments
 (0)