Resolves unnecessary copying of strings between ZonGen & parse

Will give API more thought in followup
ziglang · Jan 7, 2025 · 59f12f4 · 59f12f4
1 parent 1980ecb
commit 59f12f4
Show file tree

Hide file tree

Showing 9 changed files with 236 additions and 93 deletions.
diff --git a/lib/std/zig/ZonGen.zig b/lib/std/zig/ZonGen.zig
@@ -3,6 +3,8 @@
 gpa: Allocator,
 tree: Ast,
 
+parse_str_lits: bool,
+
 nodes: std.MultiArrayList(Zoir.Node.Repr),
 extra: std.ArrayListUnmanaged(u32),
 limbs: std.ArrayListUnmanaged(std.math.big.Limb),
@@ -12,12 +14,13 @@ string_table: std.HashMapUnmanaged(u32, void, StringIndexContext, std.hash_map.d
 compile_errors: std.ArrayListUnmanaged(Zoir.CompileError),
 error_notes: std.ArrayListUnmanaged(Zoir.CompileError.Note),
 
-pub fn generate(gpa: Allocator, tree: Ast) Allocator.Error!Zoir {
+pub fn generate(gpa: Allocator, tree: Ast, parse_str_lits: bool) Allocator.Error!Zoir {
     assert(tree.mode == .zon);
 
     var zg: ZonGen = .{
         .gpa = gpa,
         .tree = tree,
+        .parse_str_lits = parse_str_lits,
         .nodes = .empty,
         .extra = .empty,
         .limbs = .empty,
@@ -429,46 +432,6 @@ fn expr(zg: *ZonGen, node: Ast.Node.Index, dest_node: Zoir.Node.Index) Allocator
     }
 }
 
-fn parseStrLit(zg: *ZonGen, token: Ast.TokenIndex, offset: u32) !u32 {
-    const raw_string = zg.tree.tokenSlice(token)[offset..];
-    const start = zg.string_bytes.items.len;
-    switch (try std.zig.string_literal.parseWrite(zg.string_bytes.writer(zg.gpa), raw_string)) {
-        .success => return @intCast(start),
-        .failure => |err| {
-            try zg.lowerStrLitError(err, token, raw_string, offset);
-            return error.BadString;
-        },
-    }
-}
-
-fn parseMultilineStrLit(zg: *ZonGen, node: Ast.Node.Index) !u32 {
-    const gpa = zg.gpa;
-    const tree = zg.tree;
-    const string_bytes = &zg.string_bytes;
-
-    const first_tok, const last_tok = bounds: {
-        const node_data = tree.nodes.items(.data)[node];
-        break :bounds .{ node_data.lhs, node_data.rhs };
-    };
-
-    const str_index: u32 = @intCast(string_bytes.items.len);
-
-    // First line: do not append a newline.
-    {
-        const line_bytes = tree.tokenSlice(first_tok)[2..];
-        try string_bytes.appendSlice(gpa, line_bytes);
-    }
-    // Following lines: each line prepends a newline.
-    for (first_tok + 1..last_tok + 1) |tok_idx| {
-        const line_bytes = tree.tokenSlice(@intCast(tok_idx))[2..];
-        try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
-        string_bytes.appendAssumeCapacity('\n');
-        string_bytes.appendSliceAssumeCapacity(line_bytes);
-    }
-
-    return @intCast(str_index);
-}
-
 fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
     const tree = zg.tree;
     assert(tree.tokens.items(.tag)[ident_token] == .identifier);
@@ -478,7 +441,18 @@ fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
         try zg.string_bytes.appendSlice(zg.gpa, ident_name);
         return @intCast(start);
     } else {
-        const start = try zg.parseStrLit(ident_token, 1);
+        const offset = 1;
+        const start: u32 = @intCast(zg.string_bytes.items.len);
+        const raw_string = zg.tree.tokenSlice(ident_token)[offset..];
+        try zg.string_bytes.ensureUnusedCapacity(zg.gpa, raw_string.len);
+        switch (try std.zig.string_literal.parseWrite(zg.string_bytes.writer(zg.gpa), raw_string)) {
+            .success => {},
+            .failure => |err| {
+                try zg.lowerStrLitError(err, ident_token, raw_string, offset);
+                return error.BadString;
+            },
+        }
+
         const slice = zg.string_bytes.items[start..];
         if (mem.indexOfScalar(u8, slice, 0) != null) {
             try zg.addErrorTok(ident_token, "identifier cannot contain null bytes", .{});
@@ -491,19 +465,93 @@ fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
     }
 }
 
+/// Returns a capacity hint, in bytes, for the parsed form of the string node `node` in `tree`, without parsing it.
+pub fn strLitSizeHint(tree: Ast, node: Ast.Node.Index) usize {
+    switch (tree.nodes.items(.tag)[node]) {
+        // Estimate only: parsed string literals are typically around the size of the raw token text.
+        .string_literal => {
+            const token = tree.nodes.items(.main_token)[node];
+            const raw_string = tree.tokenSlice(token);
+            return raw_string.len;
+        },
+        // Exact: mirrors what `parseStrLit` writes, i.e. each line minus its first two bytes, joined by '\n'.
+        .multiline_string_literal => {
+            const first_tok, const last_tok = bounds: {
+                const node_data = tree.nodes.items(.data)[node];
+                break :bounds .{ node_data.lhs, node_data.rhs };
+            };
+
+            var size = tree.tokenSlice(first_tok)[2..].len; // First line: no newline counted.
+            for (first_tok + 1..last_tok + 1) |tok_idx| {
+                size += 1; // Newline separating this line from the previous one.
+                size += tree.tokenSlice(@intCast(tok_idx))[2..].len;
+            }
+            return size;
+        },
+        else => unreachable, // `node` must be a string literal node.
+    }
+}
+
+/// Writes the parsed contents of the string node `node` to `writer`; returns the `parseWrite` result for `.string_literal` and always `.success` for `.multiline_string_literal`.
+pub fn parseStrLit(
+    tree: Ast,
+    node: Ast.Node.Index,
+    writer: anytype,
+) error{OutOfMemory}!std.zig.string_literal.Result {
+    switch (tree.nodes.items(.tag)[node]) {
+        .string_literal => {
+            const token = tree.nodes.items(.main_token)[node];
+            const raw_string = tree.tokenSlice(token);
+            return std.zig.string_literal.parseWrite(writer, raw_string);
+        },
+        .multiline_string_literal => {
+            const first_tok, const last_tok = bounds: {
+                const node_data = tree.nodes.items(.data)[node];
+                break :bounds .{ node_data.lhs, node_data.rhs };
+            };
+
+            // First line: written as-is (minus its first two bytes), with no leading newline.
+            {
+                const line_bytes = tree.tokenSlice(first_tok)[2..];
+                try writer.writeAll(line_bytes);
+            }
+
+            // Following lines: a newline is written before each line, so the output has no trailing newline.
+            for (first_tok + 1..last_tok + 1) |tok_idx| {
+                const line_bytes = tree.tokenSlice(@intCast(tok_idx))[2..];
+                try writer.writeByte('\n');
+                try writer.writeAll(line_bytes);
+            }
+
+            return .success;
+        },
+        // Any other tag is a caller bug: `node` must represent a string literal.
+        else => unreachable,
+    }
+}
+
 const StringLiteralResult = union(enum) { // Return type of `strLitAsString`.
     nts: Zoir.NullTerminatedString,
     slice: struct { start: u32, len: u32 }, // Used when the parsed bytes contain a 0 byte, or as an empty slice when `parse_str_lits` is false.
 };
 
 fn strLitAsString(zg: *ZonGen, str_node: Ast.Node.Index) !StringLiteralResult {
+    if (!zg.parse_str_lits) return .{ .slice = .{ .start = 0, .len = 0 } };
+
     const gpa = zg.gpa;
     const string_bytes = &zg.string_bytes;
-    const str_index = switch (zg.tree.nodes.items(.tag)[str_node]) {
-        .string_literal => try zg.parseStrLit(zg.tree.nodes.items(.main_token)[str_node], 0),
-        .multiline_string_literal => try zg.parseMultilineStrLit(str_node),
-        else => unreachable,
-    };
+    const str_index: u32 = @intCast(zg.string_bytes.items.len);
+    const size_hint = strLitSizeHint(zg.tree, str_node);
+    try string_bytes.ensureUnusedCapacity(zg.gpa, size_hint);
+    switch (try parseStrLit(zg.tree, str_node, zg.string_bytes.writer(zg.gpa))) {
+        .success => {},
+        .failure => |err| {
+            const token = zg.tree.nodes.items(.main_token)[str_node];
+            const raw_string = zg.tree.tokenSlice(token);
+            try zg.lowerStrLitError(err, token, raw_string, 0);
+            return error.BadString;
+        },
+    }
     const key: []const u8 = string_bytes.items[str_index..];
     if (std.mem.indexOfScalar(u8, key, 0) != null) return .{ .slice = .{
         .start = str_index,

diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig
@@ -43,7 +43,7 @@ pub const Error = union(enum) {
     pub fn lower(
         err: Error,
         raw_string: []const u8,
-        offset: u32,
+        index_offset: u32,
         comptime func: anytype,
         first_args: anytype,
     ) @typeInfo(@TypeOf(func)).@"fn".return_type.? {
@@ -61,18 +61,34 @@ pub const Error = union(enum) {
                     .invalid_unicode_codepoint => .{ "unicode escape does not correspond to a valid unicode scalar value", .{} },
                     .expected_lbrace => .{ "expected '{{', found '{c}'", .{raw_string[bad_index]} },
                     .expected_rbrace => .{ "expected '}}', found '{c}'", .{raw_string[bad_index]} },
-                    .expected_single_quote => .{ "expected singel quote ('), found '{c}'", .{raw_string[bad_index]} },
+                    .expected_single_quote => .{ "expected single quote ('), found '{c}'", .{raw_string[bad_index]} },
                     .invalid_character => .{ "invalid byte in string or character literal: '{c}'", .{raw_string[bad_index]} },
                     .empty_char_literal => .{ "empty character literal", .{} },
                 };
                 return @call(.auto, func, first_args ++ .{
-                    offset + bad_index,
+                    index_offset + bad_index,
                     fmt_str,
                     args,
                 });
             },
         }
     }
+
+    pub fn offset(err: Error) usize { // Returns the payload of `err`, or 0 for `.empty_char_literal`.
+        return switch (err) {
+            inline .invalid_escape_character,
+            .expected_hex_digit,
+            .empty_unicode_escape_sequence,
+            .expected_hex_digit_or_rbrace,
+            .invalid_unicode_codepoint,
+            .expected_lbrace,
+            .expected_rbrace,
+            .expected_single_quote,
+            .invalid_character,
+            => |n| n, // Every tag listed above yields its captured payload unchanged.
+            .empty_char_literal => 0, // Handled separately from the captured tags; always reports 0.
+        };
+    }
 };
 
 /// Asserts the slice starts and ends with single-quotes.