Skip to content

Commit

Permalink
Resolves unnecessary copying of strings between ZonGen & parse
Browse files Browse the repository at this point in the history
Will give API more thought in followup
  • Loading branch information
MasonRemaley committed Jan 7, 2025
1 parent 1980ecb commit 59f12f4
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 93 deletions.
142 changes: 95 additions & 47 deletions lib/std/zig/ZonGen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
gpa: Allocator,
tree: Ast,

parse_str_lits: bool,

nodes: std.MultiArrayList(Zoir.Node.Repr),
extra: std.ArrayListUnmanaged(u32),
limbs: std.ArrayListUnmanaged(std.math.big.Limb),
Expand All @@ -12,12 +14,13 @@ string_table: std.HashMapUnmanaged(u32, void, StringIndexContext, std.hash_map.d
compile_errors: std.ArrayListUnmanaged(Zoir.CompileError),
error_notes: std.ArrayListUnmanaged(Zoir.CompileError.Note),

pub fn generate(gpa: Allocator, tree: Ast) Allocator.Error!Zoir {
pub fn generate(gpa: Allocator, tree: Ast, parse_str_lits: bool) Allocator.Error!Zoir {
assert(tree.mode == .zon);

var zg: ZonGen = .{
.gpa = gpa,
.tree = tree,
.parse_str_lits = parse_str_lits,
.nodes = .empty,
.extra = .empty,
.limbs = .empty,
Expand Down Expand Up @@ -429,46 +432,6 @@ fn expr(zg: *ZonGen, node: Ast.Node.Index, dest_node: Zoir.Node.Index) Allocator
}
}

/// Parses the string literal held in `token`, skipping the first `offset` bytes of the
/// token text, and appends the parsed bytes to `zg.string_bytes`.
///
/// Returns the index into `zg.string_bytes` at which the parsed bytes begin.
/// If the literal is malformed, the failure is forwarded to `lowerStrLitError` and
/// `error.BadString` is returned; errors from the writer and from `lowerStrLitError`
/// are propagated as-is.
fn parseStrLit(zg: *ZonGen, token: Ast.TokenIndex, offset: u32) !u32 {
const raw_string = zg.tree.tokenSlice(token)[offset..];
// Remember where the new bytes will start before anything is written.
const start = zg.string_bytes.items.len;
switch (try std.zig.string_literal.parseWrite(zg.string_bytes.writer(zg.gpa), raw_string)) {
.success => return @intCast(start),
.failure => |err| {
try zg.lowerStrLitError(err, token, raw_string, offset);
return error.BadString;
},
}
}

/// Appends the contents of the multiline string literal `node` to `zg.string_bytes`.
///
/// The node's `lhs` and `rhs` data fields are used as the first and last line tokens.
/// The first two bytes of every line token are skipped, and consecutive lines are
/// joined with a single '\n'.
///
/// Returns the index into `zg.string_bytes` at which the appended bytes begin.
fn parseMultilineStrLit(zg: *ZonGen, node: Ast.Node.Index) !u32 {
const gpa = zg.gpa;
const tree = zg.tree;
const string_bytes = &zg.string_bytes;

const first_tok, const last_tok = bounds: {
const node_data = tree.nodes.items(.data)[node];
break :bounds .{ node_data.lhs, node_data.rhs };
};

// Start index of the result, captured before any bytes are appended.
const str_index: u32 = @intCast(string_bytes.items.len);

// First line: do not append a newline.
{
const line_bytes = tree.tokenSlice(first_tok)[2..];
try string_bytes.appendSlice(gpa, line_bytes);
}
// Following lines: each line prepends a newline.
for (first_tok + 1..last_tok + 1) |tok_idx| {
const line_bytes = tree.tokenSlice(@intCast(tok_idx))[2..];
// Reserve room for the newline and the line together so that both appends
// below cannot fail.
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
string_bytes.appendAssumeCapacity('\n');
string_bytes.appendSliceAssumeCapacity(line_bytes);
}

return @intCast(str_index);
}

fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
const tree = zg.tree;
assert(tree.tokens.items(.tag)[ident_token] == .identifier);
Expand All @@ -478,7 +441,18 @@ fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
try zg.string_bytes.appendSlice(zg.gpa, ident_name);
return @intCast(start);
} else {
const start = try zg.parseStrLit(ident_token, 1);
const offset = 1;
const start: u32 = @intCast(zg.string_bytes.items.len);
const raw_string = zg.tree.tokenSlice(ident_token)[offset..];
try zg.string_bytes.ensureUnusedCapacity(zg.gpa, raw_string.len);
switch (try std.zig.string_literal.parseWrite(zg.string_bytes.writer(zg.gpa), raw_string)) {
.success => {},
.failure => |err| {
try zg.lowerStrLitError(err, ident_token, raw_string, offset);
return error.BadString;
},
}

const slice = zg.string_bytes.items[start..];
if (mem.indexOfScalar(u8, slice, 0) != null) {
try zg.addErrorTok(ident_token, "identifier cannot contain null bytes", .{});
Expand All @@ -491,19 +465,93 @@ fn appendIdentStr(zg: *ZonGen, ident_token: Ast.TokenIndex) !u32 {
}
}

/// Returns a size hint for the string at `node` without parsing it.
///
/// * `.string_literal`: the length of the raw token text, which serves as an
///   approximation of the parsed length.
/// * `.multiline_string_literal`: the exact length of the resulting string.
///
/// `node` must carry one of these two tags; any other tag is `unreachable`.
pub fn strLitSizeHint(tree: Ast, node: Ast.Node.Index) usize {
    return switch (tree.nodes.items(.tag)[node]) {
        // The raw text is used as-is: parsed strings are typically about this long.
        .string_literal => tree.tokenSlice(tree.nodes.items(.main_token)[node]).len,
        .multiline_string_literal => total: {
            const data = tree.nodes.items(.data)[node];
            const head_tok = data.lhs;
            const tail_tok = data.rhs;

            // The first two bytes of each line token are not counted.
            var byte_count = tree.tokenSlice(head_tok)[2..].len;
            for (head_tok + 1..tail_tok + 1) |line_tok| {
                // One byte for the joining newline, then the line itself.
                byte_count += 1;
                byte_count += tree.tokenSlice(@intCast(line_tok))[2..].len;
            }
            break :total byte_count;
        },
        else => unreachable,
    };
}

/// Writes the string value of `node` to `writer`.
///
/// For a `.string_literal` node the main token's text is handed to
/// `std.zig.string_literal.parseWrite` and its result is returned unchanged.
/// For a `.multiline_string_literal` node the line tokens from `lhs` through `rhs`
/// are written with their first two bytes skipped and a '\n' between consecutive
/// lines; the result is always `.success`.
///
/// `node` must carry one of these two tags; any other tag is `unreachable`.
pub fn parseStrLit(
    tree: Ast,
    node: Ast.Node.Index,
    writer: anytype,
) error{OutOfMemory}!std.zig.string_literal.Result {
    switch (tree.nodes.items(.tag)[node]) {
        .string_literal => {
            const main_token = tree.nodes.items(.main_token)[node];
            return std.zig.string_literal.parseWrite(writer, tree.tokenSlice(main_token));
        },
        .multiline_string_literal => {
            const data = tree.nodes.items(.data)[node];
            const head_tok = data.lhs;
            const tail_tok = data.rhs;

            // Opening line goes out without a leading newline.
            try writer.writeAll(tree.tokenSlice(head_tok)[2..]);

            // Every later line is preceded by the newline that separates it from
            // the previous one.
            for (head_tok + 1..tail_tok + 1) |line_tok| {
                try writer.writeByte('\n');
                try writer.writeAll(tree.tokenSlice(@intCast(line_tok))[2..]);
            }

            return .success;
        },
        // Only string nodes may be passed in.
        else => unreachable,
    }
}

/// Result of lowering a string literal node in `strLitAsString`.
const StringLiteralResult = union(enum) {
// NOTE(review): presumably used when the string has no embedded zero byte — confirm
// against the part of `strLitAsString` not shown here.
nts: Zoir.NullTerminatedString,
// A `start`/`len` pair; `strLitAsString` returns this form when the string contains a
// zero byte, and as an empty `{ 0, 0 }` value when `parse_str_lits` is false.
slice: struct { start: u32, len: u32 },
};

fn strLitAsString(zg: *ZonGen, str_node: Ast.Node.Index) !StringLiteralResult {
if (!zg.parse_str_lits) return .{ .slice = .{ .start = 0, .len = 0 } };

const gpa = zg.gpa;
const string_bytes = &zg.string_bytes;
const str_index = switch (zg.tree.nodes.items(.tag)[str_node]) {
.string_literal => try zg.parseStrLit(zg.tree.nodes.items(.main_token)[str_node], 0),
.multiline_string_literal => try zg.parseMultilineStrLit(str_node),
else => unreachable,
};
const str_index: u32 = @intCast(zg.string_bytes.items.len);
const size_hint = strLitSizeHint(zg.tree, str_node);
try string_bytes.ensureUnusedCapacity(zg.gpa, size_hint);
switch (try parseStrLit(zg.tree, str_node, zg.string_bytes.writer(zg.gpa))) {
.success => {},
.failure => |err| {
const token = zg.tree.nodes.items(.main_token)[str_node];
const raw_string = zg.tree.tokenSlice(token);
try zg.lowerStrLitError(err, token, raw_string, 0);
return error.BadString;
},
}
const key: []const u8 = string_bytes.items[str_index..];
if (std.mem.indexOfScalar(u8, key, 0) != null) return .{ .slice = .{
.start = str_index,
Expand Down
22 changes: 19 additions & 3 deletions lib/std/zig/string_literal.zig
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub const Error = union(enum) {
pub fn lower(
err: Error,
raw_string: []const u8,
offset: u32,
index_offset: u32,
comptime func: anytype,
first_args: anytype,
) @typeInfo(@TypeOf(func)).@"fn".return_type.? {
Expand All @@ -61,18 +61,34 @@ pub const Error = union(enum) {
.invalid_unicode_codepoint => .{ "unicode escape does not correspond to a valid unicode scalar value", .{} },
.expected_lbrace => .{ "expected '{{', found '{c}'", .{raw_string[bad_index]} },
.expected_rbrace => .{ "expected '}}', found '{c}'", .{raw_string[bad_index]} },
.expected_single_quote => .{ "expected singel quote ('), found '{c}'", .{raw_string[bad_index]} },
.expected_single_quote => .{ "expected single quote ('), found '{c}'", .{raw_string[bad_index]} },
.invalid_character => .{ "invalid byte in string or character literal: '{c}'", .{raw_string[bad_index]} },
.empty_char_literal => .{ "empty character literal", .{} },
};
return @call(.auto, func, first_args ++ .{
offset + bad_index,
index_offset + bad_index,
fmt_str,
args,
});
},
}
}

/// Returns the value carried by the active variant of `err`.
/// `.empty_char_literal` yields 0.
pub fn offset(err: Error) usize {
    switch (err) {
        .empty_char_literal => return 0,
        // All remaining variants share the same capture, so one inline prong covers them.
        inline .invalid_escape_character,
        .expected_hex_digit,
        .empty_unicode_escape_sequence,
        .expected_hex_digit_or_rbrace,
        .invalid_unicode_codepoint,
        .expected_lbrace,
        .expected_rbrace,
        .expected_single_quote,
        .invalid_character,
        => |index| return index,
    }
}
};

/// Asserts the slice starts and ends with single-quotes.
Expand Down
Loading

0 comments on commit 59f12f4

Please sign in to comment.