diff --git a/build.bat b/build.bat index f636ea9cd..d4e797971 100644 --- a/build.bat +++ b/build.bat @@ -46,6 +46,7 @@ if "%spall%"=="1" set auto_compile_flags=%auto_compile_flags% if "%asan%"=="1" set auto_compile_flags=%auto_compile_flags% -fsanitize=address && echo [asan enabled] if "%ubsan%"=="1" set auto_compile_flags=%auto_compile_flags% -fsanitize=undefined && echo [ubsan enabled] if "%opengl%"=="1" set auto_compile_flags=%auto_compile_flags% -DR_BACKEND=R_BACKEND_OPENGL && echo [opengl render backend] +if "%worklist_selfcheck%"=="1" set auto_compile_flags=%auto_compile_flags% -DICF_WORKLIST_SELFCHECK=1 && echo [icf worklist selfcheck enabled] if "%dwarf%"=="1" if "%clang%"=="1" set auto_compile_flags=%auto_compile_flags% -gdwarf && echo [dwarf debug info] if "%dwarf%"=="" if "%clang%"=="1" set auto_compile_flags=%auto_compile_flags% -gcodeview if "%pgo%"=="1" ( @@ -142,7 +143,14 @@ pushd build if "%raddbg%"=="1" set didbuild=1 && %compile% ..\src\raddbg\raddbg_main.c %compile_link% %link_icon% %out%raddbg.exe || exit /b 1 if "%raddbg_non_graphical%"=="1" set didbuild=1 && %compile% -DWM_STUB=1 -DR_BACKEND=R_BACKEND_STUB ..\src\raddbg\raddbg_main.c %compile_link% %link_icon% %out%raddbg_non_graphical.exe || exit /b 1 if "%com_shim%"=="1" set didbuild=1 && %compile% ..\src\com_shim\com_shim_main.c %compile_link% %out%com_shim.exe || exit /b 1 -if "%radlink%"=="1" set didbuild=1 && %compile% ..\src\linker\lnk.c %compile_link% %linker% /NOIMPLIB %linker% /NATVIS:"%~dp0\src\linker\linker.natvis" %out%radlink.exe || exit /b 1 +:: NOTE: -DBLAKE3_ATOMICS=1 makes BLAKE3 use C11 _Atomic (plain atomic load) for +:: get_cpu_features instead of MSVC's _InterlockedOr `lock or` barrier on every +:: compress dispatch (was a ~5.5s main-thread hot spot). MSVC C11 atomics require +:: /std:c11 /experimental:c11atomics. Kept external so the vendored blake3 source +:: stays pristine. 
+set radlink_msvc_flags= +if "%msvc%"=="1" set radlink_msvc_flags=/std:c11 /experimental:c11atomics -DBLAKE3_ATOMICS=1 +if "%radlink%"=="1" set didbuild=1 && %compile% %radlink_msvc_flags% ..\src\linker\lnk.c %compile_link% %linker% /NOIMPLIB %linker% /NATVIS:"%~dp0\src\linker\linker.natvis" %out%radlink.exe || exit /b 1 if "%radbin%"=="1" set didbuild=1 && %compile% ..\src\radbin\radbin_main.c %compile_link% %out%radbin.exe || exit /b 1 if "%raddump%"=="1" set didbuild=1 && %compile% ..\src\raddump\raddump_main.c %compile_link% %out%raddump.exe || exit /b 1 if "%ryan_scratch%"=="1" set didbuild=1 && %compile% ..\src\scratch\ryan_scratch.c %compile_link% %out%ryan_scratch.exe || exit /b 1 diff --git a/src/base/base_arena.c b/src/base/base_arena.c index f765f7005..bc7f34a8e 100644 --- a/src/base/base_arena.c +++ b/src/base/base_arena.c @@ -346,6 +346,54 @@ arena_pop_to(Arena *arena, U64 pos) } +//- rjf: arena decommit of unused (rewound/free) pages + +internal void +arena_decommit_unused(Arena *arena) +{ + // NOTE(perf): decommit committed-but-unused pages so they stop counting against + // working set, while keeping the reservation. Only touches pages strictly above + // the live `pos` high-water of each block in the active chain, and the unused + // bodies of free-list blocks. Live data (<= pos) is never touched. The push path + // re-commits on demand (arena_push grows `cmt`), so reuse is transparent. + if(arena->flags & ArenaFlag_LargePages) + { + // large pages cannot be partially decommitted safely; skip. 
+ return; + } + U64 page_size = get_system_info()->page_size; + + // rjf: active chain -- decommit committed region above each block's live pos + for(Arena *n = arena->current; n != 0; n = n->prev) + { + U64 pos_aligned = AlignPow2(n->pos, page_size); + if(pos_aligned < n->cmt) + { + U8 *decommit_ptr = (U8 *)n + pos_aligned; + U64 decommit_size = n->cmt - pos_aligned; + AsanPoisonMemoryRegion(decommit_ptr, decommit_size); + decommit_memory(decommit_ptr, decommit_size); + n->cmt = pos_aligned; + } + } + +#if ARENA_FREE_LIST + // rjf: free chain -- decommit everything above the first (header) page + for(Arena *n = arena->free_last; n != 0; n = n->prev) + { + U64 keep = AlignPow2(ARENA_HEADER_SIZE, page_size); + if(keep < n->cmt) + { + U8 *decommit_ptr = (U8 *)n + keep; + U64 decommit_size = n->cmt - keep; + AsanPoisonMemoryRegion(decommit_ptr, decommit_size); + decommit_memory(decommit_ptr, decommit_size); + n->cmt = keep; + } + } +#endif +} + //- rjf: arena push/pop helpers internal void diff --git a/src/base/base_arena.h b/src/base/base_arena.h index b3ba37827..a61f5bd3f 100644 --- a/src/base/base_arena.h +++ b/src/base/base_arena.h @@ -76,6 +76,9 @@ internal void *arena_push(Arena *arena, U64 size, U64 align, B32 zero); internal U64 arena_pos(Arena *arena); internal void arena_pop_to(Arena *arena, U64 pos); +//- rjf: arena decommit of unused (rewound/free) pages +internal void arena_decommit_unused(Arena *arena); + //- rjf: arena push/pop helpers internal void arena_clear(Arena *arena); internal void arena_pop(Arena *arena, U64 amt); diff --git a/src/base/base_thread_context.c b/src/base/base_thread_context.c index 421636101..d42be68ed 100644 --- a/src/base/base_thread_context.c +++ b/src/base/base_thread_context.c @@ -82,6 +82,21 @@ tctx_get_scratch(Arena **conflicts, U64 count) return result; } +//- rjf: scratch decommit (release committed-but-unused scratch pages back to OS) + +internal void +tctx_scratch_decommit(void) +{ + TCTX *tctx = tctx_selected(); + 
for(U64 i = 0; i < ArrayCount(tctx->arenas); i += 1) + { + if(tctx->arenas[i] != 0) + { + arena_decommit_unused(tctx->arenas[i]); + } + } +} + //- rjf: lane metadata internal LaneCtx diff --git a/src/base/base_thread_context.h b/src/base/base_thread_context.h index 494d1804f..e52b36476 100644 --- a/src/base/base_thread_context.h +++ b/src/base/base_thread_context.h @@ -90,6 +90,7 @@ internal TCTX *tctx_selected(void); //- rjf: scratch arenas internal Arena *tctx_get_scratch(Arena **conflicts, U64 count); +internal void tctx_scratch_decommit(void); #define scratch_begin(conflicts, count) temp_begin(tctx_get_scratch((conflicts), (count))) #define scratch_end(scratch) temp_end(scratch) diff --git a/src/coff/coff_parse.c b/src/coff/coff_parse.c index 1db0b5826..85835b977 100644 --- a/src/coff/coff_parse.c +++ b/src/coff/coff_parse.c @@ -166,11 +166,16 @@ coff_section_header_array_from_name(Arena *arena, String8 string_table, COFF_Sec } +// NOTE: name-skipping variants. coff_read_symbol_name does a cstr scan over the +// memory-mapped string table, which is the dominant cost when parsing symbols in +// bulk; callers that only need the scalar fields (value/section/storage_class/aux, +// e.g. symbol-value interpretation) should use these to avoid that scan. The full +// coff_parse_symbol{16,32} below are these plus the name read, so the scalar-field +// logic lives in exactly one place. 
internal COFF_ParsedSymbol -coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32) +coff_parse_symbol32_no_name(COFF_Symbol32 *sym32) { COFF_ParsedSymbol result = {0}; - result.name = coff_read_symbol_name(string_table, &sym32->name); result.value = sym32->value; result.section_number = sym32->section_number; result.type = sym32->type; @@ -181,10 +186,9 @@ coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32) } internal COFF_ParsedSymbol -coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16) +coff_parse_symbol16_no_name(COFF_Symbol16 *sym16) { COFF_ParsedSymbol result = {0}; - result.name = coff_read_symbol_name(string_table, &sym16->name); result.value = sym16->value; if (sym16->section_number == COFF_Symbol_DebugSection16) { result.section_number = COFF_Symbol_DebugSection32; @@ -200,6 +204,22 @@ coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16) return result; } +internal COFF_ParsedSymbol +coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32) +{ + COFF_ParsedSymbol result = coff_parse_symbol32_no_name(sym32); + result.name = coff_read_symbol_name(string_table, &sym32->name); + return result; +} + +internal COFF_ParsedSymbol +coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16) +{ + COFF_ParsedSymbol result = coff_parse_symbol16_no_name(sym16); + result.name = coff_read_symbol_name(string_table, &sym16->name); + return result; +} + internal COFF_ParsedSymbol coff_parse_symbol(COFF_FileHeaderInfo header, String8 string_table, String8 symbol_table, U32 symbol_idx) { diff --git a/src/coff/coff_parse.h b/src/coff/coff_parse.h index edcb29520..028e292f5 100644 --- a/src/coff/coff_parse.h +++ b/src/coff/coff_parse.h @@ -261,6 +261,8 @@ internal String8 coff_name_from_section_header (String8 str internal COFF_ParsedSymbol coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32); internal COFF_ParsedSymbol coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16); +internal COFF_ParsedSymbol 
coff_parse_symbol32_no_name(COFF_Symbol32 *sym32); +internal COFF_ParsedSymbol coff_parse_symbol16_no_name(COFF_Symbol16 *sym16); internal COFF_ParsedSymbol coff_parse_symbol (COFF_FileHeaderInfo header, String8 string_table, String8 symbol_table, U32 symbol_idx); internal COFF_Symbol32Array coff_symbol_array_from_data_16(Arena *arena, String8 data, U64 symbol_array_off, U64 symbol_count);
diff --git a/src/linker/codeview_ext/ifc.c b/src/linker/codeview_ext/ifc.c
new file mode 100644
index 000000000..e5693740f
--- /dev/null
+++ b/src/linker/codeview_ext/ifc.c
@@ -0,0 +1,120 @@
+// Copyright (c) Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+read_only global U8 g_ifc_signature[4] = { 0x54, 0x51, 0x45, 0x1A };
+read_only global U8 g_uba_signature[4] = { 0x55, 0x42, 0x41, 0x01 }; // "UBA\x01"
+
+// Loads the file at `path` into `arena` and locates its `.msvc.trait.debug-records`
+// partition. `ifc.path` and `ifc.data` are always filled in; `ifc.is_valid` is set to 1
+// only when the signature, version (0.44) and architecture (2) checks pass and a
+// non-empty debug-records partition lies fully inside the file. On any failure a
+// message is written to `*error_out` (must be non-null) and `is_valid` stays 0.
+internal IFC_File
+ifc_file_read(Arena *arena, String8 path, String8 *error_out)
+{
+  IFC_File ifc = {0};
+  ifc.path = push_str8_copy(arena, path);
+
+  String8 data = lnk_read_data_from_file_path(arena, 0, path);
+  ifc.data = data;
+  if (data.size < 4) {
+    *error_out = push_str8f(arena, "IFC '%S' is too small (%llu bytes)", path, data.size);
+    return ifc;
+  }
+
+  // detect UBA-compressed input (out of scope)
+  if (MemoryMatch(data.str, g_uba_signature, sizeof(g_uba_signature))) {
+    *error_out = push_str8f(arena, "IFC '%S' is UBA-compressed (magic 'UBA\\x01'); materialize a raw .ifc (UBA decompress unsupported)", path);
+    return ifc;
+  }
+
+  // validate signature
+  if ( ! MemoryMatch(data.str, g_ifc_signature, sizeof(g_ifc_signature))) {
+    *error_out = push_str8f(arena, "IFC '%S' has bad signature (expected 54 51 45 1A)", path);
+    return ifc;
+  }
+
+  // --- parse header ---
+  U64 off = 4;
+  if (off + 32 > data.size) { goto truncated; }
+  MemoryCopy(ifc.content_hash, data.str + off, 32);
+  off += 32;
+
+  if (off + 4 > data.size) { goto truncated; }
+  U8 major = data.str[off+0];
+  U8 minor = data.str[off+1];
+  U8 abi   = data.str[off+2]; (void)abi;
+  U8 arch  = data.str[off+3];
+  off += 4;
+
+  // assert version 0.44 + x64; error otherwise (encoding proven only for these)
+  if ( ! (major == 0 && minor == 44)) {
+    *error_out = push_str8f(arena, "IFC '%S' unsupported version %u.%u (expected 0.44)", path, major, minor);
+    return ifc;
+  }
+  if (arch != 2) {
+    *error_out = push_str8f(arena, "IFC '%S' unsupported architecture %u (expected 2 == x64)", path, arch);
+    return ifc;
+  }
+
+  // the eight U32 header fields below must lie fully inside the file; check before reading
+  // so a short file is rejected here instead of leaving some fields unread
+  if (off + 8*sizeof(U32) > data.size) { goto truncated; }
+  U32 cplusplus = 0;          off += str8_deserial_read_struct(data, off, &cplusplus); (void)cplusplus;
+  U32 string_table_bytes = 0; off += str8_deserial_read_struct(data, off, &string_table_bytes);
+  U32 string_table_size = 0;  off += str8_deserial_read_struct(data, off, &string_table_size);
+  U32 unit = 0;               off += str8_deserial_read_struct(data, off, &unit); (void)unit;
+  U32 src_path = 0;           off += str8_deserial_read_struct(data, off, &src_path); (void)src_path;
+  U32 global_scope = 0;       off += str8_deserial_read_struct(data, off, &global_scope); (void)global_scope;
+  U32 toc = 0;                off += str8_deserial_read_struct(data, off, &toc);
+  U32 partition_count = 0;    off += str8_deserial_read_struct(data, off, &partition_count);
+
+  // string table
+  if ((U64)string_table_bytes + string_table_size > data.size) {
+    *error_out = push_str8f(arena, "IFC '%S' string table out of bounds", path);
+    return ifc;
+  }
+  String8 string_table = str8(data.str + string_table_bytes, string_table_size);
+
+  // --- partition summary table ---
+  String8 needle = str8_lit(".msvc.trait.debug-records");
+  U64 po = toc;
+  for (U32 i = 0; i < partition_count; ++i, po += 16) {
+    if (po + 16 > data.size) { goto truncated; }
+    U32 name_off = 0, p_off = 0, count = 0, entity_size = 0;
+    str8_deserial_read_struct(data, po + 0, &name_off);
+    str8_deserial_read_struct(data, po + 4, &p_off);
+    str8_deserial_read_struct(data, po + 8, &count);
+    str8_deserial_read_struct(data, po + 12, &entity_size);
+
+    if (name_off >= string_table.size) { continue; }
+    // partition names come from file data: look for the terminator inside the string table
+    // only, so an unterminated name cannot be scanned past the end of the table
+    U64 name_max = string_table.size - name_off;
+    U64 name_size = 0;
+    while (name_size < name_max && string_table.str[name_off + name_size] != 0) { name_size += 1; }
+    String8 name = str8(string_table.str + name_off, name_size);
+    if (str8_match(name, needle, 0)) {
+      // partition byte size is count * entity_size (equal to count when entity_size == 1)
+      U64 records_size = (U64)count * (U64)entity_size;
+      if ((U64)p_off + records_size > data.size) {
+        *error_out = push_str8f(arena, "IFC '%S' debug-records partition out of bounds", path);
+        return ifc;
+      }
+      ifc.debug_records = str8(data.str + p_off, records_size);
+      break;
+    }
+  }
+
+  if (ifc.debug_records.size == 0) {
+    *error_out = push_str8f(arena, "IFC '%S' has no '.msvc.trait.debug-records' partition", path);
+    return ifc;
+  }
+
+  ifc.is_valid = 1;
+  return ifc;
+
+truncated:
+  *error_out = push_str8f(arena, "IFC '%S' is truncated", path);
+  return ifc;
+}
diff --git a/src/linker/codeview_ext/ifc.h b/src/linker/codeview_ext/ifc.h new file mode 100644 index 000000000..fb3169b75 --- /dev/null +++ b/src/linker/codeview_ext/ifc.h @@ -0,0 +1,43 @@ +// Copyright (c) Epic Games Tools +// Licensed under the MIT license (https://opensource.org/license/mit/) + +#pragma once + +//////////////////////////////// +// MSVC IFC (header-unit module interface) reader +// +// radlink consumes the `.msvc.trait.debug-records` partition embedded in an +// MSVC `.ifc` (C++20 module / header-unit interface) file. That partition is a +// raw CodeView type-leaf stream (entity_size == 1, NO u32 signature, first leaf +// at offset 0, TI base 0x1000). A consuming `.obj` references this stream via +// LF_IFC_RECORD (0x1522) leaves -- see lnk_debug_info.c. 
+//
+// File layout (microsoft/ifc-spec, FileHeader):
+//   u8[4]  signature = { 0x54,0x51,0x45,0x1A } ("TQE\x1a")
+//   u8[32] content_hash (sha256) -- record.GUID(16)++record.hash(16) == first 32 bytes here
+//   u8     major, minor -- assert 0.44
+//   u8     abi
+//   u8     arch -- 2 == x64
+//   u32    cplusplus
+//   u32    string_table_bytes (off), u32 string_table_size
+//   u32    unit
+//   u32    src_path (textoffset)
+//   u32    global_scope
+//   u32    toc -- offset to partition summary table
+//   u32    partition_count
+//   u8     internal_partition
+// Partition summary entry (16 bytes): { u32 name(textoffset); u32 offset; u32 count; u32 entity_size }
+
+typedef struct IFC_File
+{
+  String8 data;             // whole .ifc bytes (owning view into arena)
+  String8 path;             // .ifc path (copied)
+  U8      content_hash[32]; // the 32 header bytes that follow the 4-byte signature, copied verbatim
+  String8 debug_records;    // .msvc.trait.debug-records blob {ptr,size}; size 0 if absent
+  B32     is_valid;         // 1 only when every header check passed and debug_records was located
+} IFC_File;
+
+// Reads `path`, validates magic/version/arch, locates `.msvc.trait.debug-records`.
+// On error fills *error_out and returns is_valid=0. Detects UBA-compressed inputs
+// (magic "UBA\x01") and reports them (decompression is out of scope).
+internal IFC_File ifc_file_read(Arena *arena, String8 path, String8 *error_out); diff --git a/src/linker/lnk.c b/src/linker/lnk.c index 2850b08b9..c37033e23 100644 --- a/src/linker/lnk.c +++ b/src/linker/lnk.c @@ -78,6 +78,8 @@ #include "pdb_ext/msf_builder.c" #include "pdb_ext/pdb.c" #include "pdb_ext/pdb_helpers.c" +// fwd decl: parallel radix sort (defined later in this TU) so pdb_builder.c can use it +internal void lnk_radix_sort_u64_pairs(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *vals); #include "pdb_ext/pdb_builder.c" // --- RDI --------------------------------------------------------------------- @@ -110,6 +112,7 @@ #include "lnk_debug_helper.h" #include "lnk_obj.h" #include "lnk_lib.h" +#include "codeview_ext/ifc.h" #include "lnk_debug_info.h" #include "lnk.h" @@ -123,6 +126,7 @@ #include "lnk_obj.c" #include "lnk_debug_helper.c" #include "lnk_lib.c" +#include "codeview_ext/ifc.c" #include "lnk_debug_info.c" // ----------------------------------------------------------------------------- @@ -1641,19 +1645,35 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task) LNK_LibMemberInfo *lib_member_infos = task->lib_member_infos; LNK_LibMemberRefList *member_ref_list = &task->member_ref_lists[task_id]; - for EachNode(c, LNK_SymbolHashTrieChunk, symtab->search_chunks[task_id].first) { - for EachIndex(i, c->count) { + // FRONTIER cursor: resume from where this worker last left off for this lib. search_chunks is + // append-only across the lib-search loop, so slots before the cursor were already searched and + // (search being a pure fn of (lib, symbol->name), member-queue dedup idempotent) can never resolve + // a new member. reset_cursor (anti-dep mode flip) forces a full rescan from list first. + LNK_SymbolHashTrieChunk *start_chunk = task->reset_cursor ? 0 : lib->search_cursor_chunk[task_id]; + U64 start_idx = task->reset_cursor ? 0 : lib->search_cursor_idx[task_id]; + + LNK_SymbolHashTrieChunk *first_chunk = start_chunk ? 
start_chunk : symtab->search_chunks[task_id].first; + LNK_SymbolHashTrieChunk *end_chunk = symtab->search_chunks[task_id].last; + U64 end_count = end_chunk ? end_chunk->count : 0; + + for (LNK_SymbolHashTrieChunk *c = first_chunk; c != 0; c = c->next) { + U64 i_begin = (c == start_chunk) ? start_idx : 0; + U64 i_end = c->count; + for (U64 i = i_begin; i < i_end; i += 1) { LNK_Symbol *symbol = c->v[i].symbol; - LNK_ObjSymbolRef symbol_ref = lnk_ref_from_symbol(symbol); - COFF_ParsedSymbol symbol_parsed = lnk_parsed_from_symbol(symbol); - COFF_SymbolValueInterpType symbol_interp = coff_interp_from_parsed_symbol(symbol_parsed); + // interp is cached on the symbol at push time, so the common case (resolved symbols, which + // stay in search_chunks but can never match a lib) costs one field read instead of re-parsing + // -- and page-faulting -- the COFF symbol record out of the mmap'd obj on every lib pass. + COFF_SymbolValueInterpType symbol_interp = symbol->interp; if (symbol_interp == COFF_SymbolValueInterp_Undefined) { U32 member_idx; if (lnk_search_lib(lib, symbol->name, &member_idx)) { lnk_queue_lib_member(arena, task->imports_hm, task->link->lib_member_infos_hm, member_ref_list, symbol, lib, lib_member_infos, member_idx); } } else if (symbol_interp == COFF_SymbolValueInterp_Weak) { + LNK_ObjSymbolRef symbol_ref = lnk_ref_from_symbol(symbol); + COFF_ParsedSymbol symbol_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(symbol_ref.obj, symbol_ref.symbol_idx); COFF_SymbolWeakExt *weak_ext = coff_parse_weak_tag(symbol_parsed, symbol_ref.obj->header.is_big_obj); if (weak_ext->characteristics == COFF_WeakExt_SearchLibrary) { U32 member_idx; @@ -1664,11 +1684,11 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task) if (search_anti_deps) { LNK_ObjSymbolRef dep_symbol = {0}; if (lnk_resolve_weak_symbol(symtab, symbol_ref, &dep_symbol)) { - COFF_ParsedSymbol dep_parsed = lnk_parsed_symbol_from_coff_symbol_idx(dep_symbol.obj, dep_symbol.symbol_idx); + COFF_ParsedSymbol 
dep_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dep_symbol.obj, dep_symbol.symbol_idx); COFF_SymbolValueInterpType dep_interp = coff_interp_from_parsed_symbol(dep_parsed); if (dep_interp == COFF_SymbolValueInterp_Weak) { U32 member_idx; - if (lnk_search_lib(lib, symbol_parsed.name, &member_idx)) { + if (lnk_search_lib(lib, symbol->name, &member_idx)) { lnk_queue_lib_member(arena, task->imports_hm, task->link->lib_member_infos_hm, member_ref_list, symbol, lib, lib_member_infos, member_idx); } } @@ -1678,6 +1698,13 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task) } } } + + // advance cursor to the end of search_chunks as observed at the start of this scan. the tp dispatch + // is a barrier and load_inputs runs serially between dispatches, so no concurrent append occurs; + // we stamp the (end_chunk, end_count) snapshot taken before the loop so any slot appended after the + // snapshot is rescanned next time, never skipped. + lib->search_cursor_chunk[task_id] = end_chunk; + lib->search_cursor_idx[task_id] = end_count; } internal LNK_Lib * @@ -1816,16 +1843,43 @@ lnk_link_inputs(TP_Context *tp, } else { // search symbols in lib MemoryZeroTyped(member_ref_lists, tp->worker_count); - LNK_SearchLibTask search_task = { - .search_anti_deps = search_anti_deps, - .link = link, - .imports_hm = &imports_hm, - .lib = lib, - .symtab = symtab, - .lib_member_infos = lib_member_infos, - .member_ref_lists = member_ref_lists - }; - tp_for_parallel(tp, arena, tp->worker_count, lnk_search_lib_task, &search_task); + + // barrier elision: the search task scans every undefined/weak symbol in search_chunks and + // queues any member that resolves one. search_chunks only grows during this loop and the + // dedup against already-queued members is idempotent, so if neither the symbol set nor the + // anti-dep mode changed since this lib was last searched, the dispatch can only re-queue + // (deduped to) nothing. skipping it avoids waking+joining every worker for no work. 
+ U64 search_symbol_count = lnk_symbol_table_search_symbol_count(symtab); + B32 can_skip_search = lib->was_searched && + lib->searched_symbol_count == search_symbol_count && + lib->searched_anti_deps == search_anti_deps; + if ( ! can_skip_search) { + // lazily alloc the per-worker frontier cursor arrays (cleared = unsearched/start). + if (lib->search_cursor_chunk == 0) { + lib->search_cursor_chunk = push_array(link->arena, LNK_SymbolHashTrieChunk *, tp->worker_count); + lib->search_cursor_idx = push_array(link->arena, U64, tp->worker_count); + } + + // anti-dep mode flipped since last search of this lib: anti-dep weak symbols before the + // cursor were skipped under the old mode, so they must be re-tried -> rescan from start. + B32 reset_cursor = lib->was_searched && lib->searched_anti_deps != search_anti_deps; + + LNK_SearchLibTask search_task = { + .search_anti_deps = search_anti_deps, + .reset_cursor = reset_cursor, + .link = link, + .imports_hm = &imports_hm, + .lib = lib, + .symtab = symtab, + .lib_member_infos = lib_member_infos, + .member_ref_lists = member_ref_lists + }; + tp_for_parallel(tp, arena, tp->worker_count, lnk_search_lib_task, &search_task); + + lib->was_searched = 1; + lib->searched_symbol_count = search_symbol_count; + lib->searched_anti_deps = search_anti_deps; + } } LNK_LibMemberRefList queued_members = {0}; @@ -2405,6 +2459,13 @@ lnk_link_image(TP_Context *tp, TP_Arena *arena, LNK_Config *config, LNK_Inputer ProfEnd(); } + // + // fold identical code COMDATs (routes followers to a leader; /OPT:REF then GCs them) + // + if (config->opt_icf == LNK_SwitchState_Yes) { + lnk_opt_icf(tp, arena->v[0], symtab, config, link->objs); + } + // // discard COMDAT sections that are not referenced // @@ -2517,6 +2578,7 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) LNK_ObjList objs = task->objs; U8 **is_live = 0; + LNK_Obj **objs_by_idx = 0; // input_idx -> obj, for /OPT:ICF static-fold redirect U64 *active_thread_count = 0; 
LNK_RelocRefsBatchList *global_batch_list = 0; if (task_id == 0) { @@ -2524,11 +2586,13 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) global_batch_list = push_array(scratch.arena, LNK_RelocRefsBatchList, 1); // alloc live flags and set live status on every non-COMDAT section - is_live = push_array_no_zero(scratch.arena, U8 *, objs.count); + is_live = push_array_no_zero(scratch.arena, U8 *, objs.count); + objs_by_idx = push_array_no_zero(scratch.arena, LNK_Obj *, objs.count ? objs.count : 1); { U64 obj_idx = 0; for EachNode(n, LNK_ObjNode, task->objs.first) { - is_live[obj_idx] = push_array(scratch.arena, U8, n->data.header.section_count_no_null + 1); + is_live[obj_idx] = push_array(scratch.arena, U8, n->data.header.section_count_no_null + 1); + objs_by_idx[obj_idx] = &n->data; for EachIndex(sect_idx, n->data.header.section_count_no_null) { is_live[obj_idx][sect_idx + 1] = !(n->data.section_flags[sect_idx] & COFF_SectionFlag_LnkCOMDAT); @@ -2589,6 +2653,7 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) } } tp_broadcast(&is_live); + tp_broadcast(&objs_by_idx); tp_broadcast(&global_batch_list); tp_broadcast(&active_thread_count); @@ -2652,32 +2717,86 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) if (ref_interp == COFF_SymbolValueInterp_Regular) { Temp temp = temp_begin(scratch2.arena); + // /OPT:ICF static-COMDAT redirect: if the referenced section was folded into a leader, + // mark the LEADER section live (and walk the LEADER's relocs/associated sections) instead + // of the dead follower. The follower then dead-strips, taking its .pdata/.xdata with it. 
+ LNK_Obj *walk_obj = ref_symbol.obj; + U32 seed_sn = ref_parsed.section_number; + if (walk_obj->icf_fold && seed_sn != 0 && seed_sn <= walk_obj->header.section_count_no_null && + walk_obj->icf_fold[seed_sn - 1].set) { + LNK_ICFFold fold = walk_obj->icf_fold[seed_sn - 1]; + walk_obj = objs_by_idx[fold.leader_obj_idx]; + seed_sn = fold.leader_sn; + } + HashMap visited_sections_hm = {0}; U32Node *stack = push_array(temp.arena, U32Node, 1); - stack->data = ref_parsed.section_number; + stack->data = seed_sn; do { U32 section_number = stack->data; SLLStackPop(stack); // is section number valid? - if (section_number == 0 || section_number > ref_symbol.obj->header.section_count_no_null) { continue; } + if (section_number == 0 || section_number > walk_obj->header.section_count_no_null) { continue; } // detect cyclic associative sections if (hash_map_search_u64_u64(&visited_sections_hm, section_number)) { continue; } hash_map_push_u64_u64(temp.arena, &visited_sections_hm, section_number, 1); // push associated section - for EachNode(associated_n, U32Node, ref_symbol.obj->associated_sections[section_number]) { - if (hash_map_search_u64_u64(&visited_sections_hm, associated_n->data)) { continue; } + for EachNode(associated_n, U32Node, walk_obj->associated_sections[section_number]) { + U32 assoc_sn = associated_n->data; + + // /OPT:ICF static-COMDAT: an associative section can itself be a folded static + // follower (e.g. an associative .text$ funclet/thunk). Keeping the follower live here + // would defeat the fold and leave it with relocs into removed targets. Instead skip + // the follower and mark its LEADER live (cross-obj), so the follower dead-strips while + // its address still resolves to the identical leader. 
+ if (walk_obj->icf_fold && assoc_sn >= 1 && assoc_sn <= walk_obj->header.section_count_no_null && + walk_obj->icf_fold[assoc_sn - 1].set) { + LNK_ICFFold afold = walk_obj->icf_fold[assoc_sn - 1]; + LNK_Obj *leader_obj = objs_by_idx[afold.leader_obj_idx]; + U32 leader_sn = afold.leader_sn; + // mark the leader .text and each of its associative sections (.pdata/.xdata) live, + // enqueuing each eligible one's relocs so their targets (handler .text, .xdata) are + // kept live too -- exactly as a normal seed-walk of the leader would do. + U32 leader_sns[64]; + U32 leader_sn_count = 0; + leader_sns[leader_sn_count++] = leader_sn; + for EachNode(lan, U32Node, leader_obj->associated_sections[leader_sn]) { + if (leader_sn_count < ArrayCount(leader_sns)) { leader_sns[leader_sn_count++] = lan->data; } + } + for EachIndex(lsi, leader_sn_count) { + U32 lsn = leader_sns[lsi]; + if (lsn == 0 || lsn > leader_obj->header.section_count_no_null) { continue; } + U8 lseen = ins_atomic_u8_eval_assign(&is_live[leader_obj->input_idx][lsn], 1); + if (lseen) { continue; } + COFF_SectionFlags lflags = leader_obj->section_flags[lsn - 1]; + if (lflags & (COFF_SectionFlag_LnkRemove | COFF_SectionFlag_LnkInfo | LNK_SECTION_FLAG_DEBUG)) { continue; } + LNK_RelocRefs lrefs = {0}; + lrefs.obj = leader_obj; + lrefs.relocs = lnk_coff_reloc_info_from_section_number(leader_obj, lsn); + LNK_RelocRefsBatch *lbatch = last_batch; + if (last_batch == 0 || last_batch->count >= ArrayCount(last_batch->v)) { + if (free_list.head.node) { lbatch = lnk_reloc_ref_batch_list_pop(&free_list); MemoryZeroStruct(lbatch); } + else { lbatch = push_array(scratch.arena, LNK_RelocRefsBatch, 1); } + SLLQueuePush(first_batch, last_batch, lbatch); + } + lbatch->v[lbatch->count++] = lrefs; + } + continue; // do not mark the follower live + } + + if (hash_map_search_u64_u64(&visited_sections_hm, assoc_sn)) { continue; } U32Node *stack_n = push_array(temp.arena, U32Node, 1); - stack_n->data = associated_n->data; + stack_n->data 
= assoc_sn; SLLStackPush(stack, stack_n); } - COFF_SectionFlags section_flags = ref_symbol.obj->section_flags[section_number-1]; + COFF_SectionFlags section_flags = walk_obj->section_flags[section_number-1]; // on first section visit, set live flag and enqueue section - U8 was_visited = ins_atomic_u8_eval_assign(&is_live[ref_symbol.obj->input_idx][section_number], 1); + U8 was_visited = ins_atomic_u8_eval_assign(&is_live[walk_obj->input_idx][section_number], 1); if (was_visited) { continue; } // is section eligible for walking? @@ -2686,8 +2805,8 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) if (section_flags & LNK_SECTION_FLAG_DEBUG) { continue; } LNK_RelocRefs refs = {0}; - refs.obj = ref_symbol.obj; - refs.relocs = lnk_coff_reloc_info_from_section_number(ref_symbol.obj, section_number); + refs.obj = walk_obj; + refs.relocs = lnk_coff_reloc_info_from_section_number(walk_obj, section_number); // get a batch node LNK_RelocRefsBatch *batch = last_batch; @@ -2793,6 +2912,1611 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task) ProfEnd(); } +internal U64 +lnk_icf_mix(U64 h, U64 x) +{ + h ^= x; + h *= 0x100000001b3ull; + h ^= h >> 29; + return h; +} + +// Flat open-addressing U64 -> U64 map used by /OPT:ICF. The tree-based HashMap is far too slow +// for the millions of per-round dense-id lookups; this linear-probing table keeps the hot loops +// cache-friendly. Empty slots hold key == LNK_ICF_EMPTY (a value the stored hashes never take). +#define LNK_ICF_EMPTY 0xffffffffffffffffull +typedef struct LNK_ICFMap +{ + U64 *keys; + U64 *vals; + U64 mask; +} LNK_ICFMap; + +// avalanche so structured keys (e.g. 
(input_idx<<32)|section_number, whose low bits repeat +// across objects) scatter across slots instead of clustering into long probe chains +internal U64 +lnk_icf_scramble(U64 x) +{ + x ^= x >> 33; + x *= 0xff51afd7ed558ccdull; + x ^= x >> 33; + x *= 0xc4ceb9fe1a85ec53ull; + x ^= x >> 33; + return x; +} + +internal LNK_ICFMap +lnk_icf_map_make(Arena *arena, U64 capacity) +{ + U64 cap = 1; + while (cap < capacity*2) { cap <<= 1; } + LNK_ICFMap m = {0}; + m.keys = push_array_no_zero(arena, U64, cap); + m.vals = push_array_no_zero(arena, U64, cap); + m.mask = cap - 1; + // LNK_ICF_EMPTY is all-0xFF bytes -> a single memset (bandwidth-optimal, vectorized) instead of a + // scalar per-U64 store loop. cap can be 32M+ entries (256MB) on the monolithic link; the scalar + // loop was a serial first-touch page-fault sink. + MemorySet(m.keys, 0xff, cap*sizeof(U64)); + return m; +} + +// look up key; returns stored value or fallback if absent +internal U64 +lnk_icf_map_get(LNK_ICFMap *m, U64 key, U64 fallback) +{ + U64 slot = lnk_icf_scramble(key) & m->mask; + for (;;) { + U64 k = m->keys[slot]; + if (k == key) { return m->vals[slot]; } + if (k == LNK_ICF_EMPTY) { return fallback; } + slot = (slot + 1) & m->mask; + } +} + +internal void +lnk_icf_map_put(LNK_ICFMap *m, U64 key, U64 val) +{ + U64 slot = lnk_icf_scramble(key) & m->mask; + for (;;) { + U64 k = m->keys[slot]; + if (k == LNK_ICF_EMPTY || k == key) { m->keys[slot] = key; m->vals[slot] = val; return; } + slot = (slot + 1) & m->mask; + } +} + +// Thread-safe insert for the cand_map build (lnk_opt_icf): the keys are UNIQUE (one entry per +// (input_idx, sn) candidate -- a 1:1 map), so each insert claims a distinct empty slot. Claim it +// with an atomic CAS EMPTY->key; only the CAS winner writes vals[slot]. Because keys are unique no +// two writers ever target the same key, so the CAS is the sole arbiter of slot ownership and the +// vals[] store has exactly one writer. 
The map is only READ later (lnk_icf_map_get, after the build
+// barrier), and a key-keyed probe finds its key regardless of WHICH slot collisions placed it in --
+// so the resulting key->val mapping (hence ICF output) is identical for any insertion order.
+//
+// Caveats visible in the body below:
+//  - a key that is ALREADY present is not detected: only EMPTY slots are claimed, so the probe
+//    walks past the existing copy and inserts a second one (callers must keep keys unique).
+//  - the loop only terminates if an EMPTY slot is reachable, i.e. the table must never be full.
+internal void
+lnk_icf_map_put_atomic(LNK_ICFMap *m, U64 key, U64 val)
+{
+  U64 slot = lnk_icf_scramble(key) & m->mask;
+  for (;;) {
+    // plain (non-atomic) peek; the CAS below is what actually decides ownership
+    U64 k = m->keys[slot];
+    if (k == LNK_ICF_EMPTY) {
+      // CAS returns the prior value; if it was EMPTY we won the slot.
+      if (ins_atomic_u64_eval_cond_assign(&m->keys[slot], key, LNK_ICF_EMPTY) == LNK_ICF_EMPTY) {
+        m->vals[slot] = val; // sole writer of this slot's value (keys are unique -> one CAS winner)
+        return;
+      }
+      // lost the race for this slot: re-read the same slot (another key may now own it, or it is
+      // still EMPTY-but-someone-else-is-mid-CAS -> the re-read resolves it). do NOT advance yet.
+      continue;
+    }
+    // slot owned by some other key -> linear probe to the next slot (wraps via mask)
+    slot = (slot + 1) & m->mask;
+  }
+}
+
+// One ICF fold candidate: a single (obj, section number) pair plus the bookkeeping the hash and
+// refine phases need for it.
+typedef struct LNK_ICFCand
+{
+  LNK_Obj *obj;
+  U32 sn; // section number
+  U32 reloc_first; // index into flattened reloc-target arrays
+  U32 reloc_count;
+  U64 key0; // round-0 content key
+  B8 is_static; // static (internal-linkage) COMDAT: folded via icf_fold, not symbol redirect
+  // NOTE: per-candidate equivalence-class "color" lives in a SEPARATE dense U32 colors[] array
+  // (declared as `U32 *colors` in the refine/dense/region task structs), not here -- the
+  // refine/scan loops gather colors by candidate index millions of times; a 4B dense array keeps
+  // them cache-dense vs pulling this whole 40B struct per access.
+} LNK_ICFCand;
+
+// Arguments for lnk_icf_candmap_put_task: ranges[task_id] selects the slice of cands[] a task
+// inserts into cand_map.
+typedef struct LNK_ICFCandMapPutTask
+{
+  Rng1U64 *ranges;
+  LNK_ICFCand *cands;
+  LNK_ICFMap *cand_map;
+} LNK_ICFCandMapPutTask;
+
+// Parallel build of the (input_idx, sn) -> cand_idx+1 lookup. The serial build was cache-miss-bound
+// (~642ms): each lnk_icf_map_put probes a scrambled (random) slot. Spreading the inserts across the
+// pool with the atomic-CAS claim (lnk_icf_map_put_atomic) parallelizes those misses.
Output-neutral +// because keys are unique and the map is read only by key afterward (see lnk_icf_map_put_atomic). +internal +THREAD_POOL_TASK_FUNC(lnk_icf_candmap_put_task) +{ + LNK_ICFCandMapPutTask *task = raw_task; + for EachInRange(ci, task->ranges[task_id]) { + lnk_icf_map_put_atomic(task->cand_map, Compose64Bit(task->cands[ci].obj->input_idx, task->cands[ci].sn), ci + 1); + } +} + +typedef struct LNK_ICFHashTask +{ + Rng1U64 *ranges; + LNK_ICFCand *cands; + LNK_ICFMap *cand_map; + LNK_SymbolTable *symtab; + U8 *rt_iscand; + U64 *rt_target; +} LNK_ICFHashTask; + +// per-candidate content hash + relocation-target resolution (parallel; each candidate writes +// its own disjoint reloc slice and key0, all reads are of immutable structures) +internal +THREAD_POOL_TASK_FUNC(lnk_icf_hash_task) +{ + LNK_ICFHashTask *task = raw_task; + for EachInRange(ci, task->ranges[task_id]) { + LNK_ICFCand *c = &task->cands[ci]; + COFF_SectionHeader *header = lnk_coff_section_header_from_section_number(c->obj, c->sn); + COFF_RelocArray relocs = lnk_coff_relocs_from_section_header(c->obj, header); + String8 data = str8_substr(c->obj->data, rng_1u64(header->foff, header->foff + header->fsize)); + + blake3_hasher h; blake3_hasher_init(&h); + U32 flags_for_hash = c->obj->section_flags[c->sn - 1] & ~(COFF_SectionFlag_LnkCOMDAT | COFF_SectionFlag_LnkRemove); + blake3_hasher_update(&h, &flags_for_hash, sizeof(flags_for_hash)); + blake3_hasher_update(&h, &header->fsize, sizeof(header->fsize)); + blake3_hasher_update(&h, data.str, data.size); + + for EachIndex(ri, relocs.count) { + COFF_Reloc *reloc = &relocs.v[ri]; + blake3_hasher_update(&h, &reloc->type, sizeof(reloc->type)); + blake3_hasher_update(&h, &reloc->apply_off, sizeof(reloc->apply_off)); + + COFF_ParsedSymbol tp = lnk_parsed_symbol_from_coff_symbol_idx(c->obj, reloc->isymbol); + COFF_SymbolValueInterpType ti = coff_interp_from_parsed_symbol(tp); + LNK_ObjSymbolRef tref = { c->obj, reloc->isymbol }; + if (ti == 
COFF_SymbolValueInterp_Undefined || ti == COFF_SymbolValueInterp_Weak) { + LNK_ObjSymbolRef resolved = {0}; + if (lnk_resolve_symbol(task->symtab, tref, &resolved)) { + tref = resolved; + tp = lnk_parsed_symbol_from_coff_symbol_idx(tref.obj, tref.symbol_idx); + ti = coff_interp_from_parsed_symbol(tp); + } + } + + U8 iscand = 0; + U64 target = 0; + if (ti == COFF_SymbolValueInterp_Regular && tref.obj != 0) { + // Canonicalize a COMDAT definition to its selected leader so the SAME logical symbol keys + // identically regardless of which obj's local copy this reloc happens to name. Two byte- + // identical functions that each reference their own copy of a shared COMDAT (writable + // static guards, vtables, selectany globals) must otherwise get distinct per-obj keys and + // never fold. The symlink leader is the post-resolution canonical definition (built before + // ICF, read-only here), so keying by it is strictly more correct than per-obj. + // Keep the reloc's own target offset (tp.value): canonicalize only the SECTION IDENTITY + // (obj,sn) to the leader, never the offset, so distinct offsets into the same section stay + // distinct. The leader symbol itself always has value 0 (the symlink picks the value==0 + // definition), but a reloc may name an interior offset. + LNK_Obj *kobj = tref.obj; + U64 ksn = tp.section_number; + if (ksn >= 1 && ksn <= kobj->header.section_count_no_null && + (kobj->section_flags[ksn - 1] & COFF_SectionFlag_LnkCOMDAT)) { + LNK_Symbol *leader = lnk_obj_get_comdat_symlink(kobj, ksn); + if (leader) { + LNK_ObjSymbolRef lref = lnk_ref_from_symbol(leader); + // only section_number is needed here; skip the string-table name decode that the + // full lnk_parsed_from_symbol does per COMDAT reloc (millions of relocs in this phase). 
+            if (lref.obj != 0) {
+              COFF_ParsedSymbol lp = lnk_parsed_symbol_from_coff_symbol_idx_no_name(lref.obj, lref.symbol_idx);
+              kobj = lref.obj; ksn = lp.section_number;
+            }
+          }
+        }
+        // cand_map stores cand_idx+1, so 0 doubles as "not a candidate".
+        // NOTE(review): for a candidate target only the candidate index is recorded -- tp.value
+        // (the offset inside that section) is not folded into the hash or rt_target. Presumably
+        // the later fold verification compares reloc targets exactly; confirm.
+        U64 cv = lnk_icf_map_get(task->cand_map, Compose64Bit(kobj->input_idx, ksn), 0);
+        if (cv) { iscand = 1; target = cv - 1; }
+        else { target = lnk_icf_mix(Compose64Bit(kobj->input_idx, ksn), tp.value); }
+      } else {
+        // non-regular target (absolute, still-undefined, ...): key by symbol name + value
+        U64 nh = 14695981039346656037ull;
+        for (U64 i = 0; i < tp.name.size; i += 1) { nh = lnk_icf_mix(nh, tp.name.str[i]); }
+        target = lnk_icf_mix(nh, tp.value);
+      }
+
+      // record the resolved target in this candidate's own slice of the flattened arrays
+      U64 idx = (U64)c->reloc_first + ri;
+      task->rt_iscand[idx] = iscand;
+      task->rt_target[idx] = target;
+      // candidate targets are left out of the round-0 hash (their index is not a stable content
+      // property); they enter through the per-round color refinement instead
+      if (!iscand) { blake3_hasher_update(&h, &target, sizeof(target)); }
+    }
+
+    // Finalize into a union so the two 64-bit halves are read from storage that is aligned for
+    // U64. Casting a U8[16] to U64* (as was done before) is neither alignment- nor aliasing-safe;
+    // the bytes read, and therefore key0, are unchanged.
+    union { U8 bytes[16]; U64 words[2]; } out;
+    blake3_hasher_finalize(&h, out.bytes, sizeof(out.bytes));
+    c->key0 = lnk_icf_mix(out.words[0], out.words[1]);
+  }
+}
+
+// Split active[0,n) into worker_count contiguous ranges of ~equal RELOC WEIGHT (refine/verify cost
+// per candidate ~= its reloc_count), instead of equal candidate count. Fixes the load imbalance where
+// a worker holding high-reloc candidates runs long while others idle. Ranges are a pure function of
+// reloc_count[active[]] -> deterministic. Returns worker_count+1 boundary array (dummy tail like
+// tp_divide_work). active==0 means weight by cands[i] directly over [0,n).
+// worker_count==0 is tolerated: only the dummy tail range (n,n) is written.
+internal Rng1U64 *
+lnk_icf_divide_by_reloc(Arena *arena, U32 *active, U64 n, U32 worker_count, LNK_ICFCand *cands)
+{
+  Rng1U64 *ranges = push_array_no_zero(arena, Rng1U64, worker_count + 1);
+  U64 total_w = 0;
+  for EachIndex(i, n) { total_w += cands[active ? active[i] : i].reloc_count + 1; } // +1 so zero-reloc cands still count
+  // per-worker weight budget, rounded up so worker_count budgets cover total_w
+  U64 per_w = (total_w + worker_count - 1) / (worker_count ? worker_count : 1);
+  U64 cursor = 0, acc = 0, w = 0;
+  for (; w < worker_count; w += 1) {
+    U64 begin = cursor;
+    U64 target = (w + 1) * per_w; // cumulative weight this worker's range should reach
+    while (cursor < n && acc < target) { acc += cands[active ? active[cursor] : cursor].reloc_count + 1; cursor += 1; }
+    ranges[w] = rng_1u64(begin, cursor);
+  }
+  // any remainder from rounding -> last worker. Guarded: with worker_count==0 there is no last
+  // worker, and ranges[worker_count - 1] would index far outside the allocation.
+  if (worker_count != 0 && cursor < n) { ranges[worker_count - 1].max = n; }
+  ranges[worker_count] = rng_1u64(n, n);
+  return ranges;
+}
+
+// Arguments for lnk_icf_refine_task.
+typedef struct LNK_ICFRefineTask
+{
+  Rng1U64 *ranges;
+  LNK_ICFCand *cands;
+  U32 *colors; // dense per-candidate color array, U32 (color ids < color_base < 2^32);
+               // separate from the 40B LNK_ICFCand so the hot color gathers touch 4B/elem.
+  U8 *rt_iscand;
+  U64 *rt_target;
+  U64 *newkey;
+  U32 *active; // when set, ranges index into active[] rather than cands[] directly
+} LNK_ICFRefineTask;
+
+// recompute each candidate's refinement key from its current color and its targets' colors
+// (parallel; reads the immutable color snapshot, writes its own newkey slot)
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_refine_task)
+{
+  LNK_ICFRefineTask *task = raw_task;
+  U32 *colors = task->colors;
+  for EachInRange(ai, task->ranges[task_id]) {
+    U64 ci = task->active ? task->active[ai] : ai;
+    LNK_ICFCand *c = &task->cands[ci];
+    U64 k = lnk_icf_mix(0x9e3779b97f4a7c15ull, colors[ci]);
+    for EachIndex(j, c->reloc_count) {
+      U64 idx = (U64)c->reloc_first + j;
+      // candidate target -> its current color; anything else -> the fixed round-0 target hash
+      U64 t = task->rt_iscand[idx] ? colors[task->rt_target[idx]] : task->rt_target[idx];
+      k = lnk_icf_mix(k, t);
+    }
+    task->newkey[ci] = k;
+  }
+}
+
+// Parallel ICF fold: each group of same-colored candidates is independent (its members are distinct
+// (obj,sn) sections -> distinct symlink nodes and distinct icf_fold slots), so leader election +
+// byte/reloc verification + the fold writes can run concurrently across groups with no shared-state
+// races. The dominant cost folded in here is the per-follower byte-compare (str8_match/memcmp), which
+// was the largest single serial main-thread sink.
Determinism is preserved: each group elects the +// same leader (smallest (is_static,input_idx,sn)) and writes the same disjoint slots regardless of +// which worker runs it. fold_count is accumulated per-worker. +// parallel helpers for lnk_icf_dense_colors_active: the per-round gather (sk[i]=newkey[active[i]]) +// and the final scatter (cands[active[sv[k]]].color = ...) are random-access over the candidate +// arrays (cache-miss bound). Spreading them across workers hides the memory latency. The boundary +// counting between them stays a cheap sequential pass over the sorted keys. +typedef struct LNK_ICFDenseTask +{ + Rng1U64 *ranges; + U32 *active; + U64 *newkey; + U64 *sk; + U32 *sv; + U32 *color_at; // per-sorted-position final color (filled serially, scattered in parallel) + U32 *colors; // dense per-candidate color array (scatter target) + U8 *keep; // per-sorted-position: 1 if its key-run has size>=2 (survives to next round) + U32 *out_slot; // per-sorted-position: exclusive prefix of keep[] -> deterministic emit slot + U32 *next_active; // emit target (survivors, in sorted-color order) + // parallel prefix-sum group scan (replaces the per-round serial scan over sk[] -- the refine-window + // sawtooth bottleneck, lnk_link_image self). boundary (run start) and keep (run size>=2 survivor) + // are per-position bits reading only sk[k], sk[k-1], sk[k+1] -> chunk-local. mark: write keep[] + + // per-chunk local boundary/keep/surviving-class counts. serial: tiny exclusive prefix over the + // worker_count chunk totals (NOT over n). apply: each chunk re-derives color_at[]/out_slot[] from + // its prefixed base -> identical values to the old serial running counter regardless of chunk split. 
+ U64 *chunk_nc; // [worker_count] in: local boundary count; serial-prefix -> chunk's exclusive base + U64 *chunk_keep; // [worker_count] in: local keep count; serial-prefix -> chunk's exclusive base + U64 *chunk_scls; // [worker_count] local surviving-class count (boundary && keep), summed serially + U64 base; // color id base for this round + U64 n_total; // n (for the k+1==N end-of-run test; spans chunk edges read-only) +} LNK_ICFDenseTask; + +internal +THREAD_POOL_TASK_FUNC(lnk_icf_dense_gather_task) +{ + LNK_ICFDenseTask *t = raw_task; + for EachInRange(i, t->ranges[task_id]) { t->sk[i] = t->newkey[t->active[i]]; t->sv[i] = (U32)i; } +} + +// mark pass: per sorted position compute boundary (run start) and keep (run size>=2 -> survives); +// write keep[k]; accumulate this chunk's local boundary/keep/surviving-class counts. boundary/keep +// read only sk[k-1..k+1] -> chunk-local, so each chunk's local sums are independent (the run that +// straddles a chunk edge is counted once -- by whichever chunk owns its run-start position). +internal +THREAD_POOL_TASK_FUNC(lnk_icf_scan_mark_task) +{ + LNK_ICFDenseTask *t = raw_task; + Rng1U64 r = t->ranges[task_id]; + U64 N = t->n_total; + U64 nloc = 0, kloc = 0, sloc = 0; + for (U64 k = r.min; k < r.max; k += 1) { + B32 boundary = (k == 0 || t->sk[k] != t->sk[k - 1]); + B32 next_boundary = (k + 1 == N || t->sk[k + 1] != t->sk[k]); // start of next run (or end) + U8 survive = (U8)!(boundary && next_boundary); // singleton iff boundary && next_boundary + t->keep[k] = survive; + if (boundary) { nloc += 1; } + if (survive) { kloc += 1; } + if (boundary && survive) { sloc += 1; } // run-start of a surviving run -> one surviving class + } + t->chunk_nc[task_id] = nloc; + t->chunk_keep[task_id] = kloc; + t->chunk_scls[task_id] = sloc; +} + +// apply pass: chunk_nc[task_id]/chunk_keep[task_id] now hold this chunk's exclusive base (the running +// class id / emit slot at the chunk's first position). 
Re-derive the per-position bits and run the +// counters locally from that base to write color_at[k]/out_slot[k] -- byte-identical to the old serial +// scan regardless of how the chunks split, so the downstream scatter/emit stay deterministic. +internal +THREAD_POOL_TASK_FUNC(lnk_icf_scan_apply_task) +{ + LNK_ICFDenseTask *t = raw_task; + Rng1U64 r = t->ranges[task_id]; + U64 nc = t->chunk_nc[task_id]; // boundaries before this chunk (exclusive base) + U64 slot = t->chunk_keep[task_id]; // emit slot at chunk start + for (U64 k = r.min; k < r.max; k += 1) { + B32 boundary = (k == 0 || t->sk[k] != t->sk[k - 1]); + if (boundary) { nc += 1; } + t->color_at[k] = (U32)(t->base + (nc - 1)); + t->out_slot[k] = (U32)slot; + slot += t->keep[k]; + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_icf_dense_scatter_task) +{ + LNK_ICFDenseTask *t = raw_task; + for EachInRange(k, t->ranges[task_id]) { t->colors[t->active[t->sv[k]]] = t->color_at[k]; } +} + +// parallel survivor emit: each kept sorted-position writes active[sv[k]] to its precomputed slot +// out_slot[k]. Slots are a serial exclusive-prefix of keep[] (deterministic), so the emit order is +// identical regardless of which worker runs which k -> reproducible. This moves the random-access +// active[sv[k]] gather (the expensive serial part of the old group scan) onto the workers. +internal +THREAD_POOL_TASK_FUNC(lnk_icf_dense_emit_task) +{ + LNK_ICFDenseTask *t = raw_task; + for EachInRange(k, t->ranges[task_id]) { + if (t->keep[k]) { t->next_active[t->out_slot[k]] = t->active[t->sv[k]]; } + } +} + +// densify the keys of just the active subset into class ids drawn from `base` upward (so they never +// collide with the finalized colors of candidates that already dropped out as singletons). Returns +// the number of distinct classes among the active set. Equal keys land in one class regardless of +// sort tie order, so color values are deterministic. +// Densify active subset into class ids from `base` up. 
ALSO emits, in the same sorted pass (run
+// lengths are free), the compacted next-round active set (members of classes that still have size>=2)
+// into next_active[], and the count of such classes. This fuses the per-round singleton-drop scan
+// (was two random-access colors[active[i]] gathers + a cls_size histogram) into the group scan.
+// Determinism: equal keys land in one class regardless of sort tie order; next_active is emitted in
+// sorted-color order (deterministic). Returns distinct class count nc.
+//
+// tp/arena:  thread pool and scratch arena (all scratch is released before returning)
+// active/n:  current active set (candidate indices) and its length
+// newkey:    per-candidate refinement keys, indexed by candidate index
+// colors:    per-candidate color array, rewritten for every active candidate
+// base:      first color id to hand out this round
+// next_active / out_next_count / out_next_class_count: survivors (class size>=2), how many there
+//            are, and how many classes they form; the two out_* pointers may be null.
+internal U64
+lnk_icf_dense_colors_active(TP_Context *tp, Arena *arena, U32 *active, U64 n, U64 *newkey, U32 *colors, U64 base,
+                            U32 *next_active, U64 *out_next_count, U64 *out_next_class_count)
+{
+  Temp t = temp_begin(arena);
+  U64 *sk = push_array_no_zero(t.arena, U64, n ? n : 1);
+  U32 *sv = push_array_no_zero(t.arena, U32, n ? n : 1); // index into active[]
+  U32 *color_at = push_array_no_zero(t.arena, U32, n ? n : 1); // per-sorted-position color
+
+  LNK_ICFDenseTask dt = {0};
+  dt.active = active; dt.newkey = newkey; dt.sk = sk; dt.sv = sv; dt.color_at = color_at; dt.colors = colors;
+
+  // parallel gather: random-access newkey[active[i]] -> sk[i], spread across workers. NOTE radix
+  // (next line) allocates scratch on t.arena, so ranges built before radix would be stale -- rebuild
+  // ranges AFTER radix for the scatter. (This stale-ranges trap is why the parallel path was wrong.)
+  dt.ranges = tp_divide_work(t.arena, n, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_gather_task, &dt);
+  lnk_radix_sort_u64_pairs(tp, t.arena, n, sk, sv);
+  dt.ranges = tp_divide_work(t.arena, n, tp->worker_count); // rebuilt AFTER radix for scatter
+
+  // PARALLEL prefix-sum group scan over the SORTED keys (replaces the per-round serial scan that was
+  // the refine-window sawtooth bottleneck). mark: per-position keep bit + per-chunk local counts ->
+  // serial: tiny exclusive prefix over the worker_count chunk totals (NOT over n) -> apply: each chunk
+  // re-derives color_at[]/out_slot[] from its prefixed base. The result is a pure prefix of per-position
+  // bits, byte-identical to the old serial running counter regardless of chunk split -> deterministic.
+  U8 *keep = push_array_no_zero(t.arena, U8, n ? n : 1);
+  U32 *out_slot = push_array_no_zero(t.arena, U32, n ? n : 1);
+  U64 W = tp->worker_count;
+  U64 *chunk_nc = push_array_no_zero(t.arena, U64, W ? W : 1);
+  U64 *chunk_kp = push_array_no_zero(t.arena, U64, W ? W : 1);
+  U64 *chunk_sc = push_array_no_zero(t.arena, U64, W ? W : 1);
+
+  dt.keep = keep; dt.out_slot = out_slot; dt.next_active = next_active;
+  dt.chunk_nc = chunk_nc; dt.chunk_keep = chunk_kp; dt.chunk_scls = chunk_sc;
+  dt.base = base; dt.n_total = n;
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_scan_mark_task, &dt);
+
+  // exclusive prefix over the W chunk totals -> chunk_nc[w]/chunk_kp[w] become the running class id /
+  // emit slot at chunk w's first position (the exclusive base apply counts from); sum surviving classes.
+  U64 nc = 0, next_count = 0, next_class_count = 0;
+  for EachIndex(w, W) {
+    U64 cn = chunk_nc[w], ck = chunk_kp[w], cs = chunk_sc[w];
+    chunk_nc[w] = nc; chunk_kp[w] = next_count;
+    nc += cn; next_count += ck; next_class_count += cs;
+  }
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_scan_apply_task, &dt);
+
+#if defined(ICF_SCAN_SELFCHECK)
+  { // SELF-CHECK (debug-only, gated): recompute the serial scan into shadow arrays, assert byte-exact
+    // NOTE(review): this block was reconstructed -- the previous text had lost every span between
+    // '<' and '>' and could not compile. The shadow values follow what lnk_icf_scan_mark_task /
+    // lnk_icf_scan_apply_task write; the failure action (Assert) is an assumption -- confirm
+    // against the intended self-check before relying on it.
+    Temp tc = temp_begin(t.arena);
+    U8 *xkeep = push_array_no_zero(tc.arena, U8, n ? n : 1);
+    U32 *xslot = push_array_no_zero(tc.arena, U32, n ? n : 1);
+    U32 *xcol = push_array_no_zero(tc.arena, U32, n ? n : 1);
+    U64 xnc=0, xnext=0, xncls=0, run0=0, mism=0;
+    // walk the sorted keys one run at a time; a run [run0,k) closes when the key changes or at n
+    for (U64 k = 1; k <= n; k += 1) {
+      B32 run_ends = (k == n || sk[k] != sk[k - 1]);
+      if (run_ends) {
+        U8 s = (k - run0 >= 2) ? 1 : 0; // survivor iff the class has >= 2 members
+        if (s) { xncls += 1; }
+        for (U64 m = run0; m < k; m += 1) {
+          xkeep[m] = s;
+          xslot[m] = (U32)xnext;        // exclusive prefix of keep[]
+          xcol[m]  = (U32)(base + xnc); // xnc = number of runs closed before this one
+          xnext += s;
+        }
+        xnc += 1;
+        run0 = k;
+      }
+    }
+    for EachIndex(k, n) {
+      if (xkeep[k] != keep[k] || xslot[k] != out_slot[k] || xcol[k] != color_at[k]) { mism += 1; }
+    }
+    Assert(mism == 0 && xnc == nc && xnext == next_count && xncls == next_class_count);
+    temp_end(tc);
+  }
+#endif
+
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_scatter_task, &dt); // parallel color scatter
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_emit_task, &dt); // parallel survivor emit
+
+  temp_end(t);
+  if (out_next_count) { *out_next_count = next_count; }
+  if (out_next_class_count) { *out_next_class_count = next_class_count; }
+  return nc;
+}
+
+// radix digit width (also defined at the radix-sort section below; hoisted here so the persistent
+// refine region can size its histogram). Keep both in sync.
+#if !defined(LNK_RADIX_BITS)
+#define LNK_RADIX_BITS 8
+#define LNK_RADIX_SIZE (1 << LNK_RADIX_BITS)
+#endif
+
+// ============================================================================================
+// PERSISTENT-WORKER ICF REFINE REGION
+//
+// The per-round refine loop (refine -> dense_colors_active{gather,radix,scan,scatter,emit}) used to
+// re-enter the thread pool ~10x per round via tp_for_parallel, ~18 rounds => ~126 fork-join cycles.
+// Each phase is tiny, so the pool wake->work->sleep latency dominated lnk_link_image self time (the
+// sawtooth). lnk_icf_refine_region collapses ALL rounds into ONE tp_for_parallel of worker_count
+// participants; every old phase boundary becomes a barrier_wait(tp->barrier) inside the region. The
+// tiny serial glue (radix per-pass 256xW prefix, group-scan W-chunk prefix, convergence/buffer swap)
+// runs on worker 0 behind a barrier while the others wait -- same work, no pool re-entry.
+//
+// ORDER PRESERVATION: every phase keeps its EXACT chunking and math. ranges are rebuilt each round by
+// worker 0 into preallocated buffers (lnk_icf_divide_by_reloc / tp_divide_work logic, in-place) and
+// read by all workers after a barrier -- identical boundaries to the per-round path.
The radix sort, +// scan_mark/apply, scatter and emit bodies are the SAME computations as the standalone tasks; only the +// dispatch changed (barrier instead of tp_for_parallel). Result is byte-identical to the per-round +// path (verified by ICF_SCAN_SELFCHECK + the canonical DLL cmp). +// ============================================================================================ + +typedef struct LNK_ICFRegion +{ + // immutable inputs + TP_Context *tp; + LNK_ICFCand *cands; + U8 *rt_iscand; + U64 *rt_target; + U64 cand_count; + U32 worker_count; + + // ping-pong active sets + colors/keys (driver owns the buffers; pointers swap each round on worker 0) + U32 *active; // current active set (refine reads) + U32 *active2; // next active set (emit writes) + U32 *colors; // dense per-candidate color (scatter target) + U64 *newkey; // refine writes; gather reads + + // per-round scratch (preallocated to cand_count once; reused every round) + U64 *sk; // sorted keys + U32 *sv; // sorted values (index into active[]) + U32 *color_at; // per-sorted-position color + U8 *keep; // per-sorted-position survivor bit + U32 *out_slot; // per-sorted-position emit slot + U64 *kbuf; // radix double-buffer (keys) + U32 *vbuf; // radix double-buffer (vals) + U32 *hist; // radix histogram [worker_count * LNK_RADIX_SIZE] + U64 *chunk_nc; // [worker_count] group-scan boundary counts + U64 *chunk_kp; // [worker_count] group-scan keep counts + U64 *chunk_sc; // [worker_count] group-scan surviving-class counts + U64 *chunk_max; // [worker_count] radix per-pass max-key reduction + Rng1U64 *refine_ranges;// [worker_count+1] reloc-weighted (refine) + Rng1U64 *work_ranges; // [worker_count+1] even split over n (gather/scan/scatter/emit/radix) + + // round-loop control (written by worker 0, broadcast via barrier) + U64 active_count; + U64 active_class_count; + U64 color_base; + U64 round_n; // active_count snapshot for the current round's phases + U64 radix_pass_count; // passes for current 
round's radix sort
+  U64 *radix_ksrc; // current radix source/dest (swap on worker 0 each pass)
+  U32 *radix_vsrc;
+  U64 *radix_kdst;
+  U32 *radix_vdst;
+  B32 converged; // set by worker 0 -> all break together
+  U64 max_rounds; // hard cap on rounds (region stops, may hand off to worklist tail)
+} LNK_ICFRegion;
+
+// In-place work division for the refine phase; writes worker_count+1 entries into the
+// preallocated `ranges` (the last one is the dummy tail (n,n)).
+// NOTE: despite the name this is a plain COUNT-based split of [0,n), not the reloc-weighted split
+// that lnk_icf_divide_by_reloc computes; `active` and `cands` are accepted only to keep the
+// signature parallel to that function and are not read.
+internal void
+lnk_icf_divide_by_reloc_into(Rng1U64 *ranges, U32 *active, U64 n, U32 worker_count, LNK_ICFCand *cands)
+{
+  // Count-based split: O(worker_count), no scan, no random gather. The old reloc-weighted
+  // split did two O(n) cache-miss gathers (cands[active[i]].reloc_count) per round x ~18
+  // rounds = ~5.6s serial. Work-split only -> output independent of it.
+  U64 W = worker_count ? worker_count : 1; // divisor guard; the loop below is empty when worker_count==0
+  for (U64 w = 0; w < worker_count; w += 1) {
+    ranges[w] = rng_1u64((w * n) / W, ((w + 1) * n) / W);
+  }
+  ranges[worker_count] = rng_1u64(n, n);
+  (void)active; (void)cands;
+}
+
+// in-place even division (mirrors tp_divide_work; writes into preallocated ranges)
+// Each worker gets a contiguous chunk of ceil(item_count / worker_count) items, clamped to
+// item_count, so trailing workers may receive empty ranges. ranges must hold worker_count+1
+// entries; the last is the dummy tail (item_count,item_count).
+internal void
+lnk_tp_divide_work_into(Rng1U64 *ranges, U64 item_count, U32 worker_count)
+{
+  // guard the division the same way lnk_icf_divide_by_reloc_into does: with worker_count==0 the
+  // loop below never runs, so the chunk size is irrelevant, but the divide itself must not happen
+  U64 per_count = worker_count ? CeilIntegerDiv(item_count, worker_count) : 0;
+  for (U64 i = 0; i < worker_count; i += 1) {
+    ranges[i] = rng_1u64(Min(item_count, i * per_count), Min(item_count, i * per_count + per_count));
+  }
+  ranges[worker_count] = rng_1u64(item_count, item_count);
+}
+
+// One persistent parallel region spanning ALL refine rounds. worker_count participants; phase
+// boundaries are barriers; serial glue runs on worker 0. Each worker owns ranges[worker_id].
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_refine_region_task)
+{
+  LNK_ICFRegion *rs = raw_task;
+  U64 wid = worker_id; // 1:1 worker<->task: every participant blocks on the first barrier before any
+                       // can steal a second task, so task_id == worker_id throughout (see note below).
+ U64 W = rs->worker_count; + + for (U64 round = 0; round < rs->max_rounds; round += 1) { + // -------- round setup (worker 0): build ranges, snapshot active_count -------- + if (wid == 0) { + rs->round_n = rs->active_count; + lnk_icf_divide_by_reloc_into(rs->refine_ranges, rs->active, rs->active_count, (U32)W, rs->cands); + } + barrier_wait(rs->tp->barrier); + U64 n = rs->round_n; + + // -------- PHASE: refine (reloc-weighted ranges) -------- + { + Rng1U64 r = rs->refine_ranges[wid]; + for EachInRange(ai, r) { + U64 ci = rs->active[ai]; + LNK_ICFCand *c = &rs->cands[ci]; + U64 k = lnk_icf_mix(0x9e3779b97f4a7c15ull, rs->colors[ci]); + for EachIndex(j, c->reloc_count) { + U64 idx = (U64)c->reloc_first + j; + U64 t = rs->rt_iscand[idx] ? rs->colors[rs->rt_target[idx]] : rs->rt_target[idx]; + k = lnk_icf_mix(k, t); + } + rs->newkey[ci] = k; + } + } + barrier_wait(rs->tp->barrier); + + // -------- build even-split ranges (worker 0) for gather -------- + if (wid == 0) { lnk_tp_divide_work_into(rs->work_ranges, n, (U32)W); } + barrier_wait(rs->tp->barrier); + + // -------- PHASE: gather sk[i]=newkey[active[i]], sv[i]=i -------- + { + Rng1U64 r = rs->work_ranges[wid]; + for EachInRange(i, r) { rs->sk[i] = rs->newkey[rs->active[i]]; rs->sv[i] = (U32)i; } + } + barrier_wait(rs->tp->barrier); + + // -------- PHASE: parallel LSD radix sort of (sk, sv) over n -------- + // worker 0 establishes pass_count from a parallel max reduction; each pass = parallel hist -> + // serial 256xW prefix (worker 0) -> parallel scatter -> pointer swap (worker 0). Identical to + // lnk_radix_sort_u64_pairs, just barrier-driven (no temp/scratch alloc inside the region). 
+ { + // parallel max-key reduction into chunk_max[wid] + { + Rng1U64 r = rs->work_ranges[wid]; + U64 m = 0; + for EachInRange(i, r) { if (rs->sk[i] > m) m = rs->sk[i]; } + rs->chunk_max[wid] = m; + } + barrier_wait(rs->tp->barrier); + if (wid == 0) { + U64 max_key = 0; + for EachIndex(w, W) { if (rs->chunk_max[w] > max_key) max_key = rs->chunk_max[w]; } + U64 pass_count; + if (max_key != 0) { + U64 sig_bits = 64 - clz64(max_key); + U64 sig_passes = (sig_bits + LNK_RADIX_BITS - 1) / LNK_RADIX_BITS; + pass_count = sig_passes + (sig_passes & 1); + } else { + pass_count = 0; + } + if (n < 2) { pass_count = 0; } + rs->radix_pass_count = pass_count; + rs->radix_ksrc = rs->sk; rs->radix_vsrc = rs->sv; + rs->radix_kdst = rs->kbuf; rs->radix_vdst = rs->vbuf; + } + barrier_wait(rs->tp->barrier); + + U64 pass_count = rs->radix_pass_count; + for (U64 pass = 0; pass < pass_count; pass += 1) { + U64 shift = pass * LNK_RADIX_BITS; + U64 *ksrc = rs->radix_ksrc; U32 *vsrc = rs->radix_vsrc; + U64 *kdst = rs->radix_kdst; U32 *vdst = rs->radix_vdst; + + // hist (parallel): zero own row, count digits over own range + { + U32 *h = rs->hist + wid * LNK_RADIX_SIZE; + MemoryZero(h, sizeof(U32) * LNK_RADIX_SIZE); + Rng1U64 r = rs->work_ranges[wid]; + for EachInRange(i, r) { h[(ksrc[i] >> shift) & (LNK_RADIX_SIZE - 1)] += 1; } + } + barrier_wait(rs->tp->barrier); + + // exclusive prefix across (bucket, worker) -- serial, worker 0 (identical order to the task path) + if (wid == 0) { + U64 running = 0; + for EachIndex(bucket, LNK_RADIX_SIZE) { + for EachIndex(w, W) { + U32 *slot = &rs->hist[w * LNK_RADIX_SIZE + bucket]; + U32 c = *slot; + *slot = (U32)running; + running += c; + } + } + } + barrier_wait(rs->tp->barrier); + + // scatter (parallel): each worker writes its disjoint contiguous run + { + U32 *h = rs->hist + wid * LNK_RADIX_SIZE; + Rng1U64 r = rs->work_ranges[wid]; + for EachInRange(i, r) { + U64 d = (ksrc[i] >> shift) & (LNK_RADIX_SIZE - 1); + U32 pos = h[d]++; + kdst[pos] = 
ksrc[i]; + vdst[pos] = vsrc[i]; + } + } + barrier_wait(rs->tp->barrier); + + // swap src/dst (worker 0); even pass_count leaves sorted data back in sk/sv + if (wid == 0) { + rs->radix_ksrc = kdst; rs->radix_vsrc = vdst; + rs->radix_kdst = ksrc; rs->radix_vdst = vsrc; + } + barrier_wait(rs->tp->barrier); + } + // ranges unchanged across radix; work_ranges already an even split over n for scan/scatter/emit + } + + // -------- PHASE: scan mark (per-position keep bit + per-chunk local counts) -------- + { + Rng1U64 r = rs->work_ranges[wid]; + U64 N = n; + U64 nloc = 0, kloc = 0, sloc = 0; + for (U64 k = r.min; k < r.max; k += 1) { + B32 boundary = (k == 0 || rs->sk[k] != rs->sk[k - 1]); + B32 next_boundary = (k + 1 == N || rs->sk[k + 1] != rs->sk[k]); + U8 survive = (U8)!(boundary && next_boundary); + rs->keep[k] = survive; + if (boundary) { nloc += 1; } + if (survive) { kloc += 1; } + if (boundary && survive) { sloc += 1; } + } + rs->chunk_nc[wid] = nloc; rs->chunk_kp[wid] = kloc; rs->chunk_sc[wid] = sloc; + } + barrier_wait(rs->tp->barrier); + + // -------- serial: exclusive prefix over W chunk totals (worker 0) -------- + if (wid == 0) { + U64 nc = 0, next_count = 0, next_class_count = 0; + for EachIndex(w, W) { + U64 cn = rs->chunk_nc[w], ck = rs->chunk_kp[w], cs = rs->chunk_sc[w]; + rs->chunk_nc[w] = nc; rs->chunk_kp[w] = next_count; + nc += cn; next_count += ck; next_class_count += cs; + } + // stash round totals in chunk_max[0..2] (free scratch; radix max-reduce reuses it fresh next + // round, and the convergence block below reads it within this same round before the swap). 
+ rs->chunk_max[0] = nc; rs->chunk_max[1] = next_count; rs->chunk_max[2] = next_class_count; + } + barrier_wait(rs->tp->barrier); + + U64 base = rs->color_base; + + // -------- PHASE: scan apply (re-derive color_at[]/out_slot[] from chunk's exclusive base) -------- + { + Rng1U64 r = rs->work_ranges[wid]; + U64 nc = rs->chunk_nc[wid]; + U64 slot = rs->chunk_kp[wid]; + for (U64 k = r.min; k < r.max; k += 1) { + B32 boundary = (k == 0 || rs->sk[k] != rs->sk[k - 1]); + if (boundary) { nc += 1; } + rs->color_at[k] = (U32)(base + (nc - 1)); + rs->out_slot[k] = (U32)slot; + slot += rs->keep[k]; + } + } + barrier_wait(rs->tp->barrier); + + // -------- PHASE: color scatter -------- + { + Rng1U64 r = rs->work_ranges[wid]; + for EachInRange(k, r) { rs->colors[rs->active[rs->sv[k]]] = rs->color_at[k]; } + } + barrier_wait(rs->tp->barrier); + + // -------- PHASE: survivor emit -------- + { + Rng1U64 r = rs->work_ranges[wid]; + for EachInRange(k, r) { + if (rs->keep[k]) { rs->active2[rs->out_slot[k]] = rs->active[rs->sv[k]]; } + } + } + barrier_wait(rs->tp->barrier); + + // -------- serial: convergence + buffer swap (worker 0), broadcast via barrier -------- + if (wid == 0) { + U64 nc = rs->chunk_max[0]; + U64 next_count = rs->chunk_max[1]; + U64 next_class_count = rs->chunk_max[2]; + rs->color_base += nc; + if (nc == rs->active_class_count) { + rs->converged = 1; + } else { + U32 *tmp = rs->active; rs->active = rs->active2; rs->active2 = tmp; + rs->active_count = next_count; + rs->active_class_count = next_class_count; + if (rs->active_count == 0) { rs->converged = 1; } + } + } + barrier_wait(rs->tp->barrier); + + if (rs->converged) { break; } + } +} + +// Is an object section an ICF fold candidate? Only externally-defined COMDATs are folded: the +// follower is redirected at its shared symbol-table node, so every reference (including from other +// objects) resolves to the leader and /OPT:REF then drops the follower. Returns 1 = candidate. 
+internal U32 +lnk_icf_section_kind(LNK_Obj *obj, U64 sect_idx, B32 include_static) +{ + COFF_SectionFlags flags = obj->section_flags[sect_idx]; + if (~flags & COFF_SectionFlag_LnkCOMDAT) { return 0; } + if ( flags & COFF_SectionFlag_LnkRemove) { return 0; } + // fold code and read-only initialized data (const tables, vtables, string literals); folding + // identical read-only data lets functions that reference it fold too (cascade). Mutable data + // is never folded. + B32 is_code = (flags & COFF_SectionFlag_CntCode) != 0; + B32 is_rodata = (flags & COFF_SectionFlag_CntInitializedData) && !(flags & COFF_SectionFlag_MemWrite); + if (!is_code && !is_rodata) { return 0; } + U32 sn = (U32)sect_idx + 1; + COFF_SectionHeader *header = lnk_coff_section_header_from_section_number(obj, sn); + if (header->fsize == 0) { return 0; } + LNK_Symbol *sym = lnk_obj_get_comdat_symlink(obj, sn); + if (sym == 0) { return include_static ? 2 : 0; } // static COMDAT: opt-in /OPT:ICFSTATIC, folded via icf_fold + dead-strip + LNK_ObjSymbolRef sym_ref = lnk_ref_from_symbol(sym); + if (sym_ref.obj != obj) { return 0; } // same-name follower (already removed) + COFF_ParsedSymbol sym_parsed = lnk_parsed_from_symbol(sym); + if (sym_parsed.section_number != sn || sym_parsed.value != 0) { return 0; } + return 1; // external leader +} + +typedef struct LNK_ICFCollectTask +{ + LNK_Obj **objs; + U64 *counts; + U64 *offsets; + LNK_ICFCand *cands; + B32 include_static; +} LNK_ICFCollectTask; + +internal +THREAD_POOL_TASK_FUNC(lnk_icf_count_task) +{ + LNK_ICFCollectTask *t = raw_task; + LNK_Obj *obj = t->objs[task_id]; + U64 n = 0; + for EachIndex(si, obj->header.section_count_no_null) { if (lnk_icf_section_kind(obj, si, t->include_static)) { n += 1; } } + t->counts[task_id] = n; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_icf_fill_task) +{ + LNK_ICFCollectTask *t = raw_task; + LNK_Obj *obj = t->objs[task_id]; + U64 cur = t->offsets[task_id]; + for EachIndex(si, obj->header.section_count_no_null) { + 
U32 kind = lnk_icf_section_kind(obj, si, t->include_static); + if (kind) { + LNK_ICFCand *c = &t->cands[cur++]; + c->obj = obj; c->sn = (U32)si + 1; + c->is_static = (kind == 2); + // count relocs here (parallel, the section is already in hand) so lnk_opt_icf only needs a + // cheap serial prefix sum for reloc_first instead of re-parsing every section serially. + COFF_SectionHeader *header = lnk_coff_section_header_from_section_number(obj, c->sn); + c->reloc_first = 0; + c->reloc_count = (U32)lnk_coff_relocs_from_section_header(obj, header).count; + c->key0 = 0; + } + } +} + +// --- parallel LSD radix sort (U64 key + U32 value), 8 x 8-bit passes ------------------------- +// 8-bit digits keep the serial per-pass offset prefix tiny (256*workers, not 65536*workers); +// the extra passes are parallel histogram+scatter, so they cost little. +#if !defined(LNK_RADIX_BITS) +#define LNK_RADIX_BITS 8 +#define LNK_RADIX_SIZE (1 << LNK_RADIX_BITS) +#endif +typedef struct LNK_RadixSortTask +{ + Rng1U64 *ranges; + U64 *ksrc; U32 *vsrc; // read + U64 *kdst; U32 *vdst; // write (scatter) + U32 *hist; // [worker_count * LNK_RADIX_SIZE], per-worker digit offsets + U64 shift; +} LNK_RadixSortTask; + +internal +THREAD_POOL_TASK_FUNC(lnk_radix_hist_task) +{ + LNK_RadixSortTask *t = raw_task; + U32 *h = t->hist + (U64)task_id * LNK_RADIX_SIZE; + for EachInRange(i, t->ranges[task_id]) { h[(t->ksrc[i] >> t->shift) & (LNK_RADIX_SIZE - 1)] += 1; } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_radix_scatter_task) +{ + LNK_RadixSortTask *t = raw_task; + U32 *h = t->hist + (U64)task_id * LNK_RADIX_SIZE; + for EachInRange(i, t->ranges[task_id]) { + U64 d = (t->ksrc[i] >> t->shift) & (LNK_RADIX_SIZE - 1); + U32 pos = h[d]++; + t->kdst[pos] = t->ksrc[i]; + t->vdst[pos] = t->vsrc[i]; + } +} + +// sort (keys[], vals[]) ascending by key, in place. arena supplies scratch (double buffers + histograms). 
+internal void +lnk_radix_sort_u64_pairs(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *vals) +{ + if (n < 2) { return; } + U64 W = tp->worker_count; + + // Only sort over the passes that can actually differ: many call sites key by dense color ids + // (small ints), so the top 4-6 radix bytes are all zero. Each skipped pass is a full + // hist+scatter over n pairs (memory-bound). An even pass_count keeps the result in the + // original arrays (the double-buffer swap invariant), so round max_key up to a full byte. + U64 max_key = 0; + for EachIndex(i, n) { if (keys[i] > max_key) { max_key = keys[i]; } } + U64 pass_count = 64 / LNK_RADIX_BITS; + if (max_key != 0) { + U64 sig_bits = 64 - clz64(max_key); + U64 sig_passes = (sig_bits + LNK_RADIX_BITS - 1) / LNK_RADIX_BITS; + pass_count = sig_passes + (sig_passes & 1); // keep even so data lands back in keys/vals + } else { + pass_count = 0; // all keys equal (0); already trivially sorted, scatter is identity + } + if (pass_count == 0) { return; } + + Temp scratch = scratch_begin(&arena, 1); // internal buffers freed on return + LNK_RadixSortTask t = {0}; + t.ranges = tp_divide_work(scratch.arena, n, W); + t.hist = push_array_no_zero(scratch.arena, U32, W * LNK_RADIX_SIZE); + U64 *kbuf = push_array_no_zero(scratch.arena, U64, n); + U32 *vbuf = push_array_no_zero(scratch.arena, U32, n); + + U64 *ksrc = keys, *kdst = kbuf; + U32 *vsrc = vals, *vdst = vbuf; + for (U64 pass = 0; pass < pass_count; pass += 1) { + t.shift = pass * LNK_RADIX_BITS; + t.ksrc = ksrc; t.vsrc = vsrc; t.kdst = kdst; t.vdst = vdst; + + MemoryZero(t.hist, sizeof(U32) * W * LNK_RADIX_SIZE); + tp_for_parallel(tp, 0, W, lnk_radix_hist_task, &t); + + // exclusive prefix across (bucket, worker) so each worker writes a disjoint contiguous run + U64 running = 0; + for EachIndex(bucket, LNK_RADIX_SIZE) { + for EachIndex(w, W) { + U32 *slot = &t.hist[w * LNK_RADIX_SIZE + bucket]; + U32 c = *slot; + *slot = (U32)running; + running += c; + } + } + + 
tp_for_parallel(tp, 0, W, lnk_radix_scatter_task, &t); + + U64 *kt = ksrc; ksrc = kdst; kdst = kt; + U32 *vt = vsrc; vsrc = vdst; vdst = vt; + } + // even pass count -> sorted data is back in the original keys/vals arrays + scratch_end(scratch); +} + +// assign each candidate a dense equivalence-class id from its key, via a parallel sort + a +// cheap sequential group scan. Returns the number of distinct classes (for convergence). +internal U64 +lnk_icf_dense_colors(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *colors) +{ + Temp t = temp_begin(arena); + U64 *sk = push_array_no_zero(t.arena, U64, n ? n : 1); + U32 *sv = push_array_no_zero(t.arena, U32, n ? n : 1); + for EachIndex(ci, n) { sk[ci] = keys[ci]; sv[ci] = (U32)ci; } + lnk_radix_sort_u64_pairs(tp, t.arena, n, sk, sv); + U64 nc = 0; + for EachIndex(k, n) { + if (k == 0 || sk[k] != sk[k - 1]) { nc += 1; } + colors[sv[k]] = (U32)(nc - 1); + } + temp_end(t); + return nc; +} + +// ICF fold VERIFY (parallel, read-only). Per color group: elect the leader, then byte+reloc-verify +// each follower against it. The dominant cost is the str8_match/memcmp byte compare over millions of +// candidate sections -- read-only, so it parallelizes across groups with no shared writes. Output: +// leader_sci[k] = the elected leader's sci-index for sorted position k IF k is a verified follower +// that folds, else max_U32. The serial apply pass (in lnk_opt_icf) consumes this in group order, so +// the symlink/icf_fold writes stay serial+deterministic (cross-class symlink chains must not race). 
+typedef struct LNK_ICFFoldVerifyTask +{ + Rng1U64 *ranges; // group-index ranges per worker + U32 *group_first; + U32 *sci; + LNK_ICFCand *cands; + U8 *rt_iscand; + U64 *rt_target; + U32 *colors; + U32 *leader_sci; // out: per sorted position -> leader's sci index, or max_U32 + U32 *group_leader_oi; // out: per group -> elected leader sorted-position (for apply) +} LNK_ICFFoldVerifyTask; + +internal +THREAD_POOL_TASK_FUNC(lnk_icf_fold_verify_task) +{ + LNK_ICFFoldVerifyTask *task = raw_task; + for EachInRange(gi, task->ranges[task_id]) { + U64 i = task->group_first[gi]; + U64 j = task->group_first[gi + 1]; + task->group_leader_oi[gi] = (U32)i; + if (j - i < 2) { continue; } + + U64 leader_oi = i; + for (U64 k = i + 1; k < j; k += 1) { + LNK_ICFCand *a = &task->cands[task->sci[leader_oi]]; + LNK_ICFCand *b = &task->cands[task->sci[k]]; + B32 b_better = (b->is_static < a->is_static) || + (b->is_static == a->is_static && + Compose64Bit(b->obj->input_idx, b->sn) < Compose64Bit(a->obj->input_idx, a->sn)); + if (b_better) { leader_oi = k; } + } + task->group_leader_oi[gi] = (U32)leader_oi; + + LNK_ICFCand *L = &task->cands[task->sci[leader_oi]]; + COFF_SectionHeader *Lheader = lnk_coff_section_header_from_section_number(L->obj, L->sn); + String8 Ldata = str8_substr(L->obj->data, rng_1u64(Lheader->foff, Lheader->foff + Lheader->fsize)); + + for (U64 k = i; k < j; k += 1) { + if (k == leader_oi) { continue; } + LNK_ICFCand *F = &task->cands[task->sci[k]]; + COFF_SectionHeader *Fheader = lnk_coff_section_header_from_section_number(F->obj, F->sn); + if (Fheader->fsize != Lheader->fsize) { continue; } + if (F->reloc_count != L->reloc_count) { continue; } + String8 Fdata = str8_substr(F->obj->data, rng_1u64(Fheader->foff, Fheader->foff + Fheader->fsize)); + if (!str8_match(Ldata, Fdata, 0)) { continue; } + B32 relocs_match = 1; + for EachIndex(t, L->reloc_count) { + U64 li = L->reloc_first + t, fi = F->reloc_first + t; + U64 lt = task->rt_iscand[li] ? 
task->colors[task->rt_target[li]] : task->rt_target[li]; + U64 ft = task->rt_iscand[fi] ? task->colors[task->rt_target[fi]] : task->rt_target[fi]; + if (lt != ft) { relocs_match = 0; break; } + } + if (!relocs_match) { continue; } + task->leader_sci[k] = task->sci[leader_oi]; // verified fold + } + } +} + +// ============================================================================================ +// ICF DIRTY-CLASS WORKLIST (TAIL ACCELERATOR) +// +// The persistent refine region re-sorts the ENTIRE active set (~10.65M candidates) every round until a +// round splits nothing. Measured churn (live UE link): the partition is ~99.99% stable by round ~7; in +// the last ~11 rounds only tens-to-hundreds of candidates land in a class that actually SPLITS, yet each +// round still pays the full ~10.65M-key sort. The work is fixed by the sort size, NOT by the (tiny) +// amount of real refinement left. That tail is ~3-4s of pure waste. +// +// A candidate's refine key can only change if its OWN color changed (its class split) OR one of its +// reloc TARGETS changed color (a target's class split). So once a class is stable AND none of the +// classes it points at split, it can NEVER split again -- re-keying/re-sorting it is provably a no-op. +// +// The worklist tracks exactly the classes that can still split (DIRTY classes). Each round it re-keys +// only members of dirty classes, against a FROZEN colors[] snapshot (Jacobi: all keys this round read +// pre-round colors; new colors commit only at the round barrier -- reading mid-round colors would +// over-refine past the unique coarsest stable partition and may never terminate). When a class splits, +// every candidate that REFERENCES a member of the split class is enqueued next round (reverse +// target->referrers index). Dirty set empty => fixpoint. +// +// This reaches the IDENTICAL coarsest stable partition as the full region (same refinement operator; we +// only skip recomputation that is provably a no-op). 
The absolute color-id VALUES differ (ids from a +// separate running counter), but EVERY downstream consumer -- fold grouping (sorts by color, groups = +// equal-color runs), leader election (by candidate identity, not id), fold-verify (tests +// colors[a]==colors[b]) -- depends ONLY on the equivalence relation, never on id values or class +// ordering. So an id-renamed but partition-identical colors[] is byte-identical output. (Asserted every +// round by ICF_WORKLIST_SELFCHECK against the full region partition over the whole UE link.) +// ============================================================================================ + +// Build CSR reverse adjacency: for target candidate T, the referrers (candidates C with a reloc C->T) +// are rev_adj[rev_off[T] .. rev_off[T+1]). Deterministic (cands iterated in order -> referrers sorted by +// referrer index). Self-edges (C references its own class) are kept; harmless (re-keys C's own class). +// serial in-place sort of (keys[],vals[]) ascending by (key, then val). Used INSIDE the worklist per +// dirty class -- classes are small (a few..hundreds of members), so a pool-free serial sort beats +// re-entering the thread pool thousands of times per round. Stable on equal keys is not required (we +// sub-sort sub-runs by val anyway), but we break key ties by val for a deterministic order. +internal void +lnk_icf_small_sort(U64 *keys, U32 *vals, U64 n) +{ + // insertion sort for tiny n; introsort-lite (here: just insertion + a simple shellsort gap for + // larger classes) keeps it dependency-free and deterministic. n is bounded by the largest class. 
+ for (U64 gap = n / 2; gap > 0; gap /= 2) { + for (U64 i = gap; i < n; i += 1) { + U64 k = keys[i]; U32 v = vals[i]; + U64 j = i; + while (j >= gap) { + U64 pk = keys[j - gap]; U32 pv = vals[j - gap]; + if (pk < k || (pk == k && pv <= v)) { break; } + keys[j] = pk; vals[j] = pv; j -= gap; + } + keys[j] = k; vals[j] = v; + } + } +} + +internal void +lnk_icf_build_reverse_index(Arena *arena, U64 cand_count, LNK_ICFCand *cands, U8 *rt_iscand, U64 *rt_target, + U32 **out_rev_off, U32 **out_rev_adj) +{ + U32 *rev_off = push_array(arena, U32, cand_count + 1); + for EachIndex(ci, cand_count) { + LNK_ICFCand *c = &cands[ci]; + for EachIndex(j, c->reloc_count) { + U64 idx = (U64)c->reloc_first + j; + if (rt_iscand[idx]) { rev_off[(U32)rt_target[idx] + 1] += 1; } + } + } + for EachIndex(t, cand_count) { rev_off[t + 1] += rev_off[t]; } + U64 edge_count = rev_off[cand_count]; + U32 *rev_adj = push_array_no_zero(arena, U32, edge_count ? edge_count : 1); + Temp t = temp_begin(arena); + U32 *cursor = push_array(t.arena, U32, cand_count ? cand_count : 1); + for EachIndex(ci, cand_count) { + LNK_ICFCand *c = &cands[ci]; + for EachIndex(j, c->reloc_count) { + U64 idx = (U64)c->reloc_first + j; + if (rt_iscand[idx]) { + U32 tg = (U32)rt_target[idx]; + rev_adj[rev_off[tg] + cursor[tg]] = (U32)ci; + cursor[tg] += 1; + } + } + } + temp_end(t); + *out_rev_off = rev_off; + *out_rev_adj = rev_adj; +} + +// Dirty-class worklist tail. Precondition: colors[] holds a stable-so-far partition; `active`/ +// `active_count` is the current active set (members of classes of size>=2); color_base = next free id. +// Runs the worklist to fixpoint, updating colors[] in place. Returns the round count via *out_rounds and +// 1 on success / 0 if the hang-guard tripped (caller must NOT ship -- it is a bug). +// +// Data model: the work unit is a CLASS (a color). Per-color member lists live in one flat slab `mem[]`; +// a class's members are mem[slot_first[s] .. slot_first[s]+slot_count[s]). 
`slot_of_color` maps a live +// color -> slot+1. A split replaces the parent slot's mem range, in sorted (newkey, cand-idx) order, with +// its sub-runs: the first sub-run KEEPS the parent slot & color; later sub-runs get fresh slots & fresh +// color ids. Singletons are dropped (final). Determinism: members in ascending cand index within a class; +// sub-runs sub-sorted by cand index; new ids from a deterministic running counter. +internal B32 +lnk_icf_worklist_refine(TP_Context *tp, Arena *arena, U64 cand_count, LNK_ICFCand *cands, + U32 *colors, U8 *rt_iscand, U64 *rt_target, + U32 *rev_off, U32 *rev_adj, + U32 *active, U64 active_count, U64 color_base, U64 *out_rounds) +{ + (void)tp; (void)cand_count; + Temp t = temp_begin(arena); + + // capacities: members <= active_count; every split adds >=1 new slot but total live members never + // exceeds active_count, and a class of size m can split into at most m sub-classes, so total slots + // ever created <= active_count. Size all slot-indexed arrays to active_count (+slack). + U64 cap = active_count + 16; + U32 *mem = push_array_no_zero(t.arena, U32, cap); + U32 *slot_first = push_array_no_zero(t.arena, U32, cap); + U32 *slot_count = push_array_no_zero(t.arena, U32, cap); + U32 *slot_color = push_array_no_zero(t.arena, U32, cap); + U64 slot_n = 0, mem_n = 0; + LNK_ICFMap slot_of_color = lnk_icf_map_make(t.arena, cap); + + // initial slots: group the active set by current color (one sort). Active members are already only in + // classes of size>=2, so every produced slot has count>=2. + { + U64 *sk = push_array_no_zero(t.arena, U64, active_count ? active_count : 1); + U32 *sv = push_array_no_zero(t.arena, U32, active_count ? 
active_count : 1); + for EachIndex(i, active_count) { sk[i] = (U64)colors[active[i]]; sv[i] = active[i]; } + lnk_radix_sort_u64_pairs(tp, t.arena, active_count, sk, sv); // by color, ties by cand idx (stable-by-value) + for (U64 a = 0; a < active_count; ) { + U32 col = (U32)sk[a]; + U64 b = a; while (b < active_count && (U32)sk[b] == col) { b += 1; } + U32 first = (U32)mem_n; + for (U64 q = a; q < b; q += 1) { mem[mem_n++] = sv[q]; } + slot_first[slot_n] = first; slot_count[slot_n] = (U32)(b - a); slot_color[slot_n] = col; + lnk_icf_map_put(&slot_of_color, col, slot_n + 1); + slot_n += 1; + a = b; + } + } + + // dirty worklist = slots to re-densify this round. Seed with all slots. + U32 *dirty = push_array_no_zero(t.arena, U32, cap); + U32 *next_dirty = push_array_no_zero(t.arena, U32, cap); + U8 *in_dirty = push_array(t.arena, U8, cap); // dedup membership for next_dirty (cleared after swap) + U64 dirty_n = 0; + for EachIndex(s, slot_n) { dirty[dirty_n++] = (U32)s; } + + // staged this round (committed only at the round barrier so re-keying reads FROZEN colors[]): + // - new color assignments (ci -> color); applied to colors[] after the round + // - newly created slots (from 2nd+ sub-runs of splits) + // The parent slot's mem range is rewritten in place immediately (disjoint from other slots' ranges and + // independent of colors[]), so member order is settled during the dirty loop; only colors[]/slot + // registration is deferred. + U32 *stage_ci = push_array_no_zero(t.arena, U32, active_count ? active_count : 1); + U32 *stage_col = push_array_no_zero(t.arena, U32, active_count ? active_count : 1); + U32 *split_mem = push_array_no_zero(t.arena, U32, active_count ? 
active_count : 1); // members of split classes + U32 *ns_slot = push_array_no_zero(t.arena, U32, cap); // staged new slot indices + U32 *ns_first = push_array_no_zero(t.arena, U32, cap); + U32 *ns_count = push_array_no_zero(t.arena, U32, cap); + U32 *ns_color = push_array_no_zero(t.arena, U32, cap); + + // hang-guard: hard round cap AND non-progress detection. The worklist tail converges in <~20 cheap + // rounds; if dirty_n fails to strictly shrink for too many consecutive rounds, it is a re-dirty bug. + enum { LNK_ICF_WL_MAX_ROUNDS = 40, LNK_ICF_WL_STALL_LIMIT = 8 }; + U64 rounds = 0, stall = 0, prev_dirty_n = max_U64; + B32 ok = 1; + + while (dirty_n > 0) { + if (rounds >= LNK_ICF_WL_MAX_ROUNDS) { ok = 0; break; } + if (dirty_n >= prev_dirty_n) { stall += 1; if (stall >= LNK_ICF_WL_STALL_LIMIT) { ok = 0; break; } } + else { stall = 0; } + prev_dirty_n = dirty_n; + rounds += 1; + + U64 stage_n = 0, split_n = 0, ns_n = 0; + + for EachIndex(di, dirty_n) { + U32 slot = dirty[di]; + U32 m = slot_count[slot]; + if (m < 2) { continue; } + U32 first = slot_first[slot]; + + // re-key this class's members against the FROZEN colors[] snapshot (colors[] not yet mutated) + Temp tk = temp_begin(t.arena); + U64 *kk = push_array_no_zero(tk.arena, U64, m); + U32 *vv = push_array_no_zero(tk.arena, U32, m); + for EachIndex(r, m) { + U32 ci = mem[first + r]; + LNK_ICFCand *c = &cands[ci]; + U64 k = lnk_icf_mix(0x9e3779b97f4a7c15ull, colors[ci]); + for EachIndex(j, c->reloc_count) { + U64 idx = (U64)c->reloc_first + j; + U64 tt = rt_iscand[idx] ? colors[rt_target[idx]] : rt_target[idx]; + k = lnk_icf_mix(k, tt); + } + kk[r] = k; vv[r] = ci; + } + lnk_icf_small_sort(kk, vv, m); // serial, pool-free: group members by new key (ties by cand idx) + + U64 sub = 0; + for EachIndex(r, m) { if (r == 0 || kk[r] != kk[r - 1]) { sub += 1; } } + if (sub == 1) { temp_end(tk); continue; } // class did not split + + // class splits. 
Rewrite parent mem range in sorted vv order (settles member order now). The 1st + // sub-run keeps slot/color; later sub-runs are staged as new slots with fresh ids. colors[] writes + // are STAGED (deferred to commit) so other dirty classes this round still read frozen colors. + for EachIndex(r, m) { mem[first + r] = vv[r]; } + U64 run0 = 0; B32 first_run = 1; + for (U64 r = 1; r <= m; r += 1) { + B32 boundary = (r == m) || (kk[r] != kk[r - 1]); + if (!boundary) { continue; } + U64 rn = r - run0; + U32 run_first = (U32)(first + run0); + U32 run_color; + if (first_run) { + run_color = slot_color[slot]; first_run = 0; + // parent slot's count shrinks to the 1st sub-run; staged so dirty loop still sees old count. + ns_slot[ns_n] = slot; // reuse parent slot + } else { + run_color = (U32)color_base; color_base += 1; + ns_slot[ns_n] = max_U32; // allocate a fresh slot at commit + } + ns_first[ns_n] = run_first; ns_count[ns_n] = (U32)rn; ns_color[ns_n] = run_color; ns_n += 1; + for (U64 q = run0; q < r; q += 1) { + U32 ci = mem[first + q]; + stage_ci[stage_n] = ci; stage_col[stage_n] = run_color; stage_n += 1; + split_mem[split_n++] = ci; + } + run0 = r; + } + temp_end(tk); + } + + if (stage_n == 0) { break; } // nothing split -> fixpoint + + // COMMIT (round barrier): register slots, then apply colors[]. (Order matters only that all reads in + // the dirty loop above already finished -- they did.) + for EachIndex(s, ns_n) { + U32 rslot = ns_slot[s]; + if (rslot == max_U32) { rslot = (U32)slot_n; slot_n += 1; } + slot_first[rslot] = ns_first[s]; slot_count[rslot] = ns_count[s]; slot_color[rslot] = ns_color[s]; + lnk_icf_map_put(&slot_of_color, (U64)ns_color[s], rslot + 1); + } + for EachIndex(i, stage_n) { colors[stage_ci[i]] = stage_col[i]; } + + // next dirty set: every referrer of a split-class member, mapped to its CURRENT class slot (size>=2). 
+ U64 next_n = 0; + for EachIndex(i, split_n) { + U32 mem_ci = split_mem[i]; + for (U32 e = rev_off[mem_ci]; e < rev_off[mem_ci + 1]; e += 1) { + U32 ref = rev_adj[e]; + U64 sc = lnk_icf_map_get(&slot_of_color, (U64)colors[ref], 0); + if (sc) { + U32 rslot = (U32)(sc - 1); + if (slot_count[rslot] >= 2 && !in_dirty[rslot]) { in_dirty[rslot] = 1; next_dirty[next_n++] = rslot; } + } + } + } + U32 *tmp = dirty; dirty = next_dirty; next_dirty = tmp; + dirty_n = next_n; + for EachIndex(i, dirty_n) { in_dirty[dirty[i]] = 0; } + + if (lnk_get_log_status(LNK_Log_Debug)) { + lnk_log(LNK_Log_Debug, "/OPT:ICF worklist round %llu: split_members=%llu next_dirty=%llu slots=%llu", + rounds, split_n, dirty_n, slot_n); + } + } + + temp_end(t); + if (out_rounds) { *out_rounds = rounds; } + return ok; +} + +// relocations point at equivalent targets (iteratively), then folds each group's followers +// into a single leader by routing them through the existing COMDAT symlink machinery and +// letting /OPT:REF garbage-collect the now-unreferenced follower sections (and their +// associated .pdata/.xdata/.debug$S). Mirrors link.exe /OPT:ICF. +internal void +lnk_opt_icf(TP_Context *tp, Arena *perm, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs_list) +{ + if (config->opt_icf != LNK_SwitchState_Yes) { return; } + + ProfBeginFunction(); + Temp scratch = scratch_begin(&perm, 1); + Arena *arena = scratch.arena; + + U64 objs_count = objs_list.count; + LNK_Obj **objs = lnk_array_from_obj_list(arena, objs_list); + + // allocate the per-section static-fold map on every obj (sn-1 indexed, zeroed = not folded). It + // lives on the obj (not scratch) because /OPT:REF and the final section map read it after ICF. + for EachIndex(oi, objs_count) { + objs[oi]->icf_fold = push_array(perm, LNK_ICFFold, objs[oi]->header.section_count_no_null ? 
objs[oi]->header.section_count_no_null : 1); + } + + // collect candidate sections (parallel per obj): count, exact-size, then fill at per-obj + // offsets. Counting first avoids sizing the array to the total section count (which would be + // ~10x larger and waste ~1GB on UE-scale inputs). + LNK_ICFCollectTask col = {0}; + col.include_static = (config->opt_icf_static == LNK_SwitchState_Yes); + col.objs = objs; + col.counts = push_array(arena, U64, objs_count ? objs_count : 1); + tp_for_parallel(tp, 0, objs_count, lnk_icf_count_task, &col); + col.offsets = offsets_from_counts_array_u64(arena, col.counts, objs_count); + U64 cand_count = sum_array_u64(objs_count, col.counts); + LNK_ICFCand *cands = push_array_no_zero(arena, LNK_ICFCand, cand_count ? cand_count : 1); + col.cands = cands; + tp_for_parallel(tp, 0, objs_count, lnk_icf_fill_task, &col); + + if (cand_count < 2) { goto done; } + + // build target lookup (input_idx, section_number) -> cand_idx+1, sized to the candidate count + LNK_ICFMap cand_map = lnk_icf_map_make(arena, cand_count); + { + LNK_ICFCandMapPutTask put_task = {0}; + put_task.ranges = tp_divide_work(arena, cand_count, tp->worker_count); + put_task.cands = cands; + put_task.cand_map = &cand_map; + tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_candmap_put_task, &put_task); + } + + // assign each candidate a disjoint slice in the flattened reloc-target arrays. reloc_count was + // filled in parallel by lnk_icf_fill_task, so this is just a serial prefix sum (no re-parsing). + U64 total_relocs = 0; + for EachIndex(ci, cand_count) { + cands[ci].reloc_first = (U32)total_relocs; + total_relocs += cands[ci].reloc_count; + } + U8 *rt_iscand = push_array_no_zero(arena, U8, total_relocs ? total_relocs : 1); + U64 *rt_target = push_array_no_zero(arena, U64, total_relocs ? 
total_relocs : 1); + + { + LNK_ICFHashTask hash_task = {0}; + hash_task.ranges = tp_divide_work(arena, cand_count, tp->worker_count); + hash_task.cands = cands; + hash_task.cand_map = &cand_map; + hash_task.symtab = symtab; + hash_task.rt_iscand = rt_iscand; + hash_task.rt_target = rt_target; + tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_hash_task, &hash_task); + } + + // dense per-candidate equivalence-class colors (separate 8B array; hot refine/scan loops gather + // these by index millions of times -- keeping them out of the 40B LNK_ICFCand keeps the gathers + // cache-dense). zero-init: round 0 assigns real colors. + U32 *colors = push_array(arena, U32, cand_count ? cand_count : 1); + + // assign initial colors from content key (parallel sort + group scan) + U64 *newkey = push_array_no_zero(arena, U64, cand_count ? cand_count : 1); + for EachIndex(ci, cand_count) { newkey[ci] = cands[ci].key0; } + U64 class_count = lnk_icf_dense_colors(tp, arena, cand_count, newkey, colors); + U64 color_base = class_count; // next free class id; ids only ever grow, so finalized colors stick + + // Active set = candidates still sharing a class with another. A singleton class can never split + // or merge, so once a candidate is alone its color is final and it leaves refinement. Each round + // re-densifies only the active set, shrinking the per-round sort from all ~N candidates down to + // just those that still have a content+reloc twin. + // ping-pong active buffers: refine reads `active`, densify writes compacted survivors into `active2`. + // They must be distinct (densify reads active[sv[m]] while writing next_active[]). Swap each round. + U32 *active = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + U32 *active2 = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + U64 active_count = 0, active_class_count = 0; + { + Temp t = temp_begin(arena); + U32 *cls_size = push_array(t.arena, U32, class_count ? 
class_count : 1); + for EachIndex(ci, cand_count) { cls_size[colors[ci]] += 1; } + for EachIndex(ci, cand_count) { if (cls_size[colors[ci]] > 1) { active[active_count++] = (U32)ci; } } + for EachIndex(c, class_count) { if (cls_size[c] > 1) { active_class_count += 1; } } + temp_end(t); + } + + // iteratively refine classes by relocation-target classes until no class splits. + // PERSISTENT-WORKER REGION: collapse the ~18 rounds x ~10 phases of fork-join into ONE parallel + // region (lnk_icf_refine_region_task) where every phase boundary is a barrier and the serial glue + // (radix prefixes, group-scan prefix, convergence/swap) runs on worker 0. Byte-identical to the + // per-round path (preserves chunk boundaries, processing order, and serial-prefix math). + if (active_count > 0) { + LNK_ICFRegion rs = {0}; + rs.tp = tp; + rs.cands = cands; + rs.rt_iscand = rt_iscand; + rs.rt_target = rt_target; + rs.cand_count = cand_count; + rs.worker_count = tp->worker_count; + rs.active = active; + rs.active2 = active2; + rs.colors = colors; + rs.newkey = newkey; + + // per-round scratch, preallocated once to the max possible size (cand_count); reused every round. + U64 W = tp->worker_count; + rs.sk = push_array_no_zero(arena, U64, cand_count ? cand_count : 1); + rs.sv = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + rs.color_at = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + rs.keep = push_array_no_zero(arena, U8, cand_count ? cand_count : 1); + rs.out_slot = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + rs.kbuf = push_array_no_zero(arena, U64, cand_count ? cand_count : 1); + rs.vbuf = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + rs.hist = push_array_no_zero(arena, U32, W * LNK_RADIX_SIZE); + rs.chunk_nc = push_array_no_zero(arena, U64, W ? W : 1); + rs.chunk_kp = push_array_no_zero(arena, U64, W ? W : 1); + rs.chunk_sc = push_array_no_zero(arena, U64, W ? 
W : 1); + rs.chunk_max = push_array_no_zero(arena, U64, (W > 3 ? W : 3) ? (W > 3 ? W : 3) : 1); + rs.refine_ranges = push_array_no_zero(arena, Rng1U64, W + 1); + rs.work_ranges = push_array_no_zero(arena, Rng1U64, W + 1); + + rs.active_count = active_count; + rs.active_class_count = active_class_count; + rs.color_base = color_base; + rs.converged = 0; + + // TAIL ACCELERATOR: run the persistent region for a bounded number of warm-up rounds (churn is large + // here, the full parallel sort wins), then hand the still-active set to the dirty-class worklist for + // the long near-converged tail (rounds 8..18 re-sort ~10.65M to resolve a few hundred splits). Both + // compute the unique coarsest stable partition, so the final colors[] partition is identical; the + // worklist just reaches it in tail work ~= churn instead of ~= active_count. + // Worklist tail DISABLED by default: its CSR reverse-index (rev_adj = U32 x candidate-edge-count) + // costs ~10GB peak on the UE link, which hurts the page-fault-bound critical path more than the + // ~3-4s of tail re-sorts it saves. region_cap=64 lets the persistent region run to the true fixpoint + // (~19 rounds), so `converged` is set and the worklist handoff below is skipped (no reverse-index). + // Lower this back to 8 to re-enable the worklist on memory-rich machines. + U64 region_cap = 64; // was 8 (worklist warm-up) + +#if defined(ICF_WORKLIST_SELFCHECK) + // REFERENCE: clone pre-region state and run the region UNCAPPED to the true fixpoint into shadow + // colors[]; later assert the cap+worklist partition is identical every link. Reuses rs scratch + // buffers (sequential, not concurrent). active2/colors are restored to pre-region values first. + U32 *ref_colors = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + MemoryCopy(ref_colors, colors, sizeof(U32) * cand_count); + { + U32 *ref_active = push_array_no_zero(arena, U32, cand_count ? 
cand_count : 1); + U32 *ref_active2 = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + MemoryCopy(ref_active, active, sizeof(U32) * active_count); + LNK_ICFRegion ref = rs; // copy all scratch pointers + sizes + ref.active = ref_active; + ref.active2 = ref_active2; + ref.colors = ref_colors; + ref.active_count = active_count; ref.active_class_count = active_class_count; + ref.color_base = color_base; ref.converged = 0; + ref.max_rounds = 64; // uncapped (true fixpoint) + tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_refine_region_task, &ref); + } + // colors[] was untouched (ref ran on ref_colors); active/active2 untouched (ref used clones). +#endif + + rs.max_rounds = region_cap; + tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_refine_region_task, &rs); + + // mirror final state back (active/colors already mutated in place; pointers may have swapped) + active = rs.active; + active2 = rs.active2; + color_base = rs.color_base; + active_count = rs.active_count; + active_class_count = rs.active_class_count; + + // hand off the tail to the dirty-class worklist if the region stopped at the cap (not yet converged) + if (!rs.converged && active_count > 0) { + U32 *rev_off = 0, *rev_adj = 0; + lnk_icf_build_reverse_index(arena, cand_count, cands, rt_iscand, rt_target, &rev_off, &rev_adj); + U64 wrounds = 0; + B32 wl_ok = lnk_icf_worklist_refine(tp, arena, cand_count, cands, colors, rt_iscand, rt_target, + rev_off, rev_adj, active, active_count, color_base, &wrounds); + AssertAlways(wl_ok); // hang-guard tripped (round cap / non-progress) -> BUG, do not ship + if (lnk_get_log_status(LNK_Log_Debug)) { + lnk_log(LNK_Log_Debug, "/OPT:ICF worklist tail: %llu rounds (active=%llu at handoff)", wrounds, active_count); + } + } + +#if defined(ICF_WORKLIST_SELFCHECK) + // ASSERT: the cap+worklist colors[] induce the SAME partition as the uncapped region (ref_colors). 
+ // Partition equality = the map colors[ci] -> ref_colors[ci] is a consistent bijection (both ways). + { + Temp tc = temp_begin(arena); + LNK_ICFMap fwd = lnk_icf_map_make(tc.arena, cand_count); // colors -> ref_colors + LNK_ICFMap bwd = lnk_icf_map_make(tc.arena, cand_count); // ref_colors -> colors + U64 mism = 0; + for EachIndex(ci, cand_count) { + U64 a = (U64)colors[ci], b = (U64)ref_colors[ci]; + U64 fa = lnk_icf_map_get(&fwd, a, max_U64); + if (fa == max_U64) { lnk_icf_map_put(&fwd, a, b); } else if (fa != b) { mism += 1; } + U64 fb = lnk_icf_map_get(&bwd, b, max_U64); + if (fb == max_U64) { lnk_icf_map_put(&bwd, b, a); } else if (fb != a) { mism += 1; } + } + if (mism) { + lnk_log(LNK_Log_Debug, "ICF_WORKLIST_SELFCHECK PARTITION MISMATCH: %llu cands disagree", mism); + } + AssertAlways(mism == 0); // worklist partition != full-region partition -> WRONG OUTPUT, do not ship + temp_end(tc); + } +#endif + } + (void)active2; (void)color_base; (void)active_count; (void)active_class_count; + + // group candidates by final color (parallel sort) and fold followers into a leader + U64 *keys = newkey; // reuse + U32 *sci = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + for EachIndex(ci, cand_count) { keys[ci] = colors[ci]; sci[ci] = (U32)ci; } + lnk_radix_sort_u64_pairs(tp, arena, cand_count, keys, sci); + + // serial pass: find group boundaries (cheap O(cand_count) scan over the sorted color keys). Each + // group [group_first[g], group_first[g+1]) shares one color. Verification + folding runs in + // parallel across groups (groups touch disjoint sections/symlink nodes -> race-free, deterministic). 
+ U32 *group_first = push_array_no_zero(arena, U32, cand_count + 1); + U64 group_count = 0; + for (U64 i = 0; i < cand_count; ) { + group_first[group_count++] = (U32)i; + U64 color = keys[i]; + U64 j = i + 1; + while (j < cand_count && keys[j] == color) { j += 1; } + i = j; + } + group_first[group_count] = (U32)cand_count; + + // PARALLEL verify (read-only byte+reloc compare across groups) -> SERIAL apply (deterministic, + // group-ordered symlink/icf_fold writes; cross-class symlink chains must not race, so writes stay + // serial). leader_sci[k] = leader's sci index if sorted position k folds, else max_U32. + U32 *leader_sci = push_array_no_zero(arena, U32, cand_count ? cand_count : 1); + for EachIndex(k, cand_count) { leader_sci[k] = max_U32; } + U32 *group_leader_oi = push_array_no_zero(arena, U32, group_count ? group_count : 1); + if (group_count) { + LNK_ICFFoldVerifyTask vt = {0}; + vt.ranges = tp_divide_work(arena, group_count, tp->worker_count); + vt.group_first = group_first; + vt.sci = sci; + vt.cands = cands; + vt.rt_iscand = rt_iscand; + vt.rt_target = rt_target; + vt.colors = colors; + vt.leader_sci = leader_sci; + vt.group_leader_oi = group_leader_oi; + tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_fold_verify_task, &vt); + } + + U64 fold_count = 0; + for EachIndex(gi, group_count) { + U64 i = group_first[gi]; + U64 j = group_first[gi + 1]; + if (j - i < 2) { continue; } + U64 leader_oi = group_leader_oi[gi]; + LNK_ICFCand *L = &cands[sci[leader_oi]]; + LNK_SymbolHashTrie *Lnode = L->obj->symlinks[L->sn]; + for (U64 k = i; k < j; k += 1) { + if (leader_sci[k] == max_U32) { continue; } // not a verified follower + LNK_ICFCand *F = &cands[sci[k]]; + LNK_SymbolHashTrie *Fnode = F->obj->symlinks[F->sn]; + if (Fnode) { + if (Lnode && Fnode != Lnode) { Fnode->symbol = Lnode->symbol; fold_count += 1; } + } else { + F->obj->icf_fold[F->sn - 1].leader_obj_idx = L->obj->input_idx; + F->obj->icf_fold[F->sn - 1].leader_sn = L->sn; + F->obj->icf_fold[F->sn - 
1].set = 1; + fold_count += 1; + } + } + } + + if (lnk_get_log_status(LNK_Log_Debug)) { + lnk_log(LNK_Log_Debug, "/OPT:ICF folded %llu of %llu code COMDATs into %llu classes", fold_count, cand_count, class_count); + } + + done:; + scratch_end(scratch); + ProfEnd(); +} + internal void lnk_opt_ref(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs) { @@ -2926,6 +4650,28 @@ THREAD_POOL_TASK_FUNC(lnk_set_comdat_leaders_contribs_task) ProfEnd(); } +// /OPT:ICF static-COMDAT followers have no external symbol to redirect, so point their section-map +// entry at the leader's contrib. The follower section is dead-stripped, but any residual reloc to a +// static symbol in it resolves its address via sect_map[obj][sn-1] -> leader's contrib. +internal +THREAD_POOL_TASK_FUNC(lnk_set_icf_static_leader_contribs_task) +{ + LNK_BuildImageTask *task = raw_task; + U64 obj_idx = task_id; + LNK_Obj *obj = task->objs[obj_idx]; + + if (obj->icf_fold == 0) { return; } + + ProfBeginV("Set ICF Static Leader Contribs [%S]", obj->path); + for EachIndex(sect_idx, obj->header.section_count_no_null) { + LNK_ICFFold fold = obj->icf_fold[sect_idx]; + if (!fold.set) { continue; } + LNK_SectionContrib *leader_sc = task->sect_map[fold.leader_obj_idx][fold.leader_sn - 1]; + task->sect_map[obj_idx][sect_idx] = leader_sc; + } + ProfEnd(); +} + internal THREAD_POOL_TASK_FUNC(lnk_flag_debug_symbols_task) { @@ -2981,15 +4727,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_comdat_leaders_task) task->u.patch_symtabs.was_symbol_patched[obj_idx][symbol_idx] = 1; } - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol.raw_symbol; - symbol32->section_number = section_number; - symbol32->value = value; - } else { - COFF_Symbol16 *symbol16 = symbol.raw_symbol; - symbol16->section_number = (U16)section_number; - symbol16->value = value; - } + obj->parsed_symbols[symbol_idx].section_number = section_number; + obj->parsed_symbols[symbol_idx].value = value; } } } @@ -3010,11 +4749,33 @@ 
lnk_section_contrib_ptr_is_before(void *raw_a, void *raw_b) return u64_compar_is_before(&input_idx_a, &input_idx_b); } +// chunks at/above this size are sorted by the parallel radix (all threads) before the per-chunk +// task pass, so one giant section (e.g. merged .text) can't serialize the whole sort on one thread. +#define LNK_SORT_CONTRIBS_RADIX_MIN (64u*1024u) + +// sort one chunk's contribs by Compose64Bit(obj_idx, obj_sect_idx) using the parallel radix sort. +// The key is unique per contrib (one obj/section pair each), so this matches the radsort order. +internal void +lnk_sort_contribs_chunk_radix(TP_Context *tp, Arena *arena, LNK_SectionContribChunk *chunk) +{ + U64 n = chunk->count; + Temp t = temp_begin(arena); + U64 *keys = push_array_no_zero(t.arena, U64, n); + U32 *idx = push_array_no_zero(t.arena, U32, n); + for EachIndex(i, n) { keys[i] = Compose64Bit(chunk->v[i]->u.obj_idx, chunk->v[i]->u.obj_sect_idx); idx[i] = (U32)i; } + lnk_radix_sort_u64_pairs(tp, t.arena, n, keys, idx); + LNK_SectionContrib **sorted = push_array_no_zero(t.arena, LNK_SectionContrib *, n); + for EachIndex(i, n) { sorted[i] = chunk->v[idx[i]]; } + MemoryCopy(chunk->v, sorted, n * sizeof(sorted[0])); + temp_end(t); +} + internal THREAD_POOL_TASK_FUNC(lnk_sort_contribs_task) { LNK_BuildImageTask *task = raw_task; LNK_SectionContribChunk *chunk = task->u.sort_contribs.chunks[task_id]; + if (chunk->count >= LNK_SORT_CONTRIBS_RADIX_MIN) { return; } // big chunks done via parallel radix ProfBeginV("[%llu]", chunk->count); radsort(chunk->v, chunk->count, lnk_section_contrib_ptr_is_before); ProfEnd(); @@ -3053,15 +4814,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_common_block_leaders_task) COFF_ParsedSymbol parsed_symbol = lnk_parsed_from_symbol(symbol); U64 section_number = task->u.patch_symtabs.common_block_sect->sect_idx + 1; - if (symbol_ref.obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = parsed_symbol.raw_symbol; - symbol32->value = contrib->u.offset; - symbol32->section_number = 
safe_cast_u32(section_number); - } else { - COFF_Symbol16 *symbol16 = parsed_symbol.raw_symbol; - symbol16->value = contrib->u.offset; - symbol16->section_number = safe_cast_u16(section_number); - } + symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].value = contrib->u.offset; + symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].section_number = safe_cast_u32(section_number); task->u.patch_symtabs.was_symbol_patched[symbol_ref.obj->input_idx][symbol_ref.symbol_idx] = 1; } @@ -3086,17 +4840,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_common_block_symbols_task) COFF_ParsedSymbol defn_parsed = lnk_parsed_from_symbol(defn); Assert(lnk_interp_from_symbol(defn) == COFF_SymbolValueInterp_Regular); if (defn) { - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol.raw_symbol; - symbol32->section_number = defn_parsed.section_number; - symbol32->value = safe_cast_u32(defn_parsed.value); - symbol32->storage_class = COFF_SymStorageClass_Static; - } else { - COFF_Symbol16 *symbol16 = symbol.raw_symbol; - symbol16->section_number = safe_cast_u16(defn_parsed.section_number); - symbol16->value = safe_cast_u32(defn_parsed.value); - symbol16->storage_class = COFF_SymStorageClass_Static; - } + obj->parsed_symbols[symbol_idx].section_number = defn_parsed.section_number; + obj->parsed_symbols[symbol_idx].value = defn_parsed.value; + obj->parsed_symbols[symbol_idx].storage_class = COFF_SymStorageClass_Static; } } } @@ -3134,15 +4880,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_regular_symbols_task) value = sc->u.off + symbol.value; } - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol.raw_symbol; - symbol32->section_number = section_number; - symbol32->value = value; - } else { - COFF_Symbol16 *symbol16 = symbol.raw_symbol; - symbol16->section_number = safe_cast_u16(section_number); - symbol16->value = value; - } + obj->parsed_symbols[symbol_idx].section_number = section_number; + obj->parsed_symbols[symbol_idx].value = value; } } ProfEnd(); @@ -3179,17 +4918,9 @@ 
lnk_patch_obj_symtab(LNK_SymbolTable *symtab, LNK_Obj *obj, B8 *was_symbol_patch value = fixup_src.value; } - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = fixup_dst.raw_symbol; - symbol32->section_number = section_number; - symbol32->value = value; - symbol32->storage_class = COFF_SymStorageClass_Static; - } else { - COFF_Symbol16 *symbol16 = fixup_dst.raw_symbol; - symbol16->section_number = (U16)section_number; - symbol16->value = value; - symbol16->storage_class = COFF_SymStorageClass_Static; - } + obj->parsed_symbols[symbol_idx].section_number = section_number; + obj->parsed_symbols[symbol_idx].value = value; + obj->parsed_symbols[symbol_idx].storage_class = COFF_SymStorageClass_Static; was_symbol_patched[symbol_idx] = 1; } @@ -3235,6 +4966,14 @@ THREAD_POOL_TASK_FUNC(lnk_image_fill_task) for EachNode(n, LNK_ImageFillNode, task->u.image_fill.fill_nodes[task_id]) { for EachIndex(i, n->sc_count) { LNK_SectionContrib *sc = n->sc[i]; + // fast-path: the vast majority of contribs are a single data-node -> one direct copy, skipping + // the list-walk + cursor bookkeeping on the hot 739MB image-write loop. 
+ if (sc->first_data_node.next == 0) { + U64 image_off = sc->u.off + n->base_foff; + Assert(image_off + sc->first_data_node.string.size <= image_data.size); + MemoryCopyStr8(image_data.str + image_off, sc->first_data_node.string); + continue; + } U64 cursor = 0; for EachNode(data_n, String8Node, &sc->first_data_node) { U64 image_off = sc->u.off + n->base_foff + cursor; @@ -3870,17 +5609,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_section_symbols_task) if (sect && (~sect->flags & COFF_SectionFlag_LnkRemove)) { if (~sect->flags & COFF_SectionFlag_MemDiscardable) { LNK_SectionContrib *first_sc = lnk_get_first_section_contrib(sect); - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol.raw_symbol; - symbol32->section_number = safe_cast_u32(first_sc->u.sect_idx + 1); - symbol32->value = first_sc->u.off; - symbol32->storage_class = COFF_SymStorageClass_Static; - } else { - COFF_Symbol16 *symbol16 = symbol.raw_symbol; - symbol16->section_number = safe_cast_u16(first_sc->u.sect_idx + 1); - symbol16->value = first_sc->u.off; - symbol16->storage_class = COFF_SymStorageClass_Static; - } + obj->parsed_symbols[symbol_idx].section_number = safe_cast_u32(first_sc->u.sect_idx + 1); + obj->parsed_symbols[symbol_idx].value = first_sc->u.off; + obj->parsed_symbols[symbol_idx].storage_class = COFF_SymStorageClass_Static; } else { lnk_error_obj(LNK_Error_SectRefsDiscardedMemory, obj, "symbol %S (No. 
0x%llx) references section with discard flag", symbol.name, symbol_idx); } @@ -3898,17 +5629,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_section_symbols_task) LNK_Section *fallback_sect = task->image_sects.v[task->image_sects.count-1]; U32 fallback_section_number = safe_cast_u32(fallback_sect->sect_idx + 1); U32 fallback_section_offset = safe_cast_u32(fallback_voff - fallback_sect->voff); - if (obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol.raw_symbol; - symbol32->section_number = fallback_section_number; - symbol32->value = fallback_section_offset; - symbol32->storage_class = COFF_SymStorageClass_Static; - } else { - COFF_Symbol16 *symbol16 = symbol.raw_symbol; - symbol16->section_number = safe_cast_u16(fallback_section_number); - symbol16->value = fallback_section_offset; - symbol16->storage_class = COFF_SymStorageClass_Static; - } + obj->parsed_symbols[symbol_idx].section_number = fallback_section_number; + obj->parsed_symbols[symbol_idx].value = fallback_section_offset; + obj->parsed_symbols[symbol_idx].storage_class = COFF_SymStorageClass_Static; lnk_error_obj(LNK_Warning_UndefinedSectionSymbol, obj, "undefined section symbol %S (No. 0x%llx) refers to an image section that doesn't exist; patching to %#llx", symbol.name, symbol_idx, fallback_voff); } @@ -4563,6 +6286,14 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT Assert(cursor == total_chunk_count); } + // big chunks (e.g. merged .text) first, each sorted across all threads via parallel radix -- + // otherwise a single huge chunk serializes on one worker. Small chunks then go one-per-task. 
+ for EachIndex(ci, total_chunk_count) { + LNK_SectionContribChunk *chunk = task.u.sort_contribs.chunks[ci]; + if (chunk->count >= LNK_SORT_CONTRIBS_RADIX_MIN) { + lnk_sort_contribs_chunk_radix(tp, scratch.arena, chunk); + } + } tp_for_parallel(tp, 0, total_chunk_count, lnk_sort_contribs_task, &task); ProfEnd(); @@ -4570,6 +6301,9 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT tp_for_parallel_prof(tp, 0, objs_count, lnk_set_comdat_leaders_contribs_task, &task, "Update Section Map With COMDAT Leader Contribs"); + // /OPT:ICF: redirect folded static-COMDAT followers' section-map entries to their leader contrib + tp_for_parallel_prof(tp, 0, objs_count, lnk_set_icf_static_leader_contribs_task, &task, "Update Section Map With ICF Static Leader Contribs"); + // build common block // // TODO: build common block in .bss and merge with .data @@ -4774,7 +6508,11 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT ProfBeginV("Alloc Image Buffer [%M]", lnk_section_table_total_fsize(sectab)); image_data.size = lnk_section_table_total_fsize(sectab) + image_string_table.total_size; - image_data.str = push_array_no_zero(arena->v[0], U8, image_data.size); + // Standalone reservation (not the shared link arena) so it can be released the instant the image + // is written to disk -- VirtualFree returns fast and the kernel zeroes this ~1GB on its background + // thread, overlapping the rest of the run, instead of in the single-threaded exit rundown. 
+ image_data.str = reserve_memory(image_data.size); + commit_memory(image_data.str, image_data.size); ProfEnd(); ProfBegin("Fill Align Bytes"); @@ -5293,6 +7031,7 @@ lnk_write_thread(void *raw_ctx) ProfEnd(); } + internal void lnk_log_timers(void) { @@ -5322,10 +7061,39 @@ lnk_log_timers(void) StringJoin new_line_join = { str8_lit_comp(""), str8_lit_comp("\n"), str8_lit_comp("") }; String8 output = str8_list_join(scratch.arena, &output_list, &new_line_join); lnk_log(LNK_Log_Timers, "%S\n", output); - + + // Diagnostic: when RADLINK_PHASE_LOG is set, also write machine-parseable raw + // per-phase micros to that file (for automated perf A/B). Env-unset -> no-op, + // so normal/validation links are byte-identical; this never touches DLL/PDB bytes. + char *phase_log_path = getenv("RADLINK_PHASE_LOG"); + if (phase_log_path != 0 && phase_log_path[0] != 0) { + String8List raw_list = {0}; + for (U64 i = 0; i < LNK_Timer_Count; ++i) { + str8_list_pushf(scratch.arena, &raw_list, "%S %llu\n", lnk_string_from_timer_type(i), g_timers[i].end - g_timers[i].begin); + } + str8_list_pushf(scratch.arena, &raw_list, "TOTAL %llu\n", total_build_time_micro); + String8 raw_str = str8_list_join(scratch.arena, &raw_list, 0); + lnk_write_data_to_file_path(str8_cstring(phase_log_path), str8_zero(), raw_str); + } + scratch_end(scratch); } +internal +THREAD_POOL_TASK_FUNC(lnk_scratch_decommit_worker) +{ + // Each worker decommits the committed-but-unused pages of its OWN equipped + // tctx scratch arenas. Runs on the worker thread, so tctx_selected() yields + // that worker's scratch. No cross-thread arena access. + // + // The barrier (dispatched with task_count == worker_count) guarantees every + // worker runs the body exactly once -- otherwise the work-stealing loop could + // let one fast worker grab several tasks and leave other workers' scratch + // committed. All worker_count threads are woken, so all must reach the barrier. 
+ tctx_scratch_decommit(); + barrier_wait(tp->barrier); +} + internal THREAD_POOL_TASK_FUNC(lnk_p2r_worker) { @@ -5381,6 +7149,52 @@ lnk_debug_filter_objs(Arena *arena, LNK_Obj **objs, U64 objs_count, U64 *count_o return debug_info_objs; } +// Parallel release of memory-mapped input file views. +// Inputs are mapped copy-on-write (PAGE_WRITECOPY/FILE_MAP_COPY); pages touched +// during linking become private-dirty and are reclaimed by the kernel in +// single-threaded process rundown at exit (~3s for a large link). Unmapping them +// in parallel before exit moves that reclaim off the serial post-exit path. +typedef struct LNK_UnmapViewTask +{ + String8 *views; +} LNK_UnmapViewTask; + +internal +THREAD_POOL_TASK_FUNC(lnk_unmap_view_task) +{ + LNK_UnmapViewTask *task = raw_task; + String8 view = task->views[task_id]; +#if OS_WINDOWS + UnmapViewOfFile(view.str); +#elif OS_LINUX + munmap(view.str, view.size); +#endif +} + +internal void +lnk_release_input_views(TP_Context *tp, LNK_Inputer *inputer) +{ + Temp scratch = scratch_begin(0, 0); + + // collect distinct whole-file mapped views (is_thin); skip lib-member + // substrings and linkgen arena data + U64 cap = inputer->objs.count + inputer->libs.count; + String8 *views = push_array_no_zero(scratch.arena, String8, cap); + U64 count = 0; + for EachNode(n, LNK_Input, inputer->objs.first) { if (n->is_thin && n->data.size) { views[count++] = n->data; } } + for EachNode(n, LNK_Input, inputer->libs.first) { if (n->is_thin && n->data.size) { views[count++] = n->data; } } + + if (count > 0) { + U64 begin_us = now_time_us(); + LNK_UnmapViewTask task = { .views = views }; + tp_for_parallel(tp, 0, count, lnk_unmap_view_task, &task); + U64 end_us = now_time_us(); + lnk_log(LNK_Log_Timers, "Released %llu input views in %.2f ms", count, (F64)(end_us - begin_us) / 1000.0); + } + + scratch_end(scratch); +} + internal void lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config) { @@ -5399,7 +7213,7 @@ 
lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config) LNK_SymbolTable *symtab = lnk_symbol_table_init(arena); // - // Link Image + // Link Image (group digests, if any, are synthesized + consumed inside lnk_link_image) // LNK_LinkResult link = lnk_link_image(tp, arena, config, inputer, symtab); @@ -5458,6 +7272,27 @@ lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config) LNK_CodeViewInput cv = lnk_make_code_view_input(tp, arena, config, debug_info_objs_count, debug_info_objs, rrt_input); LNK_MergedTypes cv_types = lnk_merge_types(tp, arena, &cv, 0); + // prune merged types not reachable from any surviving symbol (PDB-size win). OFF by default: + // it removes types that a debugger can still legitimately cast to in the watch window + // (reachable-from-symbols is a subset of castable-types). Opt in with /OPT:GCTYPES. + if (config->opt_gc_types == LNK_SwitchState_Yes) { + lnk_gc_types(tp, arena->v[0], &cv, &cv_types); + } + + // merge-types reached the scratch high-water (~9GB of per-thread tctx scratch + // stays committed but idle). Release those unused scratch pages back to the OS + // before the PDB build re-grows, dropping the recorded peak working set. Each + // worker decommits its own scratch; do the main thread's scratch too. Only + // pages strictly above each arena's live `pos` are touched, so output stays + // byte-identical and the push path re-commits on demand during PDB build. + { + ProfBegin("Decommit Scratch"); + // task_count == worker_count + the in-worker barrier => every worker + // (worker 0 IS the main thread) runs exactly once, covering main's scratch. 
+ tp_for_parallel(tp, 0, tp->worker_count, lnk_scratch_decommit_worker, 0); + ProfEnd(); + } + // // Debug Info // @@ -5576,11 +7411,28 @@ lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config) // wait for the thread to finish writing image to disk thread_join(image_write_thread, -1); + // image is on disk and no longer read by anyone -- release its ~1GB now so the kernel reclaims it + // concurrently with the remaining work + exit, not single-threaded in the process rundown. + release_memory(image_ctx.image_data.str, image_ctx.image_data.size); + + // outputs are written and inputs are no longer read; release the copy-on-write + // input views in parallel so their dirty pages are reclaimed here (multi-threaded) + // instead of in single-threaded process rundown at exit. Only safe for the CoW + // (read-only) mapping mode; read-write-shared would flush dirty pages back to the + // input files on unmap. + if ((config->io_flags & LNK_IO_Flags_MemoryMapFilesReadOnly) && + !(config->io_flags & LNK_IO_Flags_MemoryMapFilesReadWrite)) { + lnk_release_input_views(tp, inputer); + } + // // Timers // - if (lnk_get_log_status(LNK_Log_Timers)) { - lnk_log_timers(); + { + char *phase_log_env = getenv("RADLINK_PHASE_LOG"); + if (lnk_get_log_status(LNK_Log_Timers) || (phase_log_env != 0 && phase_log_env[0] != 0)) { + lnk_log_timers(); + } } scratch_end(scratch); @@ -5870,8 +7722,8 @@ entry_point(CmdLine *cmdline) } switch (config->boot_mode) { - case LNK_BootMode_Linker: lnk_run_linker (tp, tp_arena, config); break; - case LNK_BootMode_TypeServer: lnk_run_type_server(tp, tp_arena, config); break; + case LNK_BootMode_Linker: lnk_run_linker (tp, tp_arena, config); break; + case LNK_BootMode_TypeServer: lnk_run_type_server (tp, tp_arena, config); break; } lnk_log_end(); diff --git a/src/linker/lnk.h b/src/linker/lnk.h index d6388d0a0..f7207f5dc 100644 --- a/src/linker/lnk.h +++ b/src/linker/lnk.h @@ -204,6 +204,7 @@ typedef struct LNK_BaseRelocPageArray typedef struct { 
B32 search_anti_deps; + B32 reset_cursor; // anti-dep mode flipped: rescan from list start LNK_Link *link; HashMap *imports_hm; LNK_SymbolTable *symtab; @@ -391,6 +392,7 @@ internal LNK_LinkResult lnk_link_image (TP_Context *tp, TP_Arena *arena, LNK_Con // --- Optimizations ----------------------------------------------------------- internal void lnk_opt_ref(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs); +internal void lnk_opt_icf(TP_Context *tp, Arena *perm, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs); // --- Win32 Image ------------------------------------------------------------- diff --git a/src/linker/lnk_config.c b/src/linker/lnk_config.c index ac624c0ed..4b988d3f8 100644 --- a/src/linker/lnk_config.c +++ b/src/linker/lnk_config.c @@ -106,6 +106,9 @@ global read_only LNK_CmdSwitch g_cmd_switch_map[] = { LNK_CmdSwitch_RadTypeServer, 0, "RAD_TYPE_SERVER", ":FILENAME", "Merge types and store them in the specified file. The filename must have the .rrt extension." }, + { LNK_CmdSwitch_IfcMap, 1, "IFCMAP", ":FILENAME", "Map a header-unit module interface (.ifc) for debug-record resolution (TOML)." }, + { LNK_CmdSwitch_IfcDebugRecords, 0, "IFCDEBUGRECORDS", "[:NO]", "Resolve MSVC header-unit IFC debug records into real CodeView types." 
}, + + { LNK_CmdSwitch_Help, 0, "HELP", "", "" }, { LNK_CmdSwitch_Help, 0, "?", "", "" }, }; @@ -1675,10 +1678,18 @@ lnk_apply_cmd_option_to_config(LNK_Config *config, String8 cmd_name, String8List config->opt_icf = LNK_SwitchState_Yes; } else if (str8_match_lit("noicf", param, StringMatchFlag_CaseInsensitive)) { config->opt_icf = LNK_SwitchState_No; + } else if (str8_match_lit("icfstatic", param, StringMatchFlag_CaseInsensitive)) { + config->opt_icf_static = LNK_SwitchState_Yes; + } else if (str8_match_lit("noicfstatic", param, StringMatchFlag_CaseInsensitive)) { + config->opt_icf_static = LNK_SwitchState_No; } else if (str8_match_lit("lbr", param, StringMatchFlag_CaseInsensitive)) { config->opt_lbr = LNK_SwitchState_Yes; } else if (str8_match_lit("nolibr", param, StringMatchFlag_CaseInsensitive)) { config->opt_lbr = LNK_SwitchState_No; + } else if (str8_match_lit("gctypes", param, StringMatchFlag_CaseInsensitive)) { + config->opt_gc_types = LNK_SwitchState_Yes; + } else if (str8_match_lit("nogctypes", param, StringMatchFlag_CaseInsensitive)) { + config->opt_gc_types = LNK_SwitchState_No; } else { lnk_error_cmd_switch(LNK_Error_Cmdl, obj, cmd_switch, "unknown option \"%S\"", param); } @@ -2179,6 +2190,19 @@ lnk_apply_cmd_option_to_config(LNK_Config *config, String8 cmd_name, String8List lnk_error_cmd_switch(LNK_Error_Cmdl, obj, cmd_switch, "missing type server file path"); } } break; + + case LNK_CmdSwitch_IfcMap: { + // collect .toml paths (header-unit -> .ifc); parsed lazily during debug-info build + String8List copy = str8_list_copy(config->arena, &value_strings); + str8_list_concat_in_place(&config->ifc_map_list, &copy); + } break; + + case LNK_CmdSwitch_IfcDebugRecords: { + LNK_SwitchState state = LNK_SwitchState_Null; + if (lnk_cmd_switch_parse_flag(obj, cmd_switch, value_strings, &state)) { + config->ifc_debug_records = state; + } + } break; } scratch_end(scratch); diff --git a/src/linker/lnk_config.h b/src/linker/lnk_config.h index a781d664f..d9d10c248 100644 
--- a/src/linker/lnk_config.h +++ b/src/linker/lnk_config.h @@ -143,6 +143,9 @@ typedef enum LNK_CmdSwitch_RadTypeServer, LNK_CmdSwitch_RadTypeServer_MatchObj, + LNK_CmdSwitch_IfcMap, + LNK_CmdSwitch_IfcDebugRecords, + LNK_CmdSwitch_Help, LNK_CmdSwitch_Count @@ -289,7 +292,9 @@ typedef struct LNK_Config B32 ghash; LNK_SwitchState opt_ref; LNK_SwitchState opt_icf; + LNK_SwitchState opt_icf_static; // /OPT:ICFSTATIC -- also fold static (internal-linkage) COMDATs. Default OFF: runtime-unvalidated, can shrink .text substantially. LNK_SwitchState opt_lbr; + LNK_SwitchState opt_gc_types; // /OPT:GCTYPES -- prune unreferenced CodeView types. Default OFF: shrinks PDB but a pruned type can't be cast-to in the debugger watch window. U64 opt_iter_count; LNK_SwitchState import_table_emit_biat; LNK_SwitchState import_table_emit_uiat; @@ -387,6 +392,8 @@ typedef struct LNK_Config String8 type_server_name; LNK_SwitchState type_server; LNK_SwitchState sort_imports; + LNK_SwitchState ifc_debug_records; // resolve LF_IFC_RECORD (0x1522) into real CodeView types + String8List ifc_map_list; // .toml paths from /ifcMap (header-unit -> .ifc) } LNK_Config; // --- MSVC Error Codes -------------------------------------------------------- diff --git a/src/linker/lnk_debug_info.c b/src/linker/lnk_debug_info.c index fe125133c..3e977d5df 100644 --- a/src/linker/lnk_debug_info.c +++ b/src/linker/lnk_debug_info.c @@ -614,6 +614,681 @@ lnk_rrt_array_from_config(Arena *arena, LNK_Config *config) return rrt_arr; } +//////////////////////////////// +// IFC header-unit debug-record resolution + +typedef struct LNK_IfcMapEntry +{ + String8 ifc_path; // absolute .ifc path +} LNK_IfcMapEntry; + +// Parse the trivial /ifcMap TOML by hand: +// [[header-unit]] +// name = ["quote", ''] +// ifc = "" +// Registers basename(header-unit-path) -> .ifc path in `hm` (hash_map of path->raw LNK_IfcMapEntry*). 
+internal void +lnk_parse_ifc_map_toml(Arena *arena, HashMap *hm, String8 toml_data) +{ + U64 cursor = 0; + String8 cur_name = {0}; + while (cursor < toml_data.size) { + // read a line + U64 line_end = cursor; + while (line_end < toml_data.size && toml_data.str[line_end] != '\n') { line_end += 1; } + String8 line = str8_skip_chop_whitespace(str8_substr(toml_data, r1u64(cursor, line_end))); + cursor = line_end + 1; + + if (line.size == 0 || line.str[0] == '#') { continue; } + + if (str8_match(str8_prefix(line, 4), str8_lit("name"), 0)) { + // name = ["quote", ''] -- extract the last single-quoted token + U64 q0 = str8_find_needle(line, 0, str8_lit("'"), 0); + if (q0 < line.size) { + U64 q1 = str8_find_needle(line, q0 + 1, str8_lit("'"), 0); + if (q1 < line.size) { + cur_name = str8_substr(line, r1u64(q0 + 1, q1)); + } + } + } else if (str8_match(str8_prefix(line, 3), str8_lit("ifc"), 0)) { + U64 q0 = str8_find_needle(line, 0, str8_lit("\""), 0); + if (q0 < line.size && cur_name.size) { + U64 q1 = str8_find_needle(line, q0 + 1, str8_lit("\""), 0); + if (q1 < line.size) { + String8 ifc_path = str8_substr(line, r1u64(q0 + 1, q1)); + // key by basename of the header-unit path (matches LF_IFC_RECORD header_unit_path basename) + String8 base = str8_skip_last_slash(cur_name); + // header-unit paths use backslashes; normalize to last path component + U64 bs = str8_find_needle_reverse(base, 0, str8_lit("\\"), 0); + if (bs) { base = str8_skip(base, bs); } + LNK_IfcMapEntry *e = push_array(arena, LNK_IfcMapEntry, 1); + e->ifc_path = push_str8_copy(arena, ifc_path); + hash_map_push_string_raw(arena, hm, push_str8_copy(arena, base), e); + } + } + cur_name = str8_zero(); + } + } +} + +// Reads every /ifcMap toml, materializes the union of header-unit basename -> .ifc path. 
+internal HashMap +lnk_build_ifc_map(Arena *arena, LNK_Config *config) +{ + HashMap hm = {0}; + Temp scratch = scratch_begin(&arena, 1); + for EachNode(n, String8Node, config->ifc_map_list.first) { + String8 toml = lnk_read_data_from_file_path(scratch.arena, 0, n->string); + if (toml.size == 0) { + lnk_error(LNK_Error_Cmdl, "/ifcMap: unable to read TOML '%S'", n->string); + continue; + } + lnk_parse_ifc_map_toml(arena, &hm, toml); + } + scratch_end(scratch); + return hm; +} + +// LF_IFC_RECORD (0x1522) body layout (header {len,kind} already stripped from leaf.data): +// u16 version (==2); u32 ifc_type_index X; u8[16] guid; u8[16] hash; char[] header_unit_path NUL +typedef struct LNK_IfcRecord +{ + U32 ifc_type_index; // X: TI into the .ifc debug-records blob (base 0x1000) + U8 guid[16]; + U8 hash[16]; + String8 header_unit_path; + B32 is_valid; +} LNK_IfcRecord; + +internal LNK_IfcRecord +lnk_parse_ifc_record(String8 leaf_data) +{ + LNK_IfcRecord rec = {0}; + if (leaf_data.size < 2 + 4 + 16 + 16) { return rec; } + U64 off = 0; + U16 version; off += str8_deserial_read_struct(leaf_data, off, &version); + off += str8_deserial_read_struct(leaf_data, off, &rec.ifc_type_index); + MemoryCopy(rec.guid, leaf_data.str + off, 16); off += 16; + MemoryCopy(rec.hash, leaf_data.str + off, 16); off += 16; + rec.header_unit_path = str8_cstring_capped(leaf_data.str + off, leaf_data.str + leaf_data.size); + rec.is_valid = 1; + return rec; +} + +// Per-blob closure + NOTYPE-prune (third pass of lnk_apply_ifc_debug_records). Each blob is fully +// independent: it reads/writes only its own ref_bits[blob_i] and its own blob DebugT leaves, so the +// work parallelizes across the ~13 blobs with no shared state. The worklist scratch comes from the +// per-worker arena. Closure counts are written per-blob and summed afterward (order-independent). +// Open-addressing U64 hash set keyed by unique_name hash. 
Used to record which UDT unique_names +// already have a COMPLETE definition in a non-blob (consuming) obj, so the blob prune can keep a +// blob's complete definition only for names that NO normal obj completes (blob-only types). cap is +// a power of two; 0-hash is reserved as the empty sentinel (we OR in a bit so a real 0 can't occur). +typedef struct LNK_U64Set { U64 *slots; U64 cap; } LNK_U64Set; + +internal U64 +lnk_uname_hash(String8 s) +{ + U64 h = 5381; + for EachIndex(c, s.size) { h = ((h << 5) + h) ^ (U64)s.str[c]; } + return h | 1; // never 0 (0 is the empty sentinel) +} + +internal void +lnk_u64set_add(LNK_U64Set *set, U64 h) +{ + U64 i = h & (set->cap - 1); + for (;;) { + if (set->slots[i] == 0) { set->slots[i] = h; return; } + if (set->slots[i] == h) { return; } + i = (i + 1) & (set->cap - 1); + } +} + +// Thread-safe insert mirroring lnk_icf_map_put_atomic (lnk.c). The empty sentinel is 0 (not +// LNK_ICF_EMPTY); h is guaranteed nonzero by lnk_uname_hash (`| 1`) so a real key can never be 0. +// Claim an empty slot with an atomic CAS 0->h; the CAS winner owns it. On a lost race re-read the +// same slot (it may now hold our h via a duplicate, or another key) before advancing. Duplicate +// keys across objs are legal and idempotent (a slot already holding h returns), so any insertion +// order yields the identical final membership -- the set is read-only (lnk_u64set_has) afterward. 
+internal void
+lnk_u64set_add_atomic(LNK_U64Set *set, U64 h)
+{
+  // The number of slot advances is bounded by cap, so a full table (or cap == 0) cannot spin
+  // forever or index out of bounds. A lost CAS does not count as an advance: the same slot is
+  // re-read, exactly as in the unbounded form.
+  U64 mask   = set->cap - 1;
+  U64 i      = h & mask;
+  U64 probes = 0;
+  while (probes < set->cap) {
+    U64 cur = set->slots[i];
+    if (cur == 0) {
+      if (ins_atomic_u64_eval_cond_assign(&set->slots[i], h, 0) == 0) { return; } // we claimed it
+      continue; // lost the race: look at this slot again before moving on
+    }
+    if (cur == h) { return; } // duplicate key, already present
+    i = (i + 1) & mask;
+    probes += 1;
+  }
+  Assert(!"LNK_U64Set is full (or has zero capacity)");
+}
+
+// Membership test: returns 1 if h was inserted, 0 otherwise. Probes linearly from h & (cap - 1)
+// and stops at the first empty slot. The probe is bounded to `cap` slots, so a completely full
+// table that lacks h (or a set with cap == 0) answers 0 instead of looping or reading out of
+// bounds.
+internal B32
+lnk_u64set_has(LNK_U64Set *set, U64 h)
+{
+  U64 mask = set->cap - 1;
+  U64 i    = h & mask;
+  for EachIndex(probe, set->cap) {
+    U64 cur = set->slots[i];
+    if (cur == 0) { return 0; }
+    if (cur == h) { return 1; }
+    i = (i + 1) & mask;
+  }
+  return 0;
+}
+
+// Parallel collect of complete-definition unique_name hashes per non-blob obj. Each obj is scanned
+// independently (read-only over its .debug$T leaves) and emits its hashes into a per-obj list;
+// lnk_ifc_set_merge_task then inserts those lists into the shared open-addressing set. Moves the
+// ~1.25s serial cv_get_udt_info scan off the main thread. Determinism: set membership is
+// order-independent, so the order in which the lists are merged does not matter.
+typedef struct LNK_IfcCompleteScanTask
+{
+  LNK_CodeViewInput *input;
+  U64 **out_hashes;   // per-obj hash array (allocated by task)
+  U64 *out_counts;    // per-obj count
+} LNK_IfcCompleteScanTask;
+
+// One task per obj (task_id == obj index). Writes out_hashes[obj] / out_counts[obj] with the
+// lnk_uname_hash of every leaf that has a non-empty unique name and is not a forward reference.
+internal
+THREAD_POOL_TASK_FUNC(lnk_ifc_complete_scan_task)
+{
+  LNK_IfcCompleteScanTask *task = raw_task;
+  U64 obj_idx = task_id;
+  CV_DebugT *dt = &task->input->debug_t_arr[obj_idx];
+  // sized for the worst case (every leaf qualifies); never a zero-sized push
+  U64 *hashes = push_array_no_zero(arena, U64, dt->count ? dt->count : 1);
+  U64 n = 0;
+  for EachIndex(leaf_idx, dt->count) {
+    CV_Leaf leaf = cv_debug_t_get_leaf(dt, leaf_idx);
+    CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data);
+    if (!(ui.props & CV_TypeProp_HasUniqueName) || ui.unique_name.size == 0) { continue; }
+    if (ui.props & CV_TypeProp_FwdRef) { continue; } // only complete definitions
+    hashes[n++] = lnk_uname_hash(ui.unique_name);
+  }
+  task->out_hashes[obj_idx] = hashes;
+  task->out_counts[obj_idx] = n;
+}
+
+// Parallel merge of the per-obj complete-def hash lists into the shared set. Replaces the serial
+// lnk_u64set_add loop (the ~914ms hotspot -- 801ms of it first-touch KiPageFault on a single thread
+// faulting a 128MB+ set). lnk_u64set_add_atomic spreads both the random-scatter probes AND the
+// page faults across the pool. Determinism: keys may legitimately duplicate across objs, but the
+// insert is idempotent and the set is read-only afterward (lnk_u64set_has), so insertion order
+// cannot change the final membership -- output is bit-identical to the serial merge.
+typedef struct LNK_IfcSetMergeTask
+{
+  Rng1U64 *ranges;       // per-task obj index range
+  U64 **out_hashes;      // per-obj hash arrays produced by lnk_ifc_complete_scan_task
+  U64 *out_counts;       // per-obj hash counts
+  LNK_U64Set *set;       // shared destination set
+  U64 nonblob_count;     // set by the caller; not read by lnk_ifc_set_merge_task
+} LNK_IfcSetMergeTask;
+
+// One task per range (task_id indexes `ranges`): inserts every hash of every obj in the range.
+internal
+THREAD_POOL_TASK_FUNC(lnk_ifc_set_merge_task)
+{
+  LNK_IfcSetMergeTask *task = raw_task;
+  for EachInRange(obj_idx, task->ranges[task_id]) {
+    U64 *h = task->out_hashes[obj_idx];
+    U64 n = task->out_counts[obj_idx];
+    for EachIndex(t, n) { lnk_u64set_add_atomic(task->set, h[t]); }
+  }
+}
+
+// Parallel second-pass scan: each consuming obj's .debug$T is scanned independently for 0x1522
+// (LF_IFC_RECORD) leaves. A worker (a) rewrites every 0x1522 leaf to NOTYPE in-place (per-obj
+// disjoint, safe) and (b) for each leaf that resolves to a .ifc blob leaf, emits an ordered
+// redirect record. The shared redirect hash map / ref_bits seed / redirect_count are NOT touched
+// here; the serial merge replays the per-obj records in ascending obj_idx, then ascending leaf_idx
+// (exactly the original serial order) so the hash-map push order -- and thus byte-for-byte output --
+// is identical to the serial scan. All blob/.ifc state (ifc_map_hm, ifc_path_to_blobidx, ifc_files,
+// debug_t_arr blob ranges) is read-only after pass 1, so worker reads are race-free.
+typedef struct LNK_IfcRedirectRec
+{
+  CV_TypeIndex K;        // consuming obj's local placeholder TI
+  U64 blob_obj_idx;      // injected blob obj index
+  U64 blob_leaf_idx;     // leaf inside the blob
+  U64 blob_i;            // blob slot (for ref_bits seed)
+} LNK_IfcRedirectRec;
+
+typedef struct LNK_IfcScanTask
+{
+  LNK_CodeViewInput *input;
+  HashMap *ifc_map_hm;
+  HashMap *ifc_path_to_blobidx;
+  IFC_File *ifc_files;
+  LNK_IfcRedirectRec **out_recs;  // per-obj ordered redirect records (allocated by task)
+  U64 *out_counts;                // per-obj record count
+} LNK_IfcScanTask;
+
+// Named forms of the literals used throughout the IFC passes.
+enum
+{
+  LNK_IfcRecord_LeafKind = 0x1522, // LF_IFC_RECORD
+  LNK_IfcBlob_MaxCount   = 256,    // capacity of the per-link .ifc blob tables
+};
+
+// Turns the header-unit path stored in an LF_IFC_RECORD into the key looked up in the /ifcMap
+// table: the result of str8_skip_last_slash, further cut past the last backslash if
+// str8_find_needle_reverse reports one. Shared by the discovery pass and the parallel scan so both
+// always derive the same key.
+internal String8
+lnk_ifc_basename_from_header_unit_path(String8 header_unit_path)
+{
+  String8 base = str8_skip_last_slash(header_unit_path);
+  U64 bs = str8_find_needle_reverse(base, 0, str8_lit("\\"), 0);
+  if (bs) { base = str8_skip(base, bs); }
+  return base;
+}
+
+// One task per consuming obj (task_id == obj index). Emits out_recs[obj] / out_counts[obj] and
+// rewrites every LF_IFC_RECORD leaf of that obj to NOTYPE, whether or not it resolved.
+internal
+THREAD_POOL_TASK_FUNC(lnk_ifc_scan_task)
+{
+  LNK_IfcScanTask *task = raw_task;
+  U64 obj_idx = task_id;
+  LNK_CodeViewInput *input = task->input;
+  CV_DebugT *debug_t = &input->debug_t_arr[obj_idx];
+
+  // count 0x1522 leaves first to size the per-obj record array
+  U64 ifc_leaf_count = 0;
+  for EachIndex(leaf_idx, debug_t->count) {
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+    if (hdr->kind == LNK_IfcRecord_LeafKind) { ifc_leaf_count += 1; }
+  }
+  if (ifc_leaf_count == 0) { task->out_recs[obj_idx] = 0; task->out_counts[obj_idx] = 0; return; }
+
+  LNK_IfcRedirectRec *recs = push_array_no_zero(arena, LNK_IfcRedirectRec, ifc_leaf_count);
+  U64 n = 0;
+
+  for EachIndex(leaf_idx, debug_t->count) {
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+    if (hdr->kind != LNK_IfcRecord_LeafKind) { continue; }
+
+    CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, leaf_idx);
+    LNK_IfcRecord rec = lnk_parse_ifc_record(leaf.data);
+
+    CV_TypeIndex K = cv_ti_from_leaf_idx(debug_t, CV_TypeIndexSource_TPI, leaf_idx);
+
+    if (rec.is_valid) {
+      String8 base = lnk_ifc_basename_from_header_unit_path(rec.header_unit_path);
+      LNK_IfcMapEntry *e = hash_map_search_string_raw(task->ifc_map_hm, base);
+      if (e) {
+        U64 *slot = hash_map_search_string_u64(task->ifc_path_to_blobidx, e->ifc_path);
+        if (slot) {
+          U64 blob_i = *slot - 1; // map stores (blob slot index + 1)
+          IFC_File *f = &task->ifc_files[blob_i];
+          U64 blob_obj_idx = input->ifc_obj_range.min + blob_i;
+          CV_DebugT *bdt = &input->debug_t_arr[blob_obj_idx];
+
+          // Check validity BEFORE touching content_hash: slots reserved for files that failed to
+          // read hold whatever ifc_file_read returned on failure, so their hash must not be read.
+          if (f->is_valid) {
+            B32 hash_ok = MemoryMatch(rec.guid, f->content_hash, 16) &&
+                          MemoryMatch(rec.hash, f->content_hash + 16, 16);
+            if (hash_ok) {
+              U64 blob_leaf_idx = cv_leaf_idx_from_ti(bdt, CV_TypeIndexSource_TPI, rec.ifc_type_index);
+              if (blob_leaf_idx < bdt->count) {
+                recs[n].K = K;
+                recs[n].blob_obj_idx = blob_obj_idx;
+                recs[n].blob_leaf_idx = blob_leaf_idx;
+                recs[n].blob_i = blob_i;
+                n += 1;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // exclude the placeholder leaf from output regardless: rewrite to NOTYPE.
+    // per-obj disjoint write -- safe in parallel.
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, kind), (U16)CV_LeafKind_NOTYPE);
+  }
+
+  task->out_recs[obj_idx] = recs;
+  task->out_counts[obj_idx] = n;
+}
+
+typedef struct LNK_IfcCloseTask
+{
+  LNK_CodeViewInput *input;
+  U8 **ref_bits;                // per-blob bitset, one bit per blob leaf index
+  U64 *closure_leaves;          // per-blob output: # leaves surviving in closure
+  LNK_U64Set *nonblob_complete; // unique_name hashes completed by some non-blob obj
+} LNK_IfcCloseTask;
+
+// One task per blob (task_id == blob slot). Starting from the bits already set in
+// ref_bits[blob_i], adds blob-only complete definitions as extra roots, closes the set over the
+// type indices each kept leaf refers to, then rewrites every leaf outside the closure to a
+// minimal NOTYPE leaf. Touches only this blob's bitset and leaves; writes closure_leaves[blob_i].
+internal
+THREAD_POOL_TASK_FUNC(lnk_ifc_close_blob_task)
+{
+  LNK_IfcCloseTask *task = raw_task;
+  U64 blob_i = task_id;
+  LNK_CodeViewInput *input = task->input;
+  U64 blob_obj_idx = input->ifc_obj_range.min + blob_i;
+  CV_DebugT *bdt = &input->debug_t_arr[blob_obj_idx];
+  U8 *bits = task->ref_bits[blob_i];
+  U64 closure = 0;
+  if (bdt->count == 0) { task->closure_leaves[blob_i] = 0; return; }
+
+  Temp wtemp = temp_begin(arena);
+
+  // Extra closure roots for forward-ref completion: in CodeView a forward-ref UDT is completed by
+  // ANY same-unique_name complete definition in the PDB. A consuming obj typically emits only a
+  // forward-ref of a header-unit type; the full-merge build incidentally kept the matching complete
+  // definition from the .ifc blob, so the debugger could complete it. On-demand would drop that
+  // definition (nothing references it by TI), leaving the type incomplete vs full-merge. To preserve
+  // fidelity WITHOUT dragging the whole blob, root every blob complete-def UDT whose unique_name has
+  // NO complete definition in any non-blob obj (i.e. blob-only types -- trait/delegate marker structs
+  // etc.). Common types (FString, FGuid, ...) are completed by normal objs, so their redundant blob
+  // copies stay pruned. Members of the kept defs are pulled by the closure walk below.
+  for EachIndex(leaf_idx, bdt->count) {
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { continue; } // already a root
+    CV_Leaf leaf = cv_debug_t_get_leaf(bdt, leaf_idx);
+    CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data);
+    if (!(ui.props & CV_TypeProp_HasUniqueName) || ui.unique_name.size == 0) { continue; }
+    if (ui.props & CV_TypeProp_FwdRef) { continue; } // only complete definitions
+    U64 h = lnk_uname_hash(ui.unique_name);
+    if (lnk_u64set_has(task->nonblob_complete, h)) { continue; } // a normal obj already completes it
+    bits[leaf_idx >> 3] |= (U8)(1u << (leaf_idx & 7));
+  }
+
+  // Seed the worklist with every root. A leaf is pushed only when its bit flips 0 -> 1, so each
+  // leaf index enters the list at most once and bdt->count entries always suffice.
+  U64 *worklist = push_array_no_zero(wtemp.arena, U64, bdt->count);
+  U64 wl_count = 0;
+  for EachIndex(leaf_idx, bdt->count) {
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { worklist[wl_count++] = leaf_idx; }
+  }
+
+  while (wl_count) {
+    U64 leaf_idx = worklist[--wl_count];
+    CV_Leaf leaf = cv_debug_t_get_leaf(bdt, leaf_idx);
+    Temp itemp = temp_begin(wtemp.arena); // per-leaf temp: the offset list is dropped each iteration
+    CV_TypeIndexInfoList ti_list = cv_get_leaf_type_index_offsets(itemp.arena, leaf.kind, leaf.data);
+    for EachNode(ti_info, CV_TypeIndexInfo, ti_list.first) {
+      CV_TypeIndex *ti_ptr = str8_deserial_get_raw_ptr(leaf.data, ti_info->offset, sizeof(*ti_ptr));
+      if (ti_ptr == 0) { continue; }
+      CV_TypeIndex sub_ti = memory_read32(ti_ptr);
+      // indices outside this blob's range are not blob leaves; nothing to mark
+      if (sub_ti <  bdt->ti_ranges[ti_info->source].min ||
+          sub_ti >= bdt->ti_ranges[ti_info->source].max) { continue; }
+      U64 sub_leaf_idx = cv_leaf_idx_from_ti(bdt, ti_info->source, sub_ti);
+      if (sub_leaf_idx >= bdt->count) { continue; }
+      if (bits[sub_leaf_idx >> 3] & (1u << (sub_leaf_idx & 7))) { continue; }
+      bits[sub_leaf_idx >> 3] |= (U8)(1u << (sub_leaf_idx & 7));
+      worklist[wl_count++] = sub_leaf_idx;
+    }
+    temp_end(itemp);
+  }
+  temp_end(wtemp);
+
+  // Prune: count survivors, rewrite everything else to a NOTYPE leaf whose size field covers only
+  // the kind.
+  for EachIndex(leaf_idx, bdt->count) {
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { closure += 1; continue; }
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(bdt, leaf_idx);
+    if (hdr->kind == CV_LeafKind_NOTYPE) { continue; }
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, kind), (U16)CV_LeafKind_NOTYPE);
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, size), (U16)sizeof(CV_LeafKind));
+  }
+  task->closure_leaves[blob_i] = closure;
+}
+
+// Injects referenced .ifc debug-records blobs as extra "objs" in `input`, scans every
+// consuming obj's .debug$T for LF_IFC_RECORD (0x1522) leaves, registers each placeholder
+// local TI -> blob leaf redirect, and rewrites the 0x1522 leaf to NOTYPE so it is excluded
+// from the output TPI. Must run after .debug$T is parsed and before min-type-index / symbol
+// setup (which iterate input->count).
+// At most LNK_IfcBlob_MaxCount distinct .ifc files are injected; references to further files are
+// reported once in the debug log and left unresolved. When no .ifc file is referenced at all,
+// `input` is returned untouched.
+internal void
+lnk_apply_ifc_debug_records(TP_Context *tp, TP_Arena *tp_arena, LNK_CodeViewInput *input, LNK_Config *config)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&tp_arena->v[0], 1);
+  Arena *arena = tp_arena->v[0];
+
+  // basename -> .ifc path
+  HashMap ifc_map_hm = lnk_build_ifc_map(scratch.arena, config);
+
+  // first pass: discover which .ifc files are actually referenced and by which records.
+  // de-dup .ifc reads by path; parse blob to a CV_DebugT once per .ifc.
+  HashMap ifc_path_to_blobidx = {0}; // path -> (blob slot index + 1)
+  IFC_File *ifc_files = push_array(scratch.arena, IFC_File, LNK_IfcBlob_MaxCount);
+  U64 ifc_file_count = 0;
+  CV_DebugT blob_debug_t[LNK_IfcBlob_MaxCount] = {0};
+  B32 blob_cap_hit = 0;
+
+  // scan all consuming objs for 0x1522 leaves; collect the .ifc paths needed
+  for EachIndex(obj_idx, input->obj_count) {
+    CV_DebugT *debug_t = &input->debug_t_arr[obj_idx];
+    for EachIndex(leaf_idx, debug_t->count) {
+      CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+      if (hdr->kind != LNK_IfcRecord_LeafKind) { continue; }
+
+      CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, leaf_idx);
+      LNK_IfcRecord rec = lnk_parse_ifc_record(leaf.data);
+      if (!rec.is_valid) { continue; }
+
+      String8 base = lnk_ifc_basename_from_header_unit_path(rec.header_unit_path);
+
+      LNK_IfcMapEntry *e = hash_map_search_string_raw(&ifc_map_hm, base);
+      if (e == 0) { continue; }
+
+      if (hash_map_search_string_u64(&ifc_path_to_blobidx, e->ifc_path) == 0) {
+        if (ifc_file_count >= LNK_IfcBlob_MaxCount) { blob_cap_hit = 1; continue; }
+        String8 err = {0};
+        IFC_File f = ifc_file_read(arena, e->ifc_path, &err);
+        CV_DebugT dt = {0};
+        if (f.is_valid) {
+          // parse the raw CV leaf stream (no signature, TI base 0x1000)
+          dt = cv_debug_t_from_data(arena, f.debug_records, 1);
+        } else {
+          // the slot is still reserved below (with an empty DebugT) so we don't retry the read
+          lnk_error(LNK_Error_Cmdl, "/ifcDebugRecords: %S", err);
+        }
+        hash_map_push_string_u64(scratch.arena, &ifc_path_to_blobidx, e->ifc_path, ifc_file_count + 1);
+        ifc_files[ifc_file_count] = f;
+        blob_debug_t[ifc_file_count] = dt;
+        ifc_file_count += 1;
+      }
+    }
+  }
+
+  if (blob_cap_hit) {
+    lnk_log(LNK_Log_Debug, "[IFC] more than %llu distinct .ifc files referenced; records for the extra files are left unresolved",
+            (U64)LNK_IfcBlob_MaxCount);
+  }
+
+  if (ifc_file_count == 0) { goto done; }
+
+  // --- inject blob objs into the parallel arrays (like type servers, but in ifc_obj_range) ---
+  U64 prev_count = input->count;
+  U64 new_count  = prev_count + ifc_file_count;
+
+  LNK_Obj   **obj_arr2     = push_array(arena, LNK_Obj *, new_count);
+  CV_DebugS  *debug_s_arr2 = push_array(arena, CV_DebugS, new_count);
+  CV_DebugT  *debug_t_arr2 = push_array(arena, CV_DebugT, new_count);
+  CV_DebugH  *debug_h_arr2 = push_array(arena, CV_DebugH, new_count);
+  U64        *obj_to_ts2   = push_array(arena, U64,       new_count);
+
+  MemoryCopyTyped(obj_arr2,     input->obj_arr,     prev_count);
+  MemoryCopyTyped(debug_s_arr2, input->debug_s_arr, prev_count);
+  MemoryCopyTyped(debug_t_arr2, input->debug_t_arr, prev_count);
+  MemoryCopyTyped(debug_h_arr2, input->debug_h_arr, prev_count);
+  MemoryCopyTyped(obj_to_ts2,   input->obj_to_ts,   prev_count);
+  MemorySet(obj_to_ts2 + prev_count, 0xff, ifc_file_count * sizeof(U64)); // blobs are not type servers
+
+  // blob obj indices + index list for hash-deep / dedup
+  U32Array ifc_indices = { .v = push_array(arena, U32, ifc_file_count) };
+  for EachIndex(i, ifc_file_count) {
+    U64 blob_obj_idx = prev_count + i;
+    LNK_Obj *blob_obj = push_array(arena, LNK_Obj, 1);
+    blob_obj->path = ifc_files[i].path;
+    obj_arr2[blob_obj_idx] = blob_obj;
+    debug_t_arr2[blob_obj_idx] = blob_debug_t[i];
+    ifc_indices.v[ifc_indices.count++] = (U32)blob_obj_idx;
+  }
+
+  input->count         = new_count;
+  input->obj_arr       = obj_arr2;
+  input->debug_s_arr   = debug_s_arr2;
+  input->debug_t_arr   = debug_t_arr2;
+  input->debug_h_arr   = debug_h_arr2;
+  input->obj_to_ts     = obj_to_ts2;
+  input->ifc_obj_range = r1u64(prev_count, new_count);
+  input->ifc_indices   = ifc_indices; // hashed + deduped before int objs (see lnk_merge_types)
+
+  // --- on-demand pruning state: per blob, a "referenced" bitset of leaf indices that
+  // are reachable from some consuming obj's 0x1522 redirect (the closure roots). Only these
+  // + their transitive blob-internal deps get merged; the rest are rewritten to NOTYPE so the
+  // hash/dedup pipeline skips ~all of the ~1.5M blob leaves that nothing references. ---
+  U8 **ref_bits = push_array(scratch.arena, U8 *, ifc_file_count);
+  for EachIndex(i, ifc_file_count) {
+    U64 c = blob_debug_t[i].count;
+    ref_bits[i] = push_array(scratch.arena, U8, (c + 7) / 8); // zero-init -> nothing referenced yet
+  }
+
+  // --- second pass: register redirects + NOTYPE the 0x1522 leaves + seed closure roots ---
+  // Parallel per-obj scan (the ~1.86s serial hotspot): each worker NOTYPE-rewrites its own 0x1522
+  // leaves and emits an ordered list of resolved redirects. A serial merge then replays them in
+  // ascending obj_idx, then ascending leaf_idx -- the exact original serial order -- so the redirect
+  // hash-map push order, ref_bits seeding, and redirect_count are bit-for-bit identical to serial.
+  input->has_ifc_redirects = 1;
+  U64 redirect_count = 0;
+  {
+    LNK_IfcScanTask scan = {0};
+    scan.input               = input;
+    scan.ifc_map_hm          = &ifc_map_hm;
+    scan.ifc_path_to_blobidx = &ifc_path_to_blobidx;
+    scan.ifc_files           = ifc_files;
+    scan.out_recs            = push_array(scratch.arena, LNK_IfcRedirectRec *, input->obj_count);
+    scan.out_counts          = push_array(scratch.arena, U64, input->obj_count);
+    tp_for_parallel(tp, tp_arena, input->obj_count, lnk_ifc_scan_task, &scan);
+
+    // deterministic ordered merge: ascending obj_idx, and within an obj the records are already in
+    // ascending leaf_idx order (the worker scans leaf_idx ascending).
+    for EachIndex(obj_idx, input->obj_count) {
+      LNK_IfcRedirectRec *recs = scan.out_recs[obj_idx];
+      U64 n = scan.out_counts[obj_idx];
+      for EachIndex(t, n) {
+        LNK_IfcRedirectRec *r = &recs[t];
+        hash_map_push_u64_u64(arena, &input->ifc_redirect_hm,
+                              Compose64Bit(obj_idx, r->K),
+                              Compose64Bit(r->blob_obj_idx, r->blob_leaf_idx));
+        redirect_count += 1;
+        // seed closure root: this blob leaf is referenced
+        ref_bits[r->blob_i][r->blob_leaf_idx >> 3] |= (U8)(1u << (r->blob_leaf_idx & 7));
+      }
+    }
+  }
+
+  // --- third pass: per blob, close the referenced set over blob-internal sub-TIs, then
+  // NOTYPE every leaf not in the closure. cv_leaf_idx_from_ti on a raw blob is source-agnostic
+  // (source_offsets are 0, all ti_ranges == [0x1000, 0x1000+count)) so a sub-TI maps directly to
+  // leaf_idx = ti - 0x1000 regardless of its CV_TypeIndexSource label. Walk is iterative (worklist).
+  U64 total_blob_leaves = 0, total_closure_leaves = 0;
+  for EachIndex(blob_i, ifc_file_count) {
+    total_blob_leaves += input->debug_t_arr[input->ifc_obj_range.min + blob_i].count;
+  }
+  if (ifc_file_count) {
+    // Build the set of unique_names that already have a COMPLETE definition in some non-blob obj.
+    // The blob prune keeps a blob complete-def only when its name is absent here (blob-only type),
+    // so forward-refs that no normal obj can complete still get their definition (full-merge fidelity)
+    // while redundant blob copies of normally-defined types stay pruned. Size to ~2x the non-blob
+    // complete-def count, rounded up to a power of two, for low load factor.
+    U64 nonblob_complete_estimate = 0;
+    for EachIndex(obj_idx, input->ifc_obj_range.min) {
+      nonblob_complete_estimate += input->debug_t_arr[obj_idx].source_counts[CV_TypeIndexSource_TPI];
+    }
+    LNK_U64Set nonblob_complete = {0};
+    nonblob_complete.cap = 1;
+    while (nonblob_complete.cap < (nonblob_complete_estimate * 2 + 16)) { nonblob_complete.cap <<= 1; }
+    nonblob_complete.slots = push_array(scratch.arena, U64, nonblob_complete.cap);
+    // parallel scan: each non-blob obj emits its complete-def hashes into a per-obj list.
+    U64 nonblob_count = input->ifc_obj_range.min;
+    if (nonblob_count) {
+      LNK_IfcCompleteScanTask scan = {0};
+      scan.input      = input;
+      scan.out_hashes = push_array(scratch.arena, U64 *, nonblob_count);
+      scan.out_counts = push_array(scratch.arena, U64,   nonblob_count);
+      tp_for_parallel(tp, tp_arena, nonblob_count, lnk_ifc_complete_scan_task, &scan);
+      // parallel atomic-CAS merge (replaces the serial lnk_u64set_add loop): output-identical
+      // because set membership is order-independent + idempotent (see lnk_u64set_add_atomic).
+      LNK_IfcSetMergeTask merge = {0};
+      merge.ranges        = tp_divide_work(scratch.arena, nonblob_count, tp->worker_count);
+      merge.out_hashes    = scan.out_hashes;
+      merge.out_counts    = scan.out_counts;
+      merge.set           = &nonblob_complete;
+      merge.nonblob_count = nonblob_count;
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_ifc_set_merge_task, &merge);
+    }
+
+    LNK_IfcCloseTask close_task = {0};
+    close_task.input            = input;
+    close_task.ref_bits         = ref_bits;
+    close_task.closure_leaves   = push_array(scratch.arena, U64, ifc_file_count);
+    close_task.nonblob_complete = &nonblob_complete;
+    tp_for_parallel(tp, tp_arena, ifc_file_count, lnk_ifc_close_blob_task, &close_task);
+    for EachIndex(blob_i, ifc_file_count) { total_closure_leaves += close_task.closure_leaves[blob_i]; }
+  }
+
+  lnk_log(LNK_Log_Debug, "[IFC] injected %llu .ifc blob(s), %llu record redirect(s); on-demand closure %llu / %llu blob leaves (%.1f%%)",
+          ifc_file_count, redirect_count, total_closure_leaves, total_blob_leaves,
+          total_blob_leaves ? (100.0 * (F64)total_closure_leaves / (F64)total_blob_leaves) : 0.0);
+
+done:
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+////////////////////////////////
+// parallel setup tasks for lnk_make_code_view_input
+
+// Loop 3 (PCH/ext/int classification). The expensive predicates -- the read-only rrt_hm lookup
+// and cv_debug_t_is_type_server_ref -- run in parallel per obj. Each obj's class tag and PCH-merge
+// mutation are fully independent, so this pass is data-parallel. The ordered 3-array compaction
+// and the MultipleDebugTAndDebugP warning are then replayed SERIALLY in obj_idx order, so output
+// (array contents/order + warning order + discarded set) is byte-identical to the serial loop.
+typedef struct LNK_CvClassifyTask
+{
+  LNK_CodeViewInput *input;
+  CV_DebugT *debug_p_arr;
+  HashMap *rrt_hm;     // read-only after build
+  LNK_Obj **obj_arr;
+  U8 *class_tag;       // 0=debug_p, 1=ext, 2=int
+  U8 *warn_multi;
+} LNK_CvClassifyTask;
+
+// One task per obj (task_id == obj index): records the obj's bucket in class_tag[obj] and applies
+// the .debug$T / .debug$P fix-up for that obj. The warning itself is not emitted here; the obj is
+// only flagged in warn_multi so the caller can report it in obj order.
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_classify_task)
+{
+  LNK_CvClassifyTask *ctx = raw_task;
+  U64 obj_idx = task_id;
+  CV_DebugT *types     = &ctx->input->debug_t_arr[obj_idx];
+  CV_DebugT *pch_types = &ctx->debug_p_arr[obj_idx];
+
+  // bucket selection: the tests run in the same order (and with the same precedence) as the
+  // serial loop; anything that matches none of them is an internal (/Z7) obj
+  U8 tag = 2;
+  if (hash_map_search_path_u64(ctx->rrt_hm, ctx->obj_arr[obj_idx]->path)) {
+    tag = 1;
+  } else if (pch_types->count > 0 && types->count == 0) {
+    tag = 0;
+  } else if (cv_debug_t_is_type_server_ref(types)) {
+    tag = 1;
+  }
+  ctx->class_tag[obj_idx] = tag;
+
+  // fix-up: an obj with only .debug$P adopts it as its .debug$T; an obj with both loses both
+  B32 has_debug_t = (types->count != 0);
+  if (!has_debug_t) {
+    if (pch_types->count > 0) { *types = *pch_types; }
+  } else if (pch_types->count) {
+    ctx->warn_multi[obj_idx] = 1; // reported later, serially, in obj order
+    MemoryZeroStruct(types);
+    MemoryZeroStruct(pch_types);
+  }
+}
+
+// Loop 4 (Make Symbol Inputs) count pass. cv_sub_section_from_debug_s is a pure read of the
+// already-parsed data_list, so caching each obj's Symbols sub-section list in parallel is safe.
+typedef struct LNK_CvSymTask
+{
+  LNK_CodeViewInput *input;
+  String8List *per_obj_syms;
+  U64 *counts;    // per-obj node_count (count pass)
+  U64 *offsets;   // per-obj symbol_inputs offset (fill pass)
+} LNK_CvSymTask;
+
+// One task per obj: stores the obj's Symbols sub-section list and its node count.
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_sym_count_task)
+{
+  LNK_CvSymTask *ctx = raw_task;
+  U64 obj_idx = task_id;
+  String8List syms = cv_sub_section_from_debug_s(ctx->input->debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols);
+  ctx->per_obj_syms[obj_idx] = syms;
+  ctx->counts[obj_idx]       = syms.node_count;
+}
+
+// Loop 4 fill pass. Each obj writes a disjoint, contiguous range of symbol_inputs starting at its
+// prefix-sum offset, in node order -- byte-identical to the serial append (which walked obj_idx
+// ascending, each obj's nodes in list order).
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_sym_fill_task)
+{
+  LNK_CvSymTask *ctx = raw_task;
+  U64 obj_idx   = task_id;
+  U64 write_idx = ctx->offsets[obj_idx];
+  for EachNode(node, String8Node, ctx->per_obj_syms[obj_idx].first) {
+    LNK_SymbolInput *dst = &ctx->input->symbol_inputs[write_idx];
+    dst->obj_idx     = obj_idx;
+    dst->raw_symbols = node->string;
+    write_idx += 1;
+  }
+}
+
 internal LNK_CodeViewInput lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, U64 obj_count, LNK_Obj **obj_arr, LNK_RRT_Array rrt_input) { @@ -634,47 +1309,51 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, ProfBegin("Apply RRT to Objs"); - // hash map (obj path, obj idx) + // hash map (obj path, obj idx). Kept SERIAL: HashMap is a 4-ary trie whose insert mutates shared + // child pointers + arena-allocates nodes -> not safe for concurrent insert. Only built (and + // consulted) when there is at least one input RRT; the monolithic Engine.dll link has none.
HashMap obj_path_hm = {0}; - for EachIndex(obj_idx, obj_count) { - hash_map_push_path_u64(scratch.arena, &obj_path_hm, obj_arr[obj_idx]->path, obj_idx); - } + if (rrt_input.count) { + for EachIndex(obj_idx, obj_count) { + hash_map_push_path_u64(scratch.arena, &obj_path_hm, obj_arr[obj_idx]->path, obj_idx); + } - for EachIndex(obj_idx, obj_count) { - LNK_Obj *obj = obj_arr[obj_idx]; - U64 *packed_rrt_idx = hash_map_search_path_u64(&rrt_hm, obj->path); - - // obj is not part of any input RRT - if (packed_rrt_idx == 0) { continue; } - - // unpack index - U32 rrt_idx = *packed_rrt_idx >> 32; - U32 rrt_obj_idx = *packed_rrt_idx & max_U32; - LNK_RRT *rrt = &rrt_input.v[rrt_idx]; - - // obj was recompiled, do not apply RRT indirection - FileProperties obj_file_props = properties_from_file_path(obj->path); - if (rrt->obj_time_stamps[rrt_obj_idx] != obj_file_props.modified) { continue; } - - // invalidate debug section pointers - obj->debug_t_sect_idx = ~0; - obj->debug_p_sect_idx = ~0; - obj->debug_h_sect_idx = ~0; - - // apply type index map - obj->ti_range = rrt->obj_ti_ranges[rrt_obj_idx]; - obj->ti_map = rrt->obj_ti_maps [rrt_obj_idx]; - - // apply PCH info - U32 rrt_pch_obj_idx = rrt->obj_pch_indices[rrt_obj_idx]; - if (rrt_pch_obj_idx < rrt->obj_count) { - String8 rrt_pch_obj_path = rrt->obj_paths.v[rrt_pch_obj_idx]; - U64 pch_obj_idx = *hash_map_search_path_u64(&obj_path_hm, rrt_pch_obj_path); - obj->pch_ti_range = rrt->obj_pch_ti_ranges[rrt_obj_idx]; - obj->pch_obj_idx = pch_obj_idx; - } else { - obj->pch_ti_range = r1u64(0,0); - obj->pch_obj_idx = ~0; + for EachIndex(obj_idx, obj_count) { + LNK_Obj *obj = obj_arr[obj_idx]; + U64 *packed_rrt_idx = hash_map_search_path_u64(&rrt_hm, obj->path); + + // obj is not part of any input RRT + if (packed_rrt_idx == 0) { continue; } + + // unpack index + U32 rrt_idx = *packed_rrt_idx >> 32; + U32 rrt_obj_idx = *packed_rrt_idx & max_U32; + LNK_RRT *rrt = &rrt_input.v[rrt_idx]; + + // obj was recompiled, do not apply RRT 
indirection + FileProperties obj_file_props = properties_from_file_path(obj->path); + if (rrt->obj_time_stamps[rrt_obj_idx] != obj_file_props.modified) { continue; } + + // invalidate debug section pointers + obj->debug_t_sect_idx = ~0; + obj->debug_p_sect_idx = ~0; + obj->debug_h_sect_idx = ~0; + + // apply type index map + obj->ti_range = rrt->obj_ti_ranges[rrt_obj_idx]; + obj->ti_map = rrt->obj_ti_maps [rrt_obj_idx]; + + // apply PCH info + U32 rrt_pch_obj_idx = rrt->obj_pch_indices[rrt_obj_idx]; + if (rrt_pch_obj_idx < rrt->obj_count) { + String8 rrt_pch_obj_path = rrt->obj_paths.v[rrt_pch_obj_idx]; + U64 pch_obj_idx = *hash_map_search_path_u64(&obj_path_hm, rrt_pch_obj_path); + obj->pch_ti_range = rrt->obj_pch_ti_ranges[rrt_obj_idx]; + obj->pch_obj_idx = pch_obj_idx; + } else { + obj->pch_ti_range = r1u64(0,0); + obj->pch_obj_idx = ~0; + } } } ProfEnd(); @@ -781,25 +1460,35 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, ProfEnd(); // sort objs based on type: PCH, /Zi (external), /Z7 (internal) - input.debug_p_indices.v = push_array(tp_arena->v[0], U32, obj_count); + input.debug_p_indices.v = push_array(tp_arena->v[0], U32, obj_count); input.ext_obj_indices.v = push_array(tp_arena->v[0], U32, obj_count); input.int_obj_indices.v = push_array(tp_arena->v[0], U32, obj_count); - for EachIndex(obj_idx, obj_count) { - CV_DebugT *debug_t = &input.debug_t_arr[obj_idx]; - CV_DebugT *debug_p = &debug_p_arr[obj_idx]; - U32Array *arr_ptr; - if (hash_map_search_path_u64(&rrt_hm, obj_arr[obj_idx]->path)) { arr_ptr = &input.ext_obj_indices; } - else if (debug_p->count > 0 && debug_t->count == 0) { arr_ptr = &input.debug_p_indices; } - else if (cv_debug_t_is_type_server_ref(debug_t)) { arr_ptr = &input.ext_obj_indices; } - else { arr_ptr = &input.int_obj_indices; } - arr_ptr->v[arr_ptr->count++] = obj_idx; + ProfScope("Classify Objs") + { + // parallel: classify each obj + apply per-obj debug_t mutation (see lnk_cv_classify_task) + 
LNK_CvClassifyTask classify = {0}; + classify.input = &input; + classify.debug_p_arr = debug_p_arr; + classify.rrt_hm = &rrt_hm; + classify.obj_arr = obj_arr; + classify.class_tag = push_array(scratch.arena, U8, obj_count ? obj_count : 1); + classify.warn_multi = push_array(scratch.arena, U8, obj_count ? obj_count : 1); + tp_for_parallel_prof(tp, 0, obj_count, lnk_cv_classify_task, &classify, "Classify Objs (parallel)"); + + // serial obj-order compaction into the 3 ordered arrays + deterministic warning emission. + // Cache-linear single pass; preserves the exact element order + warning order of the old loop. + for EachIndex(obj_idx, obj_count) { + U32Array *arr_ptr; + switch (classify.class_tag[obj_idx]) { + case 0: arr_ptr = &input.debug_p_indices; break; + case 1: arr_ptr = &input.ext_obj_indices; break; + default: arr_ptr = &input.int_obj_indices; break; + } + arr_ptr->v[arr_ptr->count++] = obj_idx; - if (debug_t->count == 0 && debug_p->count > 0) { - *debug_t = *debug_p; - } else if (debug_t->count && debug_p->count) { - lnk_error_obj(LNK_Warning_MultipleDebugTAndDebugP, obj_arr[obj_idx], "multiple sections with debug types detected, obj must have either .debug$T or .debug$P; discarding both sections"); - MemoryZeroStruct(debug_t); - MemoryZeroStruct(debug_p); + if (classify.warn_multi[obj_idx]) { + lnk_error_obj(LNK_Warning_MultipleDebugTAndDebugP, obj_arr[obj_idx], "multiple sections with debug types detected, obj must have either .debug$T or .debug$P; discarding both sections"); + } } } @@ -1059,6 +1748,12 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, } ProfEnd(); + // resolve MSVC header-unit IFC debug records (LF_IFC_RECORD 0x1522) -> real CodeView types. + // injects .ifc debug-records blobs as extra objs and registers placeholder-TI redirects. 
+ if (config->ifc_debug_records == LNK_SwitchState_Yes && config->ifc_map_list.node_count) { + lnk_apply_ifc_debug_records(tp, tp_arena, &input, config); + } + // set default min type index for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { input.min_type_indices[ti_source] = CV_MinComplexTypeIndex; } @@ -1075,25 +1770,24 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, ProfBegin("Make Symbol Inputs"); { - // count symbol blocks - for EachIndex(obj_idx, input.count) { - String8List s = cv_sub_section_from_debug_s(input.debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols); - input.symbol_input_count += s.node_count; - } - + // count symbol blocks (cache each obj's Symbols sub-section list so the fill pass below + // does not re-decode .debug$S a second time -- cv_sub_section_from_debug_s walks subsections). + String8List *per_obj_syms = push_array(scratch.arena, String8List, input.count ? input.count : 1); + LNK_CvSymTask sym_task = {0}; + sym_task.input = &input; + sym_task.per_obj_syms = per_obj_syms; + sym_task.counts = push_array(scratch.arena, U64, input.count ? input.count : 1); + + // parallel: cache each obj's Symbols sub-section list + count nodes + tp_for_parallel_prof(tp, 0, input.count, lnk_cv_sym_count_task, &sym_task, "Count Symbol Inputs"); + input.symbol_input_count = sum_array_u64(input.count, sym_task.counts); + sym_task.offsets = offsets_from_counts_array_u64(scratch.arena, sym_task.counts, input.count); + // alloc block pointers - input.symbol_inputs = push_array_no_zero(tp_arena->v[0], LNK_SymbolInput, input.symbol_input_count); + input.symbol_inputs = push_array_no_zero(tp_arena->v[0], LNK_SymbolInput, input.symbol_input_count ? 
input.symbol_input_count : 1); - U64 symbol_input_count = 0; - for EachIndex(obj_idx, input.count) { - String8List s = cv_sub_section_from_debug_s(input.debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols); - for EachNode(n, String8Node, s.first) { - Assert(symbol_input_count < input.symbol_input_count); - LNK_SymbolInput *in = &input.symbol_inputs[symbol_input_count++]; - in->obj_idx = obj_idx; - in->raw_symbols = n->string; - } - } + // parallel fill into disjoint per-obj ranges at prefix-sum offsets (byte-identical order) + tp_for_parallel_prof(tp, 0, input.count, lnk_cv_sym_fill_task, &sym_task, "Fill Symbol Inputs"); ProfBegin("Make Ranges"); U64 total_input_size = 0; @@ -1124,6 +1818,16 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, internal LNK_LeafRef lnk_leaf_ref_from_ti(LNK_CodeViewInput *input, U32 obj_idx, CV_TypeIndexSource source, CV_TypeIndex ti) { + // IFC redirect: a consuming obj's local LF_IFC_RECORD placeholder TI is mapped + // to a leaf inside an injected .ifc debug-records blob obj. The blob leaves then + // dedup/hash/fixup natively through the rest of this function. + if (input->has_ifc_redirects && source == CV_TypeIndexSource_TPI) { + U64 *packed = hash_map_search_u64_u64(&input->ifc_redirect_hm, Compose64Bit(obj_idx, ti)); + if (packed) { + return (LNK_LeafRef){ (U32)(*packed >> 32), (U32)(*packed & max_U32) }; + } + } + // ti range: external type server U64 ts_idx = input->obj_to_ts[obj_idx]; if (ts_idx != max_U64) { @@ -1370,29 +2074,23 @@ lnk_hash_cv_leaf_deep(Arena *arena, temp_end(temp); } -internal LNK_LeafRef * -lnk_leaf_hash_table_search(LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) +// Returns the type index assigned to leaf_ref's deduped class, or 0 if absent. 
Dedup is by hash +// (lnk_match_leaf_ref is a_hash==b_hash), so the assigned-ti table is keyed purely by hash -- a single +// probe, deref-free (the occupant hash is on the slot), and sized to the UNIQUE leaf count. +internal CV_TypeIndex +lnk_leaf_hash_table_search_ti(LNK_AssignedTiHash *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref) { - LNK_LeafRef *match = 0; - - CV_DebugT *debug_t = &input->debug_t_arr[leaf_ref.obj_idx]; - CV_DebugH *debug_h = &input->debug_h_arr[leaf_ref.obj_idx]; - U64 hash = debug_h->v[leaf_ref.leaf_idx]; - U64 best_bucket_idx = hash % ht->cap; - U64 bucket_idx = best_bucket_idx; + CV_DebugH *debug_h = &input->debug_h_arr[leaf_ref.obj_idx]; + U64 hash = debug_h->v[leaf_ref.leaf_idx]; + U64 best_idx = hash & (ht->cap - 1); // cap is pow2 + U64 idx = best_idx; do { - LNK_LeafRef *bucket = ht->bucket_arr[bucket_idx]; - if (bucket == 0) { break; } - - if (lnk_match_leaf_ref(input, *bucket, leaf_ref)) { - match = bucket; - break; - } - - bucket_idx = (bucket_idx + 1) == ht->cap ? 0 : (bucket_idx + 1); - } while (bucket_idx != best_bucket_idx); + if (ht->ti_arr[idx] == 0) { break; } // empty slot -> not present + if (ht->hash_arr[idx] == hash) { return ht->ti_arr[idx]; } + idx = (idx + 1) == ht->cap ? 0 : (idx + 1); + } while (idx != best_idx); - return match; + return 0; } internal @@ -1419,8 +2117,12 @@ THREAD_POOL_TASK_FUNC(lnk_hash_debug_t_deep_task) LNK_MergeTypes *task = raw_task; U64 obj_idx = task->indices.v[task_id]; CV_DebugT *debug_t = &task->input->debug_t_arr[obj_idx]; + // on-demand IFC pruning: leaves the closure didn't reach were rewritten to NOTYPE. + // skip them as deep-hash roots so we don't hash/bucket ~all of the ~1.5M blob leaves. 
+ B32 is_ifc_blob = task->input->has_ifc_redirects && contains_1u64(task->input->ifc_obj_range, obj_idx); for EachIndex(leaf_idx, debug_t->count) { if (task->input->debug_h_arr[obj_idx].v[leaf_idx] != 0) { continue; } + if (is_ifc_blob && cv_debug_t_get_leaf_header(debug_t, leaf_idx)->kind == CV_LeafKind_NOTYPE) { continue; } Temp temp = temp_begin(task->fixed_arenas[worker_id]); CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, leaf_idx); CV_TypeIndexInfoList ti_list = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data); @@ -1454,7 +2156,7 @@ THREAD_POOL_TASK_FUNC(lnk_populate_leaf_ht) CV_LeafKind kind = memory_read16(MemberFromPtr(CV_LeafHeader, header, kind)); // leaf header -> leaf kind CV_TypeIndexSource leaf_source = cv_type_index_source_from_leaf_kind(kind); // leaf kind -> type stream LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[leaf_source]; // type stream -> hash table - U64 best_idx = debug_h->v[leaf_idx] % leaf_ht->cap; // leaf ref -> hash -> bucket index + U64 best_idx = debug_h->v[leaf_idx] & (leaf_ht->cap - 1); // leaf ref -> hash -> bucket index (cap pow2) U64 idx = best_idx; do { // load leaf ref @@ -1495,8 +2197,14 @@ THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_task) CV_DebugH *debug_h = &task->input->debug_h_arr[obj_idx]; ProfBeginDynamic("dedup in obj 0x%llx (%.*s) leaf count %llu", obj_idx, str8_varg(task->input->obj_arr[obj_idx]->path), debug_t->count); + // on-demand IFC pruning: NOTYPE'd blob leaves were never hashed (hash==0). skip them so we + // don't insert ~1.5M zero-hash leaves into the NULL-source table (whose cap is sized from the + // pre-prune source_counts and would overflow). they are never emitted, so nothing references them. 
+ B32 is_ifc_blob = task->input->has_ifc_redirects && contains_1u64(task->input->ifc_obj_range, obj_idx); + LNK_LeafRef *bucket = 0; for EachIndex(leaf_idx, debug_t->count) { + if (is_ifc_blob && cv_debug_t_get_leaf_header(debug_t, leaf_idx)->kind == CV_LeafKind_NOTYPE) { continue; } // alloc new bucket and assign type ref if (bucket == 0) { bucket = push_array_no_zero(arena, LNK_LeafRef, 1); } @@ -1509,7 +2217,7 @@ THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_task) CV_LeafKind kind = memory_read16(MemberFromPtr(CV_LeafHeader, header, kind)); // leaf header -> leaf kind CV_TypeIndexSource leaf_source = cv_type_index_source_from_leaf_kind(kind); // leaf kind -> type stream LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[leaf_source]; // type stream -> hash table - U64 best_idx = debug_h->v[leaf_idx] % leaf_ht->cap; // leaf ref -> hash -> bucket index + U64 best_idx = debug_h->v[leaf_idx] & (leaf_ht->cap - 1); // leaf ref -> hash -> bucket index (cap pow2) U64 idx = best_idx; do { // load leaf ref @@ -1744,57 +2452,39 @@ THREAD_POOL_TASK_FUNC(lnk_assign_type_indices_task) { LNK_MergeTypes *task = raw_task; - CV_TypeIndexSource ti_source = task->ti_source; - LNK_LeafRefArray unique_leaf_refs = task->unique_leaf_refs_arr[ti_source]; - CV_TypeIndex min_type_index = task->min_type_indices[ti_source]; - U64 assigned_type_cap = task->assigned_type_caps[ti_source]; - CV_TypeIndex *assigned_type_ht = task->assigned_type_hts[ti_source]; + CV_TypeIndexSource ti_source = task->ti_source; + LNK_LeafRefArray unique_leaf_refs = task->unique_leaf_refs_arr[ti_source]; + CV_TypeIndex min_type_index = task->min_type_indices[ti_source]; + LNK_AssignedTiHash *at = &task->assigned_ti_arr[ti_source]; + CV_DebugH *debug_h_arr = task->input->debug_h_arr; + // Insert each unique leaf's assigned type index into the (unique-sized) hash->ti table, keyed by leaf + // hash. Unique leaves have distinct hashes (dedup is by hash), so each claims its own empty slot. 
+ // search_ti (in the later fixup phase, after this barrier) recovers ti in one deref-free probe. for EachInRange(i, task->ranges[task_id]) { LNK_LeafRef *leaf_ref = unique_leaf_refs.v[i]; CV_TypeIndex type_index = min_type_index + i; - U64 hash = u64_hash_from_str8(str8_struct(leaf_ref)); - U64 best_idx = hash % assigned_type_cap; + U64 hash = debug_h_arr[leaf_ref->obj_idx].v[leaf_ref->leaf_idx]; + U64 best_idx = hash & (at->cap - 1); // cap is pow2 U64 idx = best_idx; - B32 is_inserted = 0; + B32 is_assigned = 0; do { - CV_TypeIndex curr_type_index = assigned_type_ht[idx]; - if (curr_type_index == 0) { - CV_TypeIndex cmp_type_index = ins_atomic_u32_eval_cond_assign(&assigned_type_ht[idx], type_index, curr_type_index); - if (cmp_type_index == curr_type_index) { - is_inserted = 1; + if (at->ti_arr[idx] == 0) { + CV_TypeIndex cmp = ins_atomic_u32_eval_cond_assign(&at->ti_arr[idx], type_index, 0); + if (cmp == 0) { + at->hash_arr[idx] = hash; // only this worker owns the slot now; read back in the fixup phase + is_assigned = 1; break; } } - // advance - idx = (idx + 1) == assigned_type_cap ? 0 : (idx + 1); + idx = (idx + 1) == at->cap ? 0 : (idx + 1); } while (idx != best_idx); - Assert(is_inserted); + Assert(is_assigned); } } -internal CV_TypeIndex -lnk_assigned_type_ht_search(U64 cap, CV_TypeIndex *ht, CV_TypeIndex min_type_index, LNK_LeafRefArray unique_leaf_refs, LNK_LeafRef *v, U64 hash) -{ - U64 best_idx = hash % cap; - U64 idx = best_idx; - do { - CV_TypeIndex type_index = ht[idx]; - if (type_index < min_type_index) { break; } - - U64 leaf_idx = type_index - min_type_index; - LNK_LeafRef *compar = unique_leaf_refs.v[leaf_idx]; - if (MemoryMatchStruct(compar,v)) { return type_index; } - - idx = (idx + 1) == cap ? 
0 : (idx + 1); - } while(idx != best_idx); - - InvalidPath; - return 0; -} - internal void lnk_fixup_cv_type_indices(LNK_MergeTypes *ctx, U32 obj_idx, String8 data, CV_TypeIndexInfoList ti_info_list) { @@ -1805,21 +2495,11 @@ lnk_fixup_cv_type_indices(LNK_MergeTypes *ctx, U32 obj_idx, String8 data, CV_Typ // skip basic types if (ti < ctx->input->min_type_indices[n->source]) { continue; } - CV_TypeIndex final_ti = 0; - LNK_LeafRef leaf_ref = lnk_leaf_ref_from_ti(ctx->input, obj_idx, n->source, ti); - LNK_LeafHashTable *leaf_ht = &ctx->leaf_ht_arr[n->source]; - LNK_LeafRef *final_leaf = lnk_leaf_hash_table_search(leaf_ht, ctx->input, leaf_ref); - if (final_leaf) { - U64 final_hash = u64_hash_from_str8(str8_struct(final_leaf)); - final_ti = lnk_assigned_type_ht_search(ctx->assigned_type_caps [n->source], - ctx->assigned_type_hts [n->source], - ctx->min_type_indices [n->source], - ctx->unique_leaf_refs_arr[n->source], - final_leaf, - final_hash); - } + LNK_LeafRef leaf_ref = lnk_leaf_ref_from_ti(ctx->input, obj_idx, n->source, ti); + LNK_AssignedTiHash *assigned = &ctx->assigned_ti_arr[n->source]; + CV_TypeIndex final_ti = lnk_leaf_hash_table_search_ti(assigned, ctx->input, leaf_ref); #if BUILD_DEBUG - else { + if (final_ti == 0) { lnk_error_obj(LNK_Error_InvalidTypeIndex, ctx->input->obj_arr[obj_idx], "no itype 0x%x", ti); } #endif @@ -1988,23 +2668,9 @@ THREAD_POOL_TASK_FUNC(lnk_build_obj_ti_map) for EachIndex(leaf_idx, debug_t->count) { CV_Leaf leaf = cv_debug_t_get_leaf(debug_t, leaf_idx); CV_TypeIndexSource source = cv_type_index_source_from_leaf_kind(leaf.kind); - LNK_LeafRef leaf_ref = { obj_idx, leaf_idx }; - LNK_LeafHashTable *leaf_ht = &task->leaf_ht_arr[source]; - LNK_LeafRef *final_leaf = lnk_leaf_hash_table_search(leaf_ht, input, leaf_ref); - - if (final_leaf) { - U64 final_hash = u64_hash_from_str8(str8_struct(final_leaf)); - CV_TypeIndex final_ti = lnk_assigned_type_ht_search(task->assigned_type_caps [source], - task->assigned_type_hts [source], - 
task->min_type_indices [source], - task->unique_leaf_refs_arr[source], - final_leaf, - final_hash); - - obj_ti_map[leaf_idx] = final_ti; - } else { - obj_ti_map[leaf_idx] = 0; - } + LNK_LeafRef leaf_ref = { obj_idx, leaf_idx }; + LNK_AssignedTiHash *assigned = &task->assigned_ti_arr[source]; + obj_ti_map[leaf_idx] = lnk_leaf_hash_table_search_ti(assigned, input, leaf_ref); } task->result.obj_ti_maps[obj_idx] = obj_ti_map; @@ -2028,6 +2694,7 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK U32Array indices; U32Array hash_indices; } hash_targets[] = { + { lnk_hash_debug_t_deep_task, input->ifc_indices }, // hash .ifc blobs first: int-obj leaves redirect into them { lnk_hash_debug_t_task, input->debug_p_indices }, // hash .debug$P first so we can mix in hashes for precompiled sub leaves when hashing leaves in .debug$T { lnk_hash_debug_t_task, input->int_obj_indices }, { lnk_hash_debug_t_deep_task, input->type_server_indices }, @@ -2084,14 +2751,24 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK } ProfEnd(); + // bucket_arr (the ~1.3x-total-leaf-count probe tables) is only live through the dedup + extract + // phases: its last read is in lnk_get_present_buckets_task ("Copy present buckets") which copies + // bucket pointers into unique_leaf_refs. Allocate it in a dedicated arena so we can release that + // multi-GB working set immediately after the extract loop, before the merge-types/PDB-build peak. + // (A temp_begin on scratch.arena would not work: many surviving allocations -- unique_leaf_refs, + // assigned_ti, radix scratch -- land in scratch.arena after bucket_arr.) 
+ Arena *bucket_arena = arena_alloc(.name = "LEAF_BUCKETS"); + ProfBegin("Leaf Hash Table Init"); for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { U64 total_count = 0; for EachIndex(obj_idx, input->count) { total_count += input->debug_t_arr[obj_idx].source_counts[ti_source]; } task.leaf_ht_arr[ti_source].cap = total_count; - task.leaf_ht_arr[ti_source].cap = 1 + ((task.leaf_ht_arr[ti_source].cap * 13) / 10); // * 1.3 - task.leaf_ht_arr[ti_source].bucket_arr = push_array(scratch.arena, LNK_LeafRef *, task.leaf_ht_arr[ti_source].cap); + // pow2 cap so bucket index is hash & (cap-1) (mask) instead of hash % cap (a 64-bit DIV in the + // densest dedup probe loop). u64_up_to_pow2(1.3*count) keeps load factor <= ~0.65. + task.leaf_ht_arr[ti_source].cap = u64_up_to_pow2(1 + ((task.leaf_ht_arr[ti_source].cap * 13) / 10)); // * 1.3, pow2 + task.leaf_ht_arr[ti_source].bucket_arr = push_array(bucket_arena, LNK_LeafRef *, task.leaf_ht_arr[ti_source].cap); #if PROFILE_TELEMETRY tmMessage(0, TMMF_ICON_NOTE, "%.*s Bucket Count: %.*s", str8_varg(cv_string_from_type_index_source(ti_source)), str8_varg(str8_from_count(scratch.arena, task.leaf_ht_arr[ti_source].cap))); @@ -2136,6 +2813,9 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK task.indices = dedup_type_server_indices; tp_for_parallel_prof(tp, tp_temp, task.indices.count, lnk_leaf_dedup_task, &task, "Type Servers"); + + task.indices = input->ifc_indices; + tp_for_parallel_prof(tp, tp_temp, task.indices.count, lnk_leaf_dedup_task, &task, "IFC Blobs"); ProfEnd(); ProfBegin("Extract present buckets from the leaf hash tables"); @@ -2148,6 +2828,12 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK task.unique_leaf_refs_arr[ti_source].count = sum_array_u64(tp->worker_count, task.counts[ti_source]); task.unique_leaf_refs_arr[ti_source].v = push_array_no_zero(scratch.arena, LNK_LeafRef *, task.unique_leaf_refs_arr[ti_source].count); + + // assigned-ti table 
sized to the unique (deduped) count -- not the total leaf count, which would + // add the bucket-parallel ti/hash arrays' worth of peak working set (~3GB on large links) + task.assigned_ti_arr[ti_source].cap = u64_up_to_pow2(1 + ((task.unique_leaf_refs_arr[ti_source].count * 13) / 10)); // * 1.3, pow2 -> mask index + task.assigned_ti_arr[ti_source].ti_arr = push_array(scratch.arena, CV_TypeIndex, task.assigned_ti_arr[ti_source].cap); + task.assigned_ti_arr[ti_source].hash_arr = push_array(scratch.arena, U64, task.assigned_ti_arr[ti_source].cap); task.offsets[ti_source] = offsets_from_counts_array_u64(scratch.arena, task.counts[ti_source], tp->worker_count); tp_for_parallel_prof(tp, 0, tp->worker_count, lnk_get_present_buckets_task, &task, "Copy present buckets"); @@ -2234,6 +2920,11 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK } } + // bucket_arr is fully consumed (copied into unique_leaf_refs / sorted) -- release the probe tables + // now so this multi-GB working set is gone before the merge-types/PDB-build peak. 
+ arena_release(bucket_arena); + for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { task.leaf_ht_arr[ti_source].bucket_arr = 0; } + #if PROFILE_TELEMETRY tmMessage(0, TMMF_ICON_NOTE, "TPI Count: %.*s", str8_varg(str8_from_count(scratch.arena, task.unique_leaf_refs_arr[CV_TypeIndexSource_TPI].count))); tmMessage(0, TMMF_ICON_NOTE, "IPI Count: %.*s", str8_varg(str8_from_count(scratch.arena, task.unique_leaf_refs_arr[CV_TypeIndexSource_IPI].count))); @@ -2246,8 +2937,6 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK ProfBegin("Assign type indices"); for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { task.ti_source = ti_source; - task.assigned_type_caps[ti_source] = (task.unique_leaf_refs_arr[ti_source].count * 13) / 10; - task.assigned_type_hts [ti_source] = push_array(scratch.arena, CV_TypeIndex, task.assigned_type_caps[ti_source]); task.min_type_indices [ti_source] = CV_MinComplexTypeIndex; task.ranges = tp_divide_work(scratch.arena, task.unique_leaf_refs_arr[ti_source].count, tp->worker_count); tp_for_parallel_prof(tp, 0, tp->worker_count, lnk_assign_type_indices_task, &task, "Assign Type Indices"); @@ -3132,6 +3821,321 @@ THREAD_POOL_TASK_FUNC(lnk_push_dbi_sec_contrib_task) } } +//////////////////////////////// +// Type Garbage Collection +// +// After type merging, prune merged TPI/IPI leaves that are not reachable from any surviving +// symbol record (the GC roots), compact them, and remap all type indices. link.exe keeps a +// large unreferenced-type set; pruning it is a transparent PDB-size win (debug-info only -- the +// image is untouched). Runs on the final post-fixup type indices in place. 
+ +typedef struct LNK_GCTypes +{ + LNK_CodeViewInput *cv; + U64 min [CV_TypeIndexSource_COUNT]; // first type index per source + U64 orig_n[CV_TypeIndexSource_COUNT]; // pre-GC leaf count per source + U8 *mark [CV_TypeIndexSource_COUNT]; // reachable bitmap, indexed by (ti - min) + CV_TypeIndex *remap [CV_TypeIndexSource_COUNT]; // old leaf idx -> new type index + U8 **leaf_v [CV_TypeIndexSource_COUNT]; // original leaf pointer arrays + U32 *udt_next; // TPI fwdref<->definition unique_name ring + Rng1U64 *sym_ranges; + B32 do_rewrite; // 0 = mark roots, 1 = rewrite to compacted indices + // transitive-closure frontier: indices marked but not yet expanded. Each leaf is appended once + // (the atomic mark gates it), so frontier[s] is sized orig_n[s] and fcount[s] is its atomic tail. + U32 *frontier[CV_TypeIndexSource_COUNT]; + U32 *fcount [CV_TypeIndexSource_COUNT]; // atomic append cursor per source + // per-source scratch (set before dispatch) + CV_TypeIndexSource cur_source; + U8 **cur_leaf_v; + Rng1U64 *cur_ranges; + U64 round_begin, round_end; // frontier slice processed this round +} LNK_GCTypes; + +typedef struct LNK_GCNamePair { U64 hash; U32 idx; } LNK_GCNamePair; + +internal int +lnk_gc_name_pair_is_before(void *raw_a, void *raw_b) +{ + LNK_GCNamePair *a = raw_a, *b = raw_b; + return a->hash != b->hash ? 
(a->hash < b->hash) : (a->idx < b->idx); +} + +internal void +lnk_gc_mark_ti(LNK_GCTypes *g, CV_TypeIndexSource s, CV_TypeIndex ti) +{ + U64 lo = g->min[s]; + if (ti >= lo) { U64 idx = ti - lo; if (idx < g->orig_n[s]) { g->mark[s][idx] = 1; } } +} + +// walk a record's type-index sites; mark roots (do_rewrite==0) or rewrite to compacted indices (==1) +internal void +lnk_gc_visit_offsets(LNK_GCTypes *g, String8 data, CV_TypeIndexInfoList ti_info_list) +{ + for EachNode(n, CV_TypeIndexInfo, ti_info_list.first) { + U8 *p = data.str + n->offset; + CV_TypeIndex ti = memory_read32(p); + if (g->do_rewrite) { + U64 lo = g->min[n->source]; + if (ti >= lo) { U64 idx = ti - lo; if (idx < g->orig_n[n->source]) { memory_write32(p, g->remap[n->source][idx]); } } + } else { + lnk_gc_mark_ti(g, n->source, ti); + } + } +} + +internal +THREAD_POOL_TASK_FUNC(lnk_gc_syms_task) +{ + LNK_GCTypes *g = raw_task; + Temp scratch = scratch_begin(0, 0); + for EachInRange(i, g->sym_ranges[task_id]) { + LNK_SymbolInput symbols = g->cv->symbol_inputs[i]; + for (U64 cursor = 0; cursor + sizeof(CV_SymbolHeader) <= symbols.raw_symbols.size; ) { + Temp temp = temp_begin(scratch.arena); + CV_Symbol symbol = {0}; + TryReadBreak(cv_read_symbol(symbols.raw_symbols, cursor, CV_SymbolAlign, &symbol), cursor); + CV_TypeIndexInfoList l = cv_get_symbol_type_index_offsets(temp.arena, symbol.kind, symbol.data); + lnk_gc_visit_offsets(g, symbol.data, l); + temp_end(temp); + } + } + scratch_end(scratch); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_gc_inlines_task) +{ + LNK_GCTypes *g = raw_task; + U64 obj_idx = task_id; + Temp scratch = scratch_begin(0, 0); + String8List inlinee_lines = cv_sub_section_from_debug_s(g->cv->debug_s_arr[obj_idx], CV_C13SubSectionKind_InlineeLines); + for EachNode(dn, String8Node, inlinee_lines.first) { + Temp temp = temp_begin(scratch.arena); + CV_TypeIndexInfoList l = cv_get_inlinee_type_index_offsets(temp.arena, dn->string); + lnk_gc_visit_offsets(g, dn->string, l); + 
temp_end(temp); + } + scratch_end(scratch); +} + +internal +THREAD_POOL_TASK_FUNC(lnk_gc_rewrite_leaves_task) +{ + LNK_GCTypes *g = raw_task; + Temp scratch = scratch_begin(0, 0); + for EachInRange(i, g->cur_ranges[task_id]) { + Temp temp = temp_begin(scratch.arena); + CV_Leaf leaf = cv_leaf_from_ptr(g->cur_leaf_v[i]); + CV_TypeIndexInfoList l = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data); + lnk_gc_visit_offsets(g, leaf.data, l); + temp_end(temp); + } + scratch_end(scratch); +} + +typedef struct LNK_GCRingTask +{ + U8 **leaf_v; + Rng1U64 *ranges; + U64 *counts; + U64 *offsets; + LNK_GCNamePair *pairs; +} LNK_GCRingTask; + +// parallel: count UDT leaves with a unique_name per range (pass 0) / emit (hash,idx) pairs (pass 1) +internal +THREAD_POOL_TASK_FUNC(lnk_gc_ring_count_task) +{ + LNK_GCRingTask *t = raw_task; + U64 n = 0; + for EachInRange(i, t->ranges[task_id]) { + CV_Leaf leaf = cv_leaf_from_ptr(t->leaf_v[i]); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data); + if (ui.props & CV_TypeProp_HasUniqueName) { n += 1; } + } + } + t->counts[task_id] = n; +} + +internal +THREAD_POOL_TASK_FUNC(lnk_gc_ring_fill_task) +{ + LNK_GCRingTask *t = raw_task; + U64 cur = t->offsets[task_id]; + for EachInRange(i, t->ranges[task_id]) { + CV_Leaf leaf = cv_leaf_from_ptr(t->leaf_v[i]); + if (cv_is_udt(leaf.kind)) { + CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data); + if (ui.props & CV_TypeProp_HasUniqueName) { + U64 h = 14695981039346656037ull; + for EachIndex(c, ui.unique_name.size) { h = (h ^ ui.unique_name.str[c]) * 0x100000001b3ull; } + t->pairs[cur].hash = h; t->pairs[cur].idx = (U32)i; cur += 1; + } + } + } +} + +// mark a leaf reachable and, if this is its first mark, append it to its source's frontier so a +// later round expands it. Atomic mark gates the append, so each leaf lands on the frontier once. 
+internal void +lnk_gc_mark_enqueue(LNK_GCTypes *g, CV_TypeIndexSource ns, U64 ci) +{ + if (ci >= g->orig_n[ns]) { return; } + if (g->mark[ns][ci]) { return; } // fast non-atomic skip: already reachable (the common edge) + // only the worker that wins the 0->1 transition appends, so the atomic runs once per leaf (not + // once per reference edge). + if (!ins_atomic_u8_eval_assign(&g->mark[ns][ci], 1)) { + U32 pos = ins_atomic_u32_inc_eval(g->fcount[ns]) - 1; + g->frontier[ns][pos] = (U32)ci; + } +} + +// one bulk-synchronous round of transitive closure: expand the frontier slice [round_begin, +// round_end) of cur_source -- visit each leaf and enqueue the leaves it references (and its +// unique_name UDT counterparts). Frontier-driven, so total work is O(reachable leaves), not +// O(rounds * total leaves). +internal +THREAD_POOL_TASK_FUNC(lnk_gc_expand_task) +{ + LNK_GCTypes *g = raw_task; + CV_TypeIndexSource s = g->cur_source; + Temp scratch = scratch_begin(0, 0); + for EachInRange(local, g->cur_ranges[task_id]) { + U32 i = g->frontier[s][g->round_begin + local]; + + Temp temp = temp_begin(scratch.arena); + CV_Leaf leaf = cv_leaf_from_ptr(g->leaf_v[s][i]); + CV_TypeIndexInfoList l = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data); + for EachNode(n, CV_TypeIndexInfo, l.first) { + CV_TypeIndex ti = memory_read32(leaf.data.str + n->offset); + U64 lo = g->min[n->source]; + if (ti >= lo) { lnk_gc_mark_enqueue(g, n->source, ti - lo); } + } + temp_end(temp); + + if (s == CV_TypeIndexSource_TPI) { + for (U32 j = g->udt_next[i]; j != i; j = g->udt_next[j]) { + lnk_gc_mark_enqueue(g, CV_TypeIndexSource_TPI, j); + } + } + } + scratch_end(scratch); +} + +internal void +lnk_gc_types(TP_Context *tp, Arena *arena, LNK_CodeViewInput *cv, LNK_MergedTypes *types) +{ + ProfBeginFunction(); + Temp scratch = scratch_begin(&arena, 1); + + LNK_GCTypes g = {0}; + g.cv = cv; + U64 total_leaves = 0; + for EachIndex(s, CV_TypeIndexSource_COUNT) { + g.min[s] = 
types->min_type_indices[s]; + g.orig_n[s] = types->count[s]; + g.leaf_v[s] = types->v[s]; + g.mark[s] = push_array(scratch.arena, U8, g.orig_n[s] ? g.orig_n[s] : 1); // zeroed + total_leaves += g.orig_n[s]; + } + + // mark roots: every type index referenced by a surviving symbol / inlinee record + g.do_rewrite = 0; + g.sym_ranges = tp_divide_work(scratch.arena, cv->symbol_input_count, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_syms_task, &g); + tp_for_parallel(tp, 0, cv->obj_count, lnk_gc_inlines_task, &g); + + // link UDT leaves that share a unique_name into rings, so that marking any one (e.g. a + // forward ref reached as a member-pointer target) also keeps its full definition -- needed + // for the debugger to complete types referenced only by name. TPI only (IPI has no UDTs). + U64 n_tpi = g.orig_n[CV_TypeIndexSource_TPI]; + g.udt_next = push_array_no_zero(scratch.arena, U32, n_tpi ? n_tpi : 1); + for EachIndex(i, n_tpi) { g.udt_next[i] = (U32)i; } + { + LNK_GCRingTask rt = {0}; + rt.leaf_v = g.leaf_v[CV_TypeIndexSource_TPI]; + rt.ranges = tp_divide_work(scratch.arena, n_tpi, tp->worker_count); + rt.counts = push_array(scratch.arena, U64, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_ring_count_task, &rt); + rt.offsets = offsets_from_counts_array_u64(scratch.arena, rt.counts, tp->worker_count); + U64 np = sum_array_u64(tp->worker_count, rt.counts); + rt.pairs = push_array_no_zero(scratch.arena, LNK_GCNamePair, np ? np : 1); + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_ring_fill_task, &rt); + + radsort(rt.pairs, np, lnk_gc_name_pair_is_before); + for (U64 a = 0; a < np; ) { + U64 b = a + 1; + while (b < np && rt.pairs[b].hash == rt.pairs[a].hash) { b += 1; } + for (U64 k = a; k < b; k += 1) { g.udt_next[rt.pairs[k].idx] = rt.pairs[(k + 1 < b) ? 
(k + 1) : a].idx; } + a = b; + } + } + + // transitive closure (parallel, bulk-synchronous): repeat rounds that visit each + // marked-but-unexpanded leaf and mark what it references, until a round marks nothing new + // seed the frontier with the root-marked leaves (one O(total leaves) scan), then expand + // frontier slices until both sources drain. Each leaf is expanded exactly once. + U32 fcount[CV_TypeIndexSource_COUNT] = {0}; + U64 start [CV_TypeIndexSource_COUNT] = {0}; + for EachIndex(s, CV_TypeIndexSource_COUNT) { + g.frontier[s] = push_array_no_zero(scratch.arena, U32, g.orig_n[s] ? g.orig_n[s] : 1); + g.fcount[s] = &fcount[s]; + for EachIndex(i, g.orig_n[s]) { if (g.mark[s][i]) { g.frontier[s][fcount[s]++] = (U32)i; } } + } + for (;;) { + B32 any = 0; + for EachIndex(s, CV_TypeIndexSource_COUNT) { + U64 begin = start[s], end = fcount[s]; // fcount may grow during the round (cross-source enqueues) + if (begin < end) { + any = 1; + g.cur_source = (CV_TypeIndexSource)s; + g.round_begin = begin; + g.round_end = end; + g.cur_ranges = tp_divide_work(scratch.arena, end - begin, tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_expand_task, &g); + start[s] = end; + } + } + if (!any) { break; } + } + + // compact each source: assign new contiguous type indices to the kept leaves. The leaf + // pointer array is compacted IN PLACE (kept count only shrinks, so the write cursor never + // passes the read cursor) and remap lives in scratch -- so the GC adds nothing to the arena + // that survives into the (peak) PDB build. + U64 kept_total = 0; + for EachIndex(s, CV_TypeIndexSource_COUNT) { + g.remap[s] = push_array_no_zero(scratch.arena, CV_TypeIndex, g.orig_n[s] ? 
g.orig_n[s] : 1); + U8 **v = g.leaf_v[s]; + U64 new_n = 0; + for EachIndex(idx, g.orig_n[s]) { + if (g.mark[s][idx]) { g.remap[s][idx] = (CV_TypeIndex)(g.min[s] + new_n); v[new_n++] = v[idx]; } + else { g.remap[s][idx] = 0; /* T_NOTYPE; never referenced by a kept record */ } + } + types->count[s] = new_n; + kept_total += new_n; + } + + // rewrite all type-index references to the compacted indices + g.do_rewrite = 1; + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_syms_task, &g); + tp_for_parallel(tp, 0, cv->obj_count, lnk_gc_inlines_task, &g); + for EachIndex(s, CV_TypeIndexSource_COUNT) { + g.cur_source = (CV_TypeIndexSource)s; + g.cur_leaf_v = types->v[s]; + g.cur_ranges = tp_divide_work(scratch.arena, types->count[s], tp->worker_count); + tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_rewrite_leaves_task, &g); + } + + if (lnk_get_log_status(LNK_Log_Debug)) { + lnk_log(LNK_Log_Debug, "type GC: kept %llu of %llu leaves (pruned %llu)", kept_total, total_leaves, total_leaves - kept_total); + } + + scratch_end(scratch); + ProfEnd(); +} + internal String8List lnk_build_pdb(TP_Context *tp, TP_Arena *tp_arena, String8 image_data, LNK_Config *config, LNK_SymbolTable *symtab, LNK_CodeViewInput *cv, LNK_MergedTypes cv_types, LNK_PDB_BuilderFlags builder_flags) { diff --git a/src/linker/lnk_debug_info.h b/src/linker/lnk_debug_info.h index dc094e0f9..ee6cd37da 100644 --- a/src/linker/lnk_debug_info.h +++ b/src/linker/lnk_debug_info.h @@ -98,6 +98,16 @@ typedef struct U64 symbol_input_count; LNK_SymbolInput *symbol_inputs; // [symbol_input_count] Rng1U64 *symbol_input_ranges; // [worker_count] + + // IFC (header-unit debug-record) resolution: + // redirects a consuming obj's local LF_IFC_RECORD placeholder TI to a leaf in + // an injected .ifc debug-records blob "obj". Consulted first in lnk_leaf_ref_from_ti. 
+ // key = Compose64Bit(obj_idx, local_ti) + // value = Compose64Bit(blob_obj_idx, blob_leaf_idx) + B32 has_ifc_redirects; + HashMap ifc_redirect_hm; + Rng1U64 ifc_obj_range; // [min,max) range of injected blob objs in the parallel arrays + U32Array ifc_indices; // obj indices of injected .ifc blob objs (hashed/deduped first) } LNK_CodeViewInput; typedef struct @@ -115,10 +125,20 @@ typedef struct { U64 count; LNK_LeafRef **v; } LNK_LeafRefArray; typedef struct { - U64 cap; + U64 cap; // ~1.3x total (pre-dedup) leaf count LNK_LeafRef **bucket_arr; } LNK_LeafHashTable; +// Maps leaf hash -> assigned type index, sized to the UNIQUE (post-dedup) leaf count rather than the +// total leaf count -- folds the canonical-bucket + bucket->ti lookups into one probe (deref-free: the +// hash is stored on the slot) without the total-sized per-bucket arrays that would add ~3GB to peak. +typedef struct +{ + U64 cap; // ~1.3x unique leaf count + CV_TypeIndex *ti_arr; // assigned type index per slot; 0 == empty (ti is always >= CV_MinComplexTypeIndex) + U64 *hash_arr; // occupant leaf hash (parallel to ti_arr); disambiguates open-addressing collisions +} LNK_AssignedTiHash; + typedef struct LNK_LeafRange { struct LNK_LeafRange *next; @@ -150,6 +170,7 @@ typedef struct LNK_CodeViewInput *input; CV_DebugS *debug_s_arr; LNK_LeafHashTable leaf_ht_arr[CV_TypeIndexSource_COUNT]; + LNK_AssignedTiHash assigned_ti_arr[CV_TypeIndexSource_COUNT]; Arena **fixed_arenas; CV_TypeIndexSource ti_source; U32Array indices; @@ -174,8 +195,6 @@ typedef struct U64 pass_idx; // assign type indices - U64 assigned_type_caps [CV_TypeIndexSource_COUNT]; - CV_TypeIndex *assigned_type_hts [CV_TypeIndexSource_COUNT]; CV_TypeIndex min_type_indices [CV_TypeIndexSource_COUNT]; LNK_LeafRefArray unique_leaf_refs_arr[CV_TypeIndexSource_COUNT]; @@ -251,11 +270,12 @@ internal B32 lnk_match_leaf_ref (LNK_CodeViewInput internal U64 lnk_hash_cv_leaf (LNK_CodeViewInput *input, LNK_LeafRef leaf_ref, CV_TypeIndexInfoList 
ti_info_list, B32 discard_cycles); internal void lnk_hash_cv_leaf_deep (Arena *arena, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref, CV_TypeIndexInfoList ti_info_list); internal LNK_LeafRef * lnk_leaf_hash_table_insert_or_update(LNK_LeafHashTable *leaf_ht, LNK_CodeViewInput *input, CV_DebugH *hashes, U64 hash, LNK_LeafRef *new_bucket); -internal LNK_LeafRef * lnk_leaf_hash_table_search (LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); +internal CV_TypeIndex lnk_leaf_hash_table_search_ti (LNK_AssignedTiHash *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); // returns assigned ti for leaf_ref's hash class, 0 if absent internal LNK_MergedTypes lnk_merge_types (TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK_MergeTypeFlags merge_flags); internal void lnk_replace_type_names_with_hashes (TP_Context *tp, TP_Arena *arena, U64 leaf_count, U8 **leaf_arr, LNK_TypeNameHashMode mode, U64 hash_length, String8 map_name); //////////////////////////////// // PDB +internal void lnk_gc_types (TP_Context *tp, Arena *arena, LNK_CodeViewInput *cv, LNK_MergedTypes *types); internal String8List lnk_build_pdb(TP_Context *tp, TP_Arena *tp_arena, String8 image_data, LNK_Config *config, LNK_SymbolTable *symtab, LNK_CodeViewInput *cv, LNK_MergedTypes cv_types, LNK_PDB_BuilderFlags builder_flags); diff --git a/src/linker/lnk_lib.h b/src/linker/lnk_lib.h index eed5e2dea..8e92e47a9 100644 --- a/src/linker/lnk_lib.h +++ b/src/linker/lnk_lib.h @@ -15,6 +15,24 @@ typedef struct LNK_Lib String8Array symbol_names; String8 long_names; U64 input_idx; + + // lib-search barrier elision: a re-search of this lib can only queue new members if the set of + // undefined/weak symbols grew, or if anti-dep searching was just enabled, since the last search. + // these record the state at the last search so identical re-searches can skip the tp dispatch. 
+ B32 was_searched; + B32 searched_anti_deps; + U64 searched_symbol_count; + + // lib-search FRONTIER cursor: search_chunks is append-only across the entire lib-search loop + // (it is only drained -> chunks AFTER the loop, in lnk_replace_weak_with_default_symbols), so an + // already-searched (symbol,lib) pair can never resolve a new member -- searching is a pure + // function of (lib, symbol->name) and the member queue dedup is idempotent. each worker records + // the position in its search_chunks[worker] list where this lib's last search ended; the next + // search rescans only the slots appended since. search_cursor_chunk[worker] == 0 means "start at + // list first" (nothing searched yet). reset to 0 when the anti-dep mode flips (anti-dep weak + // symbols must be re-tried under the new mode). + struct LNK_SymbolHashTrieChunk **search_cursor_chunk; // [worker_count], 0 = unsearched + U64 *search_cursor_idx; // [worker_count], intra-chunk resume index } LNK_Lib; typedef struct LNK_LibNode diff --git a/src/linker/lnk_obj.c b/src/linker/lnk_obj.c index c2f9649e9..25e5921a3 100644 --- a/src/linker/lnk_obj.c +++ b/src/linker/lnk_obj.c @@ -125,13 +125,16 @@ THREAD_POOL_TASK_FUNC(lnk_obj_initer) } // - // error check symbol table + // error check symbol table (+ memoize parsed symbols) // + LNK_ParsedSymbolLite *parsed_symbols = push_array(arena, LNK_ParsedSymbolLite, header.symbol_count); { COFF_SectionHeader *section_table = (COFF_SectionHeader *)str8_substr(input->data, header.section_table_range).str; COFF_ParsedSymbol symbol; for (U64 symbol_idx = 0; symbol_idx < header.symbol_count; symbol_idx += (1 + symbol.aux_symbol_count)) { symbol = coff_parse_symbol(header, raw_coff_string_table, raw_coff_symbol_table, symbol_idx); + U32 raw_off = symbol.raw_symbol ? 
safe_cast_u32((U8 *)symbol.raw_symbol - input->data.str) : 0; + parsed_symbols[symbol_idx] = (LNK_ParsedSymbolLite){ raw_off, safe_cast_u32(symbol.value), symbol.section_number, symbol.type, symbol.storage_class, symbol.aux_symbol_count }; COFF_SymbolValueInterpType interp = coff_interp_symbol(symbol.section_number, symbol.value, symbol.storage_class); if (interp == COFF_SymbolValueInterp_Regular) { if (symbol.section_number == 0 || symbol.section_number > header.section_count_no_null) { @@ -335,6 +338,7 @@ THREAD_POOL_TASK_FUNC(lnk_obj_initer) obj->path = push_str8_copy(arena, input->path); obj->header = header; obj->section_flags = section_flags; + obj->parsed_symbols = parsed_symbols; obj->comdats = comdats; obj->exclude_from_debug_info = input->exclude_from_debug_info; obj->hotpatch = hotpatch; @@ -584,8 +588,11 @@ lnk_obj_section_name_from_section_number(LNK_Obj *obj, U64 section_number) return lnk_obj_section_name_from_sect_idx(obj, section_number-1); } +// NOTE: skips section.name (coff_name_from_section_header does a string-table +// lookup); use when only the header/flags/ranges are needed (e.g. checking +// section flags). Callers that need the name should use lnk_obj_section_name_from_sect_idx. internal LNK_ObjSection -lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx) +lnk_obj_section_from_sect_idx_no_name(LNK_Obj *obj, U64 sect_idx) { Assert(sect_idx < obj->header.section_count_no_null); LNK_ObjSection section = {0}; @@ -600,6 +607,15 @@ lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx) return section; } +internal LNK_ObjSection +lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx) +{ + // Under upstream's lazy section-name model LNK_ObjSection has no `name` field + // (callers fetch it on demand via lnk_obj_section_name_from_sect_idx), so this + // is identical to the _no_name accessor.
+ return lnk_obj_section_from_sect_idx_no_name(obj, sect_idx); +} + internal LNK_ObjSection lnk_obj_section_from_section_number(LNK_Obj *obj, U64 section_number) { @@ -660,19 +676,35 @@ lnk_coff_section_header_from_section_number(LNK_Obj *obj, U64 section_number) return &section_table[sect_idx]; } +// NOTE: returns the memoized parse built in lnk_obj_initer. The struct is mutable: symbol-value +// patching (lnk_patch_*_task) writes value/section_number/storage_class here, NOT into the mmapped +// obj->data symbol table -- so obj->data symbol values are never written after load. raw_symbol still +// points into obj->data (read-only) for aux-record reads (coff_parse_weak_tag / coff_parse_secdef). internal COFF_ParsedSymbol -lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx) +lnk_parsed_symbol_from_coff_symbol_idx_no_name(LNK_Obj *obj, U64 symbol_idx) { - String8 string_table = str8_substr(obj->data, obj->header.string_table_range); - String8 symbol_table = str8_substr(obj->data, obj->header.symbol_table_range); - + LNK_ParsedSymbolLite *lite = &obj->parsed_symbols[symbol_idx]; COFF_ParsedSymbol result = {0}; - if (obj->header.is_big_obj) { - result = coff_parse_symbol32(string_table, (COFF_Symbol32 *)symbol_table.str + symbol_idx); - } else { - result = coff_parse_symbol16(string_table, (COFF_Symbol16 *)symbol_table.str + symbol_idx); + result.value = lite->value; + result.section_number = lite->section_number; + result.type = lite->type; + result.storage_class = lite->storage_class; + result.aux_symbol_count = lite->aux_symbol_count; + result.raw_symbol = lite->raw_symbol_off ? (obj->data.str + lite->raw_symbol_off) : 0; // offset -> ptr + return result; +} + +internal COFF_ParsedSymbol +lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx) +{ + COFF_ParsedSymbol result = lnk_parsed_symbol_from_coff_symbol_idx_no_name(obj, symbol_idx); + // name is excluded from the memo -- decode it from the (read-only) symbol record on demand.
Patching + // never touches the name, so re-deriving from raw_symbol stays correct after value/section patches. + if (result.raw_symbol) { + String8 string_table = str8_substr(obj->data, obj->header.string_table_range); + if (obj->header.is_big_obj) { result.name = coff_parse_symbol32(string_table, (COFF_Symbol32 *)result.raw_symbol).name; } + else { result.name = coff_parse_symbol16(string_table, (COFF_Symbol16 *)result.raw_symbol).name; } } - return result; } @@ -768,7 +800,8 @@ lnk_raw_directives_from_obj(Arena *arena, LNK_Obj *obj) { String8List drectve_data = {0}; for (U64 sect_idx = 0; sect_idx < obj->header.section_count_no_null; sect_idx += 1) { - LNK_ObjSection section = lnk_obj_section_from_sect_idx(obj, sect_idx); + // only LnkInfo sections (rare) need the name; skip the string-table lookup for the rest + LNK_ObjSection section = lnk_obj_section_from_sect_idx_no_name(obj, sect_idx); if (*section.flags & COFF_SectionFlag_LnkInfo) { String8 section_name = lnk_obj_section_name_from_sect_idx(obj, sect_idx); if (str8_match(section_name, str8_lit(".drectve"), 0)) { diff --git a/src/linker/lnk_obj.h b/src/linker/lnk_obj.h index 9a21de590..0d3d32c95 100644 --- a/src/linker/lnk_obj.h +++ b/src/linker/lnk_obj.h @@ -5,6 +5,33 @@ // --- Input ------------------------------------------------------------------- +// Slim memoized symbol parse: every COFF_ParsedSymbol field EXCEPT name (the 16B String8). Sized by +// total symbol count, so dropping name saves ~16B/sym of peak. The name is the cold path -- it is +// re-decoded from raw_symbol on demand in lnk_parsed_symbol_from_coff_symbol_idx (the named accessor); +// the hot _no_name accessor and all symbol-value patching never touch it. +typedef struct LNK_ParsedSymbolLite +{ + U32 raw_symbol_off; // byte offset of the COFF symbol record within obj->data (0 = none). + // stored as an offset, not a pointer, to keep this struct 16B. + U32 value; // COFF symbol value is U32 (section-relative offset / size / etc.) 
+ U32 section_number; + COFF_SymbolType type; // U16 + COFF_SymStorageClass storage_class; // U8 + U8 aux_symbol_count; +} LNK_ParsedSymbolLite; + +// /OPT:ICF static-COMDAT fold record (one per section, indexed by section_number-1). A static +// (internal-linkage) COMDAT has no external symbol to redirect, so instead of routing it through +// the symbol table we record its leader here: /OPT:REF marks the LEADER live (so the follower is +// dead-stripped with its associated .pdata/.xdata), and the final section map redirects any +// residual reloc to the leader's contrib. set==0 means the section is not folded. +typedef struct LNK_ICFFold +{ + U32 leader_obj_idx; // input_idx of the leader's obj + U32 leader_sn; // leader section number + B8 set; +} LNK_ICFFold; + typedef struct LNK_Obj { String8 path; @@ -12,6 +39,8 @@ typedef struct LNK_Obj COFF_FileHeaderInfo header; COFF_SectionFlags *section_flags; + LNK_ParsedSymbolLite *parsed_symbols; // memoized parse per symbol_idx (aux slots zeroed), name excluded. + // Mutable: symbol-value patching writes here, NOT obj->data. 
// flags B8 hotpatch; @@ -23,6 +52,7 @@ typedef struct LNK_Obj U32 *comdats; U32Node **associated_sections; LNK_SymbolHashTrie **symlinks; + LNK_ICFFold *icf_fold; // /OPT:ICF static-COMDAT fold map (per section, sn-1 indexed); 0 if ICF off // link struct LNK_LibMemberRef *link_member; @@ -145,11 +175,13 @@ internal LNK_Symbol * lnk_obj_get_comdat_symlink(LNK_Obj *obj, U64 section_n internal COFF_SectionHeader * lnk_coff_section_header_from_section_number(LNK_Obj *obj, U64 section_number); internal COFF_ParsedSymbol lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx); +internal COFF_ParsedSymbol lnk_parsed_symbol_from_coff_symbol_idx_no_name(LNK_Obj *obj, U64 symbol_idx); internal U64 lnk_obj_sect_idx_from_section_number(LNK_Obj *obj, U64 section_number); internal U64 lnk_obj_section_number_from_sect_idx(LNK_Obj *obj, U64 sect_idx); internal String8 lnk_obj_section_name_from_section_number(LNK_Obj *obj, U64 section_number); internal String8 lnk_obj_section_name_from_sect_idx(LNK_Obj *obj, U64 sect_idx); internal LNK_ObjSection lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx); +internal LNK_ObjSection lnk_obj_section_from_sect_idx_no_name(LNK_Obj *obj, U64 sect_idx); internal LNK_ObjSection lnk_obj_section_from_section_number(LNK_Obj *obj, U64 section_number); internal COFF_RelocArray lnk_coff_relocs_from_section_header(LNK_Obj *obj, COFF_SectionHeader *section_header); internal String8 lnk_coff_string_table_from_obj(LNK_Obj *obj); diff --git a/src/linker/lnk_symbol_table.c b/src/linker/lnk_symbol_table.c index 8cb6c4e5f..42b267a03 100644 --- a/src/linker/lnk_symbol_table.c +++ b/src/linker/lnk_symbol_table.c @@ -105,12 +105,14 @@ lnk_can_replace_symbol(LNK_Symbol *dst, LNK_Symbol *src) { B32 can_replace = 0; - COFF_ParsedSymbol dst_parsed = lnk_parsed_from_symbol(dst); - COFF_ParsedSymbol src_parsed = lnk_parsed_from_symbol(src); - COFF_SymbolValueInterpType dst_interp = lnk_interp_from_symbol(dst); - COFF_SymbolValueInterpType src_interp 
= lnk_interp_from_symbol(src); + // only scalar fields + raw_symbol are needed here (names come from dst->name/src->name); + // skip the symbol-name string-table scan and avoid re-parsing for interp LNK_ObjSymbolRef dst_ref = lnk_ref_from_symbol(dst); LNK_ObjSymbolRef src_ref = lnk_ref_from_symbol(src); + COFF_ParsedSymbol dst_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dst_ref.obj, dst_ref.symbol_idx); + COFF_ParsedSymbol src_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(src_ref.obj, src_ref.symbol_idx); + COFF_SymbolValueInterpType dst_interp = coff_interp_from_parsed_symbol(dst_parsed); + COFF_SymbolValueInterpType src_interp = coff_interp_from_parsed_symbol(src_parsed); LNK_Obj *dst_obj = dst_ref.obj; LNK_Obj *src_obj = src_ref.obj; @@ -332,9 +334,10 @@ lnk_can_replace_symbol(LNK_Symbol *dst, LNK_Symbol *src) internal void lnk_on_symbol_replace(LNK_Symbol *dst, LNK_Symbol *src) { - COFF_ParsedSymbol dst_parsed = lnk_parsed_from_symbol(dst); - COFF_SymbolValueInterpType dst_interp = lnk_interp_from_symbol(dst); + // only scalar fields are needed below, so skip the symbol-name string-table scan LNK_ObjSymbolRef dst_ref = lnk_ref_from_symbol(dst); + COFF_ParsedSymbol dst_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dst_ref.obj, dst_ref.symbol_idx); + COFF_SymbolValueInterpType dst_interp = coff_interp_from_parsed_symbol(dst_parsed); if (dst_interp == COFF_SymbolValueInterp_Regular) { // remove replaced section from the output @@ -531,7 +534,9 @@ lnk_parsed_from_symbol(LNK_Symbol *symbol) internal COFF_SymbolValueInterpType lnk_interp_from_symbol(LNK_Symbol *symbol) { - COFF_ParsedSymbol symbol_parsed = lnk_parsed_from_symbol(symbol); + // interp only needs scalar fields; skip the symbol-name string-table scan + LNK_ObjSymbolRef ref = lnk_ref_from_symbol(symbol); + COFF_ParsedSymbol symbol_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(ref.obj, ref.symbol_idx); return coff_interp_from_parsed_symbol(symbol_parsed); } @@ 
-556,6 +561,7 @@ lnk_symbol_table_push_(LNK_SymbolTable *symtab, Arena *arena, U64 worker_id, LNK { U64 hash = lnk_symbol_table_hasher(symbol->name); COFF_SymbolValueInterpType interp = lnk_interp_from_symbol(symbol); + symbol->interp = interp; // cache for the lib search hot loop LNK_SymbolHashTrieChunkList *chunks; if (interp == COFF_SymbolValueInterp_Weak || interp == COFF_SymbolValueInterp_Undefined) { chunks = &symtab->search_chunks[worker_id]; @@ -585,6 +591,21 @@ lnk_symbol_table_search(LNK_SymbolTable *symtab, String8 name) return trie ? trie->symbol : 0; } +internal U64 +lnk_symbol_table_search_symbol_count(LNK_SymbolTable *symtab) +{ + // total number of weak/undefined symbols inserted into search_chunks. this only ever grows + // during the lib-search loop (symbols are never removed mid-loop), so it serves as a monotonic + // version stamp for the set of symbols the lib search would scan. + U64 count = 0; + for EachIndex(worker_id, symtab->arena->count) { + for EachNode(c, LNK_SymbolHashTrieChunk, symtab->search_chunks[worker_id].first) { + count += c->count; + } + } + return count; +} + internal LNK_Symbol * lnk_symbol_table_searchf(LNK_SymbolTable *symtab, char *fmt, ...) 
{ @@ -772,17 +793,9 @@ THREAD_POOL_TASK_FUNC(lnk_replace_weak_with_default_symbol_task) COFF_SymbolValueInterpType resolve_interp = coff_interp_from_parsed_symbol(resolve_parsed); if (resolve_interp == COFF_SymbolValueInterp_Weak) { COFF_SymbolWeakExt *weak_ext = coff_parse_weak_tag(resolve_parsed, symbol_ref.obj->header.is_big_obj); - if (symbol_ref.obj->header.is_big_obj) { - COFF_Symbol32 *symbol32 = symbol_parsed.raw_symbol; - symbol32->section_number = COFF_Symbol_UndefinedSection; - symbol32->value = 0; - symbol32->storage_class = COFF_SymStorageClass_External; - } else { - COFF_Symbol16 *symbol16 = symbol_parsed.raw_symbol; - symbol16->section_number = COFF_Symbol_UndefinedSection; - symbol16->value = 0; - symbol16->storage_class = COFF_SymStorageClass_External; - } + symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].section_number = COFF_Symbol_UndefinedSection; + symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].value = 0; + symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].storage_class = COFF_SymStorageClass_External; } else { symbol->first_ref->v = resolve; } diff --git a/src/linker/lnk_symbol_table.h b/src/linker/lnk_symbol_table.h index da579bb1e..89caf3b8b 100644 --- a/src/linker/lnk_symbol_table.h +++ b/src/linker/lnk_symbol_table.h @@ -19,9 +19,10 @@ typedef struct LNK_ObjSymbolRefNode typedef struct LNK_Symbol { - String8 name; - LNK_ObjSymbolRefNode *first_ref; - LNK_ObjSymbolRefNode *last_ref; + String8 name; + LNK_ObjSymbolRefNode *first_ref; + LNK_ObjSymbolRefNode *last_ref; + COFF_SymbolValueInterpType interp; // cached at push so the lib search can skip re-parsing (and page-faulting) resolved symbols } LNK_Symbol; // --- Symbol Containers ------------------------------------------------------- @@ -124,6 +125,7 @@ internal U64 lnk_symbol_table_hasher(String8 string); internal LNK_SymbolTable * lnk_symbol_table_init(TP_Arena *arena); internal void lnk_symbol_table_push(LNK_SymbolTable *symtab, LNK_Symbol *symbol); internal 
LNK_Symbol * lnk_symbol_table_search(LNK_SymbolTable *symtab, String8 name); +internal U64 lnk_symbol_table_search_symbol_count(LNK_SymbolTable *symtab); internal LNK_Symbol * lnk_symbol_table_searchf(LNK_SymbolTable *symtab, char *fmt, ...); // --- Symbol Contrib Helpers -------------------------------------------------- diff --git a/src/linker/pdb_ext/pdb_builder.c b/src/linker/pdb_ext/pdb_builder.c index be6960122..3865bbf24 100644 --- a/src/linker/pdb_ext/pdb_builder.c +++ b/src/linker/pdb_ext/pdb_builder.c @@ -1783,7 +1783,11 @@ psi_addr_map_compar_is_before(void *raw_a, void *raw_b) } else if (a->isect_off.off != b->isect_off.off) { is_before = a->isect_off.off < b->isect_off.off; } else { - is_before = str8_compar_case_sensitive(&a->name, &b->name) < 0; + int cmp = str8_compar_case_sensitive(&a->name, &b->name); + // unique, element-stable tiebreaker (offset is unique per record and travels with the + // element across radsort swaps) so equal-key runs (many ICF-folded symbols at one + // address) do not degrade radsort's quicksort to O(n^2) + is_before = cmp != 0 ? (cmp < 0) : (a->offset < b->offset); } return is_before; @@ -1797,12 +1801,22 @@ gsi_record_sort_by_name(PDB_GsiSortRecord *arr, U64 count) ProfEnd(); } -internal void -gsi_record_sort_by_sc(PDB_GsiSortRecord *arr, U64 count) +// returns a record-index permutation sorted ascending by (isect, off) -- the PSI address map +// order. A stable parallel radix sort on the U64 (isect<<32|off) key; equal addresses (e.g. +// ICF-folded symbols) keep input order, which is fine for an address->symbol map. +internal U32 * +gsi_record_sort_by_sc(TP_Context *tp, Arena *arena, PDB_GsiSortRecord *arr, U64 count) { ProfBeginFunction(); - radsort(arr, count, psi_addr_map_compar_is_before); + U64 *keys = push_array_no_zero(arena, U64, count ? count : 1); + U32 *idx = push_array_no_zero(arena, U32, count ? 
count : 1); + for EachIndex(i, count) { + keys[i] = ((U64)arr[i].isect_off.isect << 32) | (U64)arr[i].isect_off.off; + idx[i] = (U32)i; + } + lnk_radix_sort_u64_pairs(tp, arena, count, keys, idx); ProfEnd(); + return idx; } internal @@ -1839,7 +1853,9 @@ gsi_symbol_is_before(void *raw_a, void *raw_b) } } } - is_before = cmp < 0; + // unique, element-stable tiebreaker (symbol pointer travels with the element across + // radsort swaps) prevents O(n^2) radsort on equal-key runs (ICF-folded symbols) + is_before = cmp != 0 ? (cmp < 0) : ((U64)a < (U64)b); } return is_before; @@ -1870,7 +1886,9 @@ gsi_pub_symbol_is_before(void *raw_a, void *raw_b) } } } - is_before = cmp < 0; + // unique, element-stable tiebreaker (symbol pointer travels with the element across + // radsort swaps) prevents O(n^2) radsort on equal-key runs (ICF-folded publics) + is_before = cmp != 0 ? (cmp < 0) : ((U64)a < (U64)b); } return is_before; @@ -2209,15 +2227,15 @@ psi_build(TP_Context *tp, PDB_PsiContext *psi, MSF_Context *msf, MSF_StreamNumbe ProfBegin("Address Map"); ProfBegin("Sort"); - gsi_record_sort_by_sc(gsi_build.sort_record_arr, gsi_build.hash_record_count); + U32 *sorted = gsi_record_sort_by_sc(tp, scratch.arena, gsi_build.sort_record_arr, gsi_build.hash_record_count); ProfEnd(); - + ProfBegin("Offset Fill"); U64 addr_map_count = gsi_build.hash_record_count; U64 addr_map_size = addr_map_count * sizeof(U32); U32 *addr_map = push_array_no_zero(scratch.arena, U32, addr_map_count); for (U64 i = 0; i < addr_map_count; i += 1) { - addr_map[i] = gsi_build.sort_record_arr[i].offset; + addr_map[i] = gsi_build.sort_record_arr[sorted[i]].offset; } ProfEnd(); @@ -2772,12 +2790,39 @@ dbi_build_sec_con(Arena *arena, PDB_DbiContext *dbi) // sort section contribs so they are binary searchable lnk_radix_sort_dbi_sc_array(sc_array, dbi->sec_contrib_list.count, dbi->section_list.count + 1); - + + // coalesce adjacent contributions that belong to the same module, section, and + // flags and are perfectly 
contiguous. The section contribution substream is only + // used to map an image address range back to its module, so merging contiguous + // same-module runs is loss-less (data/reloc CRCs are unused here, always 0). UE + // objects emit one COMDAT section per function, so after layout these collapse + // heavily -- matching link.exe, which stores one entry per contiguous run. + ProfBegin("Coalesce sect contribs"); + U64 sc_count = 0; + for (U64 r = 0; r < dbi->sec_contrib_list.count; r += 1) { + PDB_DbiSectionContrib *c = &sc_array[r]; + if (sc_count > 0) { + PDB_DbiSectionContrib *p = &sc_array[sc_count - 1]; + if (p->base.sec == c->base.sec && + p->base.mod == c->base.mod && + p->base.flags == c->base.flags && + (U64)c->base.sec_off >= (U64)p->base.sec_off) { + // absorb c into the run, extending over any alignment padding gap + U64 c_end = (U64)c->base.sec_off + (U64)c->base.size; + U64 p_end = (U64)p->base.sec_off + (U64)p->base.size; + if (c_end > p_end) { p->base.size = (U32)(c_end - (U64)p->base.sec_off); } + continue; + } + } + sc_array[sc_count++] = *c; + } + ProfEnd(); + // push section contrib info ProfBegin("List Push"); String8List sec_con_list = {0}; str8_list_push(arena, &sec_con_list, str8((U8*)version, sizeof(*version))); - str8_list_push(arena, &sec_con_list, str8((U8*)sc_array, sizeof(sc_array[0])*dbi->sec_contrib_list.count)); + str8_list_push(arena, &sec_con_list, str8((U8*)sc_array, sizeof(sc_array[0])*sc_count)); ProfEnd(); ProfEnd();