diff --git a/build.bat b/build.bat
index f636ea9cd..d4e797971 100644
--- a/build.bat
+++ b/build.bat
@@ -46,6 +46,7 @@ if "%spall%"=="1"                   set auto_compile_flags=%auto_compile_flags%
 if "%asan%"=="1"                    set auto_compile_flags=%auto_compile_flags% -fsanitize=address && echo [asan enabled]
 if "%ubsan%"=="1"                   set auto_compile_flags=%auto_compile_flags% -fsanitize=undefined && echo [ubsan enabled]
 if "%opengl%"=="1"                  set auto_compile_flags=%auto_compile_flags% -DR_BACKEND=R_BACKEND_OPENGL && echo [opengl render backend]
+if "%worklist_selfcheck%"=="1"      set auto_compile_flags=%auto_compile_flags% -DICF_WORKLIST_SELFCHECK=1 && echo [icf worklist selfcheck enabled]
 if "%dwarf%"=="1" if "%clang%"=="1" set auto_compile_flags=%auto_compile_flags% -gdwarf && echo [dwarf debug info]
 if "%dwarf%"==""  if "%clang%"=="1" set auto_compile_flags=%auto_compile_flags% -gcodeview
 if "%pgo%"=="1" (
@@ -142,7 +143,14 @@ pushd build
 if "%raddbg%"=="1"                     set didbuild=1 && %compile% ..\src\raddbg\raddbg_main.c                               %compile_link% %link_icon% %out%raddbg.exe || exit /b 1
 if "%raddbg_non_graphical%"=="1"       set didbuild=1 && %compile% -DWM_STUB=1 -DR_BACKEND=R_BACKEND_STUB ..\src\raddbg\raddbg_main.c %compile_link% %link_icon% %out%raddbg_non_graphical.exe || exit /b 1
 if "%com_shim%"=="1"                   set didbuild=1 && %compile% ..\src\com_shim\com_shim_main.c                           %compile_link% %out%com_shim.exe || exit /b 1
-if "%radlink%"=="1"                    set didbuild=1 && %compile% ..\src\linker\lnk.c                                       %compile_link% %linker% /NOIMPLIB %linker% /NATVIS:"%~dp0\src\linker\linker.natvis" %out%radlink.exe || exit /b 1
+:: NOTE: -DBLAKE3_ATOMICS=1 makes BLAKE3 use C11 _Atomic (plain atomic load) for
+:: get_cpu_features instead of MSVC's _InterlockedOr `lock or` barrier on every
+:: compress dispatch (was a ~5.5s main-thread hot spot). MSVC C11 atomics require
+:: /std:c11 /experimental:c11atomics. Kept external so the vendored blake3 source
+:: stays pristine.
+set radlink_msvc_flags=
+if "%msvc%"=="1" set radlink_msvc_flags=/std:c11 /experimental:c11atomics -DBLAKE3_ATOMICS=1
+if "%radlink%"=="1"                    set didbuild=1 && %compile% %radlink_msvc_flags% ..\src\linker\lnk.c                    %compile_link% %linker% /NOIMPLIB %linker% /NATVIS:"%~dp0\src\linker\linker.natvis" %out%radlink.exe || exit /b 1
 if "%radbin%"=="1"                     set didbuild=1 && %compile% ..\src\radbin\radbin_main.c                               %compile_link% %out%radbin.exe || exit /b 1
 if "%raddump%"=="1"                    set didbuild=1 && %compile% ..\src\raddump\raddump_main.c                             %compile_link% %out%raddump.exe || exit /b 1
 if "%ryan_scratch%"=="1"               set didbuild=1 && %compile% ..\src\scratch\ryan_scratch.c                             %compile_link% %out%ryan_scratch.exe || exit /b 1
diff --git a/src/base/base_arena.c b/src/base/base_arena.c
index f765f7005..bc7f34a8e 100644
--- a/src/base/base_arena.c
+++ b/src/base/base_arena.c
@@ -346,6 +346,54 @@ arena_pop_to(Arena *arena, U64 pos)
   
 }
 
+//- rjf: arena decommit of unused (rewound/free) pages
+
+internal void
+arena_decommit_unused(Arena *arena)
+{
+  // Hands committed-but-unused pages back to the OS while keeping the address
+  // reservation. Only the committed tail at or above each block's page-aligned
+  // `pos` is decommitted and `cmt` is lowered to match, so bytes below `pos` are
+  // left untouched.
+  if(arena->flags & ArenaFlag_LargePages)
+  {
+    return; // large pages cannot be partially decommitted safely; skip.
+  }
+  U64 page_size = get_system_info()->page_size;
+
+  // active chain: trim the committed tail above each block's page-aligned pos
+  for(Arena *block = arena->current; block != 0; block = block->prev)
+  {
+    U64 live_end = AlignPow2(block->pos, page_size);
+    if(live_end >= block->cmt)
+    {
+      continue;
+    }
+    U8 *tail_ptr  = (U8 *)block + live_end;
+    U64 tail_size = block->cmt - live_end;
+    AsanPoisonMemoryRegion(tail_ptr, tail_size);
+    decommit_memory(tail_ptr, tail_size);
+    block->cmt = live_end;
+  }
+
+#if ARENA_FREE_LIST
+  // free chain: keep only the page-aligned header region of each block committed
+  U64 header_end = AlignPow2(ARENA_HEADER_SIZE, page_size);
+  for(Arena *block = arena->free_last; block != 0; block = block->prev)
+  {
+    if(header_end >= block->cmt)
+    {
+      continue;
+    }
+    U8 *tail_ptr  = (U8 *)block + header_end;
+    U64 tail_size = block->cmt - header_end;
+    AsanPoisonMemoryRegion(tail_ptr, tail_size);
+    decommit_memory(tail_ptr, tail_size);
+    block->cmt = header_end;
+  }
+#endif
+}
+
 //- rjf: arena push/pop helpers
 
 internal void
diff --git a/src/base/base_arena.h b/src/base/base_arena.h
index b3ba37827..a61f5bd3f 100644
--- a/src/base/base_arena.h
+++ b/src/base/base_arena.h
@@ -76,6 +76,9 @@ internal void *arena_push(Arena *arena, U64 size, U64 align, B32 zero);
 internal U64   arena_pos(Arena *arena);
 internal void  arena_pop_to(Arena *arena, U64 pos);
 
+//- rjf: arena decommit of unused (rewound/free) pages
+internal void arena_decommit_unused(Arena *arena);
+
 //- rjf: arena push/pop helpers
 internal void arena_clear(Arena *arena);
 internal void arena_pop(Arena *arena, U64 amt);
diff --git a/src/base/base_thread_context.c b/src/base/base_thread_context.c
index 421636101..d42be68ed 100644
--- a/src/base/base_thread_context.c
+++ b/src/base/base_thread_context.c
@@ -82,6 +82,21 @@ tctx_get_scratch(Arena **conflicts, U64 count)
   return result;
 }
 
+//- rjf: scratch decommit (release committed-but-unused scratch pages back to OS)
+
+internal void
+tctx_scratch_decommit(void)
+{
+  // decommit unused committed pages of every non-null arena slot in the selected thread context
+  TCTX *selected = tctx_selected();
+  for(U64 slot = 0; slot < ArrayCount(selected->arenas); slot += 1)
+  {
+    Arena *scratch_arena = selected->arenas[slot];
+    if(scratch_arena == 0) { continue; }
+    arena_decommit_unused(scratch_arena);
+  }
+}
+
 //- rjf: lane metadata
 
 internal LaneCtx
diff --git a/src/base/base_thread_context.h b/src/base/base_thread_context.h
index 494d1804f..e52b36476 100644
--- a/src/base/base_thread_context.h
+++ b/src/base/base_thread_context.h
@@ -90,6 +90,7 @@ internal TCTX *tctx_selected(void);
 
 //- rjf: scratch arenas
 internal Arena *tctx_get_scratch(Arena **conflicts, U64 count);
+internal void tctx_scratch_decommit(void);
 #define scratch_begin(conflicts, count) temp_begin(tctx_get_scratch((conflicts), (count)))
 #define scratch_end(scratch) temp_end(scratch)
 
diff --git a/src/coff/coff_parse.c b/src/coff/coff_parse.c
index 1db0b5826..85835b977 100644
--- a/src/coff/coff_parse.c
+++ b/src/coff/coff_parse.c
@@ -166,11 +166,16 @@ coff_section_header_array_from_name(Arena *arena, String8 string_table, COFF_Sec
 }
 
 
+// NOTE: name-skipping variants. coff_read_symbol_name does a cstr scan over the
+// memory-mapped string table, which is the dominant cost when parsing symbols in
+// bulk; callers that only need the scalar fields (value/section/storage_class/aux,
+// e.g. symbol-value interpretation) should use these to avoid that scan. The full
+// coff_parse_symbol{16,32} below are these plus the name read, so the scalar-field
+// logic lives in exactly one place.
 internal COFF_ParsedSymbol
-coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32)
+coff_parse_symbol32_no_name(COFF_Symbol32 *sym32)
 {
   COFF_ParsedSymbol result = {0};
-  result.name              = coff_read_symbol_name(string_table, &sym32->name);
   result.value             = sym32->value;
   result.section_number    = sym32->section_number;
   result.type              = sym32->type;
@@ -181,10 +186,9 @@ coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32)
 }
 
 internal COFF_ParsedSymbol
-coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16)
+coff_parse_symbol16_no_name(COFF_Symbol16 *sym16)
 {
   COFF_ParsedSymbol result = {0};
-  result.name              = coff_read_symbol_name(string_table, &sym16->name);
   result.value             = sym16->value;
   if (sym16->section_number == COFF_Symbol_DebugSection16) {
     result.section_number = COFF_Symbol_DebugSection32;
@@ -200,6 +204,22 @@ coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16)
   return result;
 }
 
+internal COFF_ParsedSymbol
+coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32)
+{
+  COFF_ParsedSymbol parsed = coff_parse_symbol32_no_name(sym32);   // scalar fields only
+  parsed.name = coff_read_symbol_name(string_table, &sym32->name); // then the name read
+  return parsed;
+}
+
+internal COFF_ParsedSymbol
+coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16)
+{
+  COFF_ParsedSymbol parsed = coff_parse_symbol16_no_name(sym16);   // scalar fields only
+  parsed.name = coff_read_symbol_name(string_table, &sym16->name); // then the name read
+  return parsed;
+}
+
 internal COFF_ParsedSymbol
 coff_parse_symbol(COFF_FileHeaderInfo header, String8 string_table, String8 symbol_table, U32 symbol_idx)
 {
diff --git a/src/coff/coff_parse.h b/src/coff/coff_parse.h
index edcb29520..028e292f5 100644
--- a/src/coff/coff_parse.h
+++ b/src/coff/coff_parse.h
@@ -261,6 +261,8 @@ internal String8                 coff_name_from_section_header      (String8 str
 
 internal COFF_ParsedSymbol coff_parse_symbol32(String8 string_table, COFF_Symbol32 *sym32);
 internal COFF_ParsedSymbol coff_parse_symbol16(String8 string_table, COFF_Symbol16 *sym16);
+internal COFF_ParsedSymbol coff_parse_symbol32_no_name(COFF_Symbol32 *sym32);
+internal COFF_ParsedSymbol coff_parse_symbol16_no_name(COFF_Symbol16 *sym16);
 internal COFF_ParsedSymbol coff_parse_symbol  (COFF_FileHeaderInfo header, String8 string_table, String8 symbol_table, U32 symbol_idx);
 
 internal COFF_Symbol32Array coff_symbol_array_from_data_16(Arena *arena, String8 data, U64 symbol_array_off, U64 symbol_count);
diff --git a/src/linker/codeview_ext/ifc.c b/src/linker/codeview_ext/ifc.c
new file mode 100644
index 000000000..e5693740f
--- /dev/null
+++ b/src/linker/codeview_ext/ifc.c
@@ -0,0 +1,128 @@
+// Copyright (c) Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+read_only global U8 g_ifc_signature[4] = { 0x54, 0x51, 0x45, 0x1A };
+read_only global U8 g_uba_signature[4] = { 0x55, 0x42, 0x41, 0x01 }; // "UBA\x01"
+
+// Reads the .ifc at `path`, validates signature/version/arch, and locates the
+// `.msvc.trait.debug-records` partition inside it.
+//
+// arena     - receives the path copy and any error message; also passed to the file read
+// path      - file to read
+// error_out - written (never read) on failure; must be non-null
+//
+// Returns an IFC_File whose `is_valid` is 1 only when the partition was found, lies
+// inside the file, and is non-empty. On every failure `is_valid` stays 0 and
+// *error_out describes the problem.
+internal IFC_File
+ifc_file_read(Arena *arena, String8 path, String8 *error_out)
+{
+  IFC_File ifc = {0};
+  ifc.path = push_str8_copy(arena, path);
+
+  String8 data = lnk_read_data_from_file_path(arena, 0, path);
+  ifc.data = data;
+  if (data.size < 4) {
+    *error_out = push_str8f(arena, "IFC '%S' is too small (%llu bytes)", path, data.size);
+    return ifc;
+  }
+
+  // detect UBA-compressed input (out of scope)
+  if (MemoryMatch(data.str, g_uba_signature, sizeof(g_uba_signature))) {
+    *error_out = push_str8f(arena, "IFC '%S' is UBA-compressed (magic 'UBA\\x01'); materialize a raw .ifc (UBA decompress unsupported)", path);
+    return ifc;
+  }
+
+  // validate signature
+  if ( ! MemoryMatch(data.str, g_ifc_signature, sizeof(g_ifc_signature))) {
+    *error_out = push_str8f(arena, "IFC '%S' has bad signature (expected 54 51 45 1A)", path);
+    return ifc;
+  }
+
+  // --- parse header ---
+  U64 off = 4;
+  if (off + 32 > data.size) { goto truncated; }
+  MemoryCopy(ifc.content_hash, data.str + off, 32);
+  off += 32;
+
+  if (off + 4 > data.size) { goto truncated; }
+  U8 major = data.str[off+0];
+  U8 minor = data.str[off+1];
+  U8 abi   = data.str[off+2]; (void)abi;
+  U8 arch  = data.str[off+3];
+  off += 4;
+
+  // assert version 0.44 + x64; error otherwise (encoding proven only for these)
+  if ( ! (major == 0 && minor == 44)) {
+    *error_out = push_str8f(arena, "IFC '%S' unsupported version %u.%u (expected 0.44)", path, major, minor);
+    return ifc;
+  }
+  if (arch != 2) {
+    *error_out = push_str8f(arena, "IFC '%S' unsupported architecture %u (expected 2 == x64)", path, arch);
+    return ifc;
+  }
+
+  // The eight U32 header fields that follow span 32 bytes. Bounds-check them before
+  // reading rather than inferring truncation from the read helper's return values,
+  // and zero-initialize them so no field can be consumed uninitialized.
+  if (off + 8*sizeof(U32) > data.size) { goto truncated; }
+  U32 cplusplus          = 0; off += str8_deserial_read_struct(data, off, &cplusplus); (void)cplusplus;
+  U32 string_table_bytes = 0; off += str8_deserial_read_struct(data, off, &string_table_bytes);
+  U32 string_table_size  = 0; off += str8_deserial_read_struct(data, off, &string_table_size);
+  U32 unit               = 0; off += str8_deserial_read_struct(data, off, &unit); (void)unit;
+  U32 src_path           = 0; off += str8_deserial_read_struct(data, off, &src_path); (void)src_path;
+  U32 global_scope       = 0; off += str8_deserial_read_struct(data, off, &global_scope); (void)global_scope;
+  U32 toc                = 0; off += str8_deserial_read_struct(data, off, &toc);
+  U32 partition_count    = 0; off += str8_deserial_read_struct(data, off, &partition_count);
+
+  // string table
+  if ((U64)string_table_bytes + string_table_size > data.size) {
+    *error_out = push_str8f(arena, "IFC '%S' string table out of bounds", path);
+    return ifc;
+  }
+  String8 string_table = str8(data.str + string_table_bytes, string_table_size);
+
+  // --- partition summary table ---
+  String8 needle = str8_lit(".msvc.trait.debug-records");
+  U64     po     = toc;
+  for (U32 i = 0; i < partition_count; ++i, po += 16) {
+    if (po + 16 > data.size) { goto truncated; }
+    U32 name_off = 0, p_off = 0, count = 0, entity_size = 0;
+    str8_deserial_read_struct(data, po + 0,  &name_off);
+    str8_deserial_read_struct(data, po + 4,  &p_off);
+    str8_deserial_read_struct(data, po + 8,  &count);
+    str8_deserial_read_struct(data, po + 12, &entity_size);
+
+    if (name_off >= string_table.size) { continue; }
+
+    // bounded scan: a name that is missing its NUL terminator must stop at the end
+    // of the string table instead of running past it (and past the file buffer)
+    U8 *name_ptr  = string_table.str + name_off;
+    U64 name_max  = string_table.size - name_off;
+    U64 name_size = 0;
+    while (name_size < name_max && name_ptr[name_size] != 0) { name_size += 1; }
+    String8 name = str8(name_ptr, name_size);
+
+    if (str8_match(name, needle, 0)) {
+      // entity_size == 1 -> count is a byte length
+      if ((U64)p_off + count > data.size) {
+        *error_out = push_str8f(arena, "IFC '%S' debug-records partition out of bounds", path);
+        return ifc;
+      }
+      ifc.debug_records = str8(data.str + p_off, count);
+      break;
+    }
+  }
+
+  if (ifc.debug_records.size == 0) {
+    *error_out = push_str8f(arena, "IFC '%S' has no '.msvc.trait.debug-records' partition", path);
+    return ifc;
+  }
+
+  ifc.is_valid = 1;
+  return ifc;
+
+truncated:
+  *error_out = push_str8f(arena, "IFC '%S' is truncated", path);
+  return ifc;
+}
diff --git a/src/linker/codeview_ext/ifc.h b/src/linker/codeview_ext/ifc.h
new file mode 100644
index 000000000..fb3169b75
--- /dev/null
+++ b/src/linker/codeview_ext/ifc.h
@@ -0,0 +1,43 @@
+// Copyright (c) Epic Games Tools
+// Licensed under the MIT license (https://opensource.org/license/mit/)
+
+#pragma once
+
+////////////////////////////////
+// MSVC IFC (header-unit module interface) reader
+//
+// radlink consumes the `.msvc.trait.debug-records` partition embedded in an
+// MSVC `.ifc` (C++20 module / header-unit interface) file. That partition is a
+// raw CodeView type-leaf stream (entity_size == 1, NO u32 signature, first leaf
+// at offset 0, TI base 0x1000). A consuming `.obj` references this stream via
+// LF_IFC_RECORD (0x1522) leaves -- see lnk_debug_info.c.
+//
+// File layout (microsoft/ifc-spec, FileHeader):
+//   u8[4]  signature = { 0x54,0x51,0x45,0x1A }   ("TQE\x1a")
+//   u8[32] content_hash (sha256)  -- record.GUID(16)++record.hash(16) == first 32 bytes here
+//   u8     major, minor           -- assert 0.44
+//   u8     abi
+//   u8     arch                   -- 2 == x64
+//   u32    cplusplus
+//   u32    string_table_bytes (off), u32 string_table_size
+//   u32    unit
+//   u32    src_path (textoffset)
+//   u32    global_scope
+//   u32    toc                    -- offset to partition summary table
+//   u32    partition_count
+//   u8     internal_partition
+// Partition summary entry (16 bytes): { u32 name(textoffset); u32 offset; u32 count; u32 entity_size }
+
+typedef struct IFC_File
+{
+  String8 data;          // whole .ifc bytes as returned by the file read in ifc_file_read
+  String8 path;          // .ifc path (copied)
+  U8      content_hash[32]; // the 32 bytes that follow the 4-byte signature (sha256 per the layout above)
+  String8 debug_records; // .msvc.trait.debug-records blob {ptr,size}, a sub-range of `data`; size 0 if absent
+  B32     is_valid;      // 1 only when ifc_file_read passed every validation step, else 0
+} IFC_File;
+
+// Reads `path`, validates magic/version/arch, locates `.msvc.trait.debug-records`.
+// On error fills *error_out and returns is_valid=0. Detects UBA-compressed inputs
+// (magic "UBA\x01") and reports them (decompression is out of scope).
+internal IFC_File ifc_file_read(Arena *arena, String8 path, String8 *error_out);
diff --git a/src/linker/lnk.c b/src/linker/lnk.c
index 2850b08b9..c37033e23 100644
--- a/src/linker/lnk.c
+++ b/src/linker/lnk.c
@@ -78,6 +78,8 @@
 #include "pdb_ext/msf_builder.c"
 #include "pdb_ext/pdb.c"
 #include "pdb_ext/pdb_helpers.c"
+// fwd decl: parallel radix sort (defined later in this TU) so pdb_builder.c can use it
+internal void lnk_radix_sort_u64_pairs(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *vals);
 #include "pdb_ext/pdb_builder.c"
 
 // --- RDI ---------------------------------------------------------------------
@@ -110,6 +112,7 @@
 #include "lnk_debug_helper.h"
 #include "lnk_obj.h"
 #include "lnk_lib.h"
+#include "codeview_ext/ifc.h"
 #include "lnk_debug_info.h"
 #include "lnk.h"
 
@@ -123,6 +126,7 @@
 #include "lnk_obj.c"
 #include "lnk_debug_helper.c"
 #include "lnk_lib.c"
+#include "codeview_ext/ifc.c"
 #include "lnk_debug_info.c"
 
 // -----------------------------------------------------------------------------
@@ -1641,19 +1645,35 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task)
   LNK_LibMemberInfo    *lib_member_infos = task->lib_member_infos;
   LNK_LibMemberRefList *member_ref_list  = &task->member_ref_lists[task_id];
 
-  for EachNode(c, LNK_SymbolHashTrieChunk, symtab->search_chunks[task_id].first) {
-    for EachIndex(i, c->count) {
+  // FRONTIER cursor: resume from where this worker last left off for this lib. search_chunks is
+  // append-only across the lib-search loop, so slots before the cursor were already searched and
+  // (search being a pure fn of (lib, symbol->name), member-queue dedup idempotent) can never resolve
+  // a new member. reset_cursor (anti-dep mode flip) forces a full rescan from list first.
+  LNK_SymbolHashTrieChunk *start_chunk = task->reset_cursor ? 0 : lib->search_cursor_chunk[task_id];
+  U64                      start_idx   = task->reset_cursor ? 0 : lib->search_cursor_idx[task_id];
+
+  LNK_SymbolHashTrieChunk *first_chunk = start_chunk ? start_chunk : symtab->search_chunks[task_id].first;
+  LNK_SymbolHashTrieChunk *end_chunk   = symtab->search_chunks[task_id].last;
+  U64                      end_count    = end_chunk ? end_chunk->count : 0;
+
+  for (LNK_SymbolHashTrieChunk *c = first_chunk; c != 0; c = c->next) {
+    U64 i_begin = (c == start_chunk) ? start_idx : 0;
+    U64 i_end   = c->count;
+    for (U64 i = i_begin; i < i_end; i += 1) {
       LNK_Symbol *symbol = c->v[i].symbol;
 
-      LNK_ObjSymbolRef           symbol_ref    = lnk_ref_from_symbol(symbol);
-      COFF_ParsedSymbol          symbol_parsed = lnk_parsed_from_symbol(symbol);
-      COFF_SymbolValueInterpType symbol_interp = coff_interp_from_parsed_symbol(symbol_parsed);
+      // interp is cached on the symbol at push time, so the common case (resolved symbols, which
+      // stay in search_chunks but can never match a lib) costs one field read instead of re-parsing
+      // -- and page-faulting -- the COFF symbol record out of the mmap'd obj on every lib pass.
+      COFF_SymbolValueInterpType symbol_interp = symbol->interp;
       if (symbol_interp == COFF_SymbolValueInterp_Undefined) {
         U32 member_idx;
         if (lnk_search_lib(lib, symbol->name, &member_idx)) {
           lnk_queue_lib_member(arena, task->imports_hm, task->link->lib_member_infos_hm, member_ref_list, symbol, lib, lib_member_infos, member_idx);
         }
       } else if (symbol_interp == COFF_SymbolValueInterp_Weak) {
+        LNK_ObjSymbolRef   symbol_ref    = lnk_ref_from_symbol(symbol);
+        COFF_ParsedSymbol  symbol_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(symbol_ref.obj, symbol_ref.symbol_idx);
         COFF_SymbolWeakExt *weak_ext = coff_parse_weak_tag(symbol_parsed, symbol_ref.obj->header.is_big_obj);
         if (weak_ext->characteristics == COFF_WeakExt_SearchLibrary) {
           U32 member_idx;
@@ -1664,11 +1684,11 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task)
           if (search_anti_deps) {
             LNK_ObjSymbolRef dep_symbol = {0};
             if (lnk_resolve_weak_symbol(symtab, symbol_ref, &dep_symbol)) {
-              COFF_ParsedSymbol          dep_parsed = lnk_parsed_symbol_from_coff_symbol_idx(dep_symbol.obj, dep_symbol.symbol_idx);
+              COFF_ParsedSymbol          dep_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dep_symbol.obj, dep_symbol.symbol_idx);
               COFF_SymbolValueInterpType dep_interp = coff_interp_from_parsed_symbol(dep_parsed);
               if (dep_interp == COFF_SymbolValueInterp_Weak) {
                 U32 member_idx;
-                if (lnk_search_lib(lib, symbol_parsed.name, &member_idx)) {
+                if (lnk_search_lib(lib, symbol->name, &member_idx)) {
                   lnk_queue_lib_member(arena, task->imports_hm, task->link->lib_member_infos_hm, member_ref_list, symbol, lib, lib_member_infos, member_idx);
                 }
               }
@@ -1678,6 +1698,13 @@ THREAD_POOL_TASK_FUNC(lnk_search_lib_task)
       }
     }
   }
+
+  // advance cursor to the end of search_chunks as observed at the start of this scan. the tp dispatch
+  // is a barrier and load_inputs runs serially between dispatches, so no concurrent append occurs;
+  // we stamp the (end_chunk, end_count) snapshot taken before the loop so any slot appended after the
+  // snapshot is rescanned next time, never skipped.
+  lib->search_cursor_chunk[task_id] = end_chunk;
+  lib->search_cursor_idx[task_id]   = end_count;
 }
 
 internal LNK_Lib *
@@ -1816,16 +1843,43 @@ lnk_link_inputs(TP_Context      *tp,
         } else {
           // search symbols in lib
           MemoryZeroTyped(member_ref_lists, tp->worker_count);
-          LNK_SearchLibTask search_task = {
-            .search_anti_deps = search_anti_deps,
-            .link             = link,
-            .imports_hm       = &imports_hm,
-            .lib              = lib,
-            .symtab           = symtab,
-            .lib_member_infos = lib_member_infos,
-            .member_ref_lists = member_ref_lists
-          };
-          tp_for_parallel(tp, arena, tp->worker_count, lnk_search_lib_task, &search_task);
+
+          // barrier elision: the search task scans every undefined/weak symbol in search_chunks and
+          // queues any member that resolves one. search_chunks only grows during this loop and the
+          // dedup against already-queued members is idempotent, so if neither the symbol set nor the
+          // anti-dep mode changed since this lib was last searched, the dispatch can only re-queue
+          // (deduped to) nothing. skipping it avoids waking+joining every worker for no work.
+          U64 search_symbol_count = lnk_symbol_table_search_symbol_count(symtab);
+          B32 can_skip_search = lib->was_searched &&
+                                lib->searched_symbol_count == search_symbol_count &&
+                                lib->searched_anti_deps    == search_anti_deps;
+          if ( ! can_skip_search) {
+            // lazily alloc the per-worker frontier cursor arrays (cleared = unsearched/start).
+            if (lib->search_cursor_chunk == 0) {
+              lib->search_cursor_chunk = push_array(link->arena, LNK_SymbolHashTrieChunk *, tp->worker_count);
+              lib->search_cursor_idx   = push_array(link->arena, U64,                       tp->worker_count);
+            }
+
+            // anti-dep mode flipped since last search of this lib: anti-dep weak symbols before the
+            // cursor were skipped under the old mode, so they must be re-tried -> rescan from start.
+            B32 reset_cursor = lib->was_searched && lib->searched_anti_deps != search_anti_deps;
+
+            LNK_SearchLibTask search_task = {
+              .search_anti_deps = search_anti_deps,
+              .reset_cursor     = reset_cursor,
+              .link             = link,
+              .imports_hm       = &imports_hm,
+              .lib              = lib,
+              .symtab           = symtab,
+              .lib_member_infos = lib_member_infos,
+              .member_ref_lists = member_ref_lists
+            };
+            tp_for_parallel(tp, arena, tp->worker_count, lnk_search_lib_task, &search_task);
+
+            lib->was_searched          = 1;
+            lib->searched_symbol_count = search_symbol_count;
+            lib->searched_anti_deps    = search_anti_deps;
+          }
         }
 
         LNK_LibMemberRefList queued_members = {0};
@@ -2405,6 +2459,13 @@ lnk_link_image(TP_Context *tp, TP_Arena *arena, LNK_Config *config, LNK_Inputer
     ProfEnd();
   }
 
+  //
+  // fold identical code COMDATs (routes followers to a leader; /OPT:REF then GCs them)
+  //
+  if (config->opt_icf == LNK_SwitchState_Yes) {
+    lnk_opt_icf(tp, arena->v[0], symtab, config, link->objs);
+  }
+
   //
   // discard COMDAT sections that are not referenced
   //
@@ -2517,6 +2578,7 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
   LNK_ObjList      objs   = task->objs;
 
   U8                    **is_live             = 0;
+  LNK_Obj               **objs_by_idx         = 0; // input_idx -> obj, for /OPT:ICF static-fold redirect
   U64                    *active_thread_count = 0;
   LNK_RelocRefsBatchList  *global_batch_list   = 0;
   if (task_id == 0) {
@@ -2524,11 +2586,13 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
     global_batch_list   = push_array(scratch.arena, LNK_RelocRefsBatchList, 1);
 
     // alloc live flags and set live status on every non-COMDAT section
-    is_live = push_array_no_zero(scratch.arena, U8 *, objs.count);
+    is_live     = push_array_no_zero(scratch.arena, U8 *,      objs.count);
+    objs_by_idx = push_array_no_zero(scratch.arena, LNK_Obj *, objs.count ? objs.count : 1);
     {
       U64 obj_idx = 0;
       for EachNode(n, LNK_ObjNode, task->objs.first) {
-        is_live[obj_idx] = push_array(scratch.arena, U8, n->data.header.section_count_no_null + 1);
+        is_live[obj_idx]     = push_array(scratch.arena, U8, n->data.header.section_count_no_null + 1);
+        objs_by_idx[obj_idx] = &n->data;
 
         for EachIndex(sect_idx, n->data.header.section_count_no_null) {
           is_live[obj_idx][sect_idx + 1] = !(n->data.section_flags[sect_idx] & COFF_SectionFlag_LnkCOMDAT);
@@ -2589,6 +2653,7 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
     }
   }
   tp_broadcast(&is_live);
+  tp_broadcast(&objs_by_idx);
   tp_broadcast(&global_batch_list);
   tp_broadcast(&active_thread_count);
 
@@ -2652,32 +2717,86 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
           if (ref_interp == COFF_SymbolValueInterp_Regular) {
             Temp temp = temp_begin(scratch2.arena);
 
+            // /OPT:ICF static-COMDAT redirect: if the referenced section was folded into a leader,
+            // mark the LEADER section live (and walk the LEADER's relocs/associated sections) instead
+            // of the dead follower. The follower then dead-strips, taking its .pdata/.xdata with it.
+            LNK_Obj *walk_obj      = ref_symbol.obj;
+            U32      seed_sn       = ref_parsed.section_number;
+            if (walk_obj->icf_fold && seed_sn != 0 && seed_sn <= walk_obj->header.section_count_no_null &&
+                walk_obj->icf_fold[seed_sn - 1].set) {
+              LNK_ICFFold fold = walk_obj->icf_fold[seed_sn - 1];
+              walk_obj = objs_by_idx[fold.leader_obj_idx];
+              seed_sn  = fold.leader_sn;
+            }
+
             HashMap visited_sections_hm = {0};
             U32Node *stack = push_array(temp.arena, U32Node, 1);
-            stack->data    = ref_parsed.section_number;
+            stack->data    = seed_sn;
             do {
               U32 section_number = stack->data;
               SLLStackPop(stack);
 
               // is section number valid?
-              if (section_number == 0 || section_number > ref_symbol.obj->header.section_count_no_null) { continue; }
+              if (section_number == 0 || section_number > walk_obj->header.section_count_no_null) { continue; }
 
               // detect cyclic associative sections
               if (hash_map_search_u64_u64(&visited_sections_hm, section_number)) { continue; }
               hash_map_push_u64_u64(temp.arena, &visited_sections_hm, section_number, 1);
 
               // push associated section
-              for EachNode(associated_n, U32Node, ref_symbol.obj->associated_sections[section_number]) {
-                if (hash_map_search_u64_u64(&visited_sections_hm, associated_n->data)) { continue; }
+              for EachNode(associated_n, U32Node, walk_obj->associated_sections[section_number]) {
+                U32 assoc_sn = associated_n->data;
+
+                // /OPT:ICF static-COMDAT: an associative section can itself be a folded static
+                // follower (e.g. an associative .text$ funclet/thunk). Keeping the follower live here
+                // would defeat the fold and leave it with relocs into removed targets. Instead skip
+                // the follower and mark its LEADER live (cross-obj), so the follower dead-strips while
+                // its address still resolves to the identical leader.
+                if (walk_obj->icf_fold && assoc_sn >= 1 && assoc_sn <= walk_obj->header.section_count_no_null &&
+                    walk_obj->icf_fold[assoc_sn - 1].set) {
+                  LNK_ICFFold afold      = walk_obj->icf_fold[assoc_sn - 1];
+                  LNK_Obj    *leader_obj = objs_by_idx[afold.leader_obj_idx];
+                  U32         leader_sn  = afold.leader_sn;
+                  // mark the leader .text and each of its associative sections (.pdata/.xdata) live,
+                  // enqueuing each eligible one's relocs so their targets (handler .text, .xdata) are
+                  // kept live too -- exactly as a normal seed-walk of the leader would do.
+                  U32 leader_sns[64];
+                  U32 leader_sn_count = 0;
+                  leader_sns[leader_sn_count++] = leader_sn;
+                  for EachNode(lan, U32Node, leader_obj->associated_sections[leader_sn]) {
+                    if (leader_sn_count < ArrayCount(leader_sns)) { leader_sns[leader_sn_count++] = lan->data; }
+                  }
+                  for EachIndex(lsi, leader_sn_count) {
+                    U32 lsn = leader_sns[lsi];
+                    if (lsn == 0 || lsn > leader_obj->header.section_count_no_null) { continue; }
+                    U8 lseen = ins_atomic_u8_eval_assign(&is_live[leader_obj->input_idx][lsn], 1);
+                    if (lseen) { continue; }
+                    COFF_SectionFlags lflags = leader_obj->section_flags[lsn - 1];
+                    if (lflags & (COFF_SectionFlag_LnkRemove | COFF_SectionFlag_LnkInfo | LNK_SECTION_FLAG_DEBUG)) { continue; }
+                    LNK_RelocRefs lrefs = {0};
+                    lrefs.obj    = leader_obj;
+                    lrefs.relocs = lnk_coff_reloc_info_from_section_number(leader_obj, lsn);
+                    LNK_RelocRefsBatch *lbatch = last_batch;
+                    if (last_batch == 0 || last_batch->count >= ArrayCount(last_batch->v)) {
+                      if (free_list.head.node) { lbatch = lnk_reloc_ref_batch_list_pop(&free_list); MemoryZeroStruct(lbatch); }
+                      else                     { lbatch = push_array(scratch.arena, LNK_RelocRefsBatch, 1); }
+                      SLLQueuePush(first_batch, last_batch, lbatch);
+                    }
+                    lbatch->v[lbatch->count++] = lrefs;
+                  }
+                  continue; // do not mark the follower live
+                }
+
+                if (hash_map_search_u64_u64(&visited_sections_hm, assoc_sn)) { continue; }
                 U32Node *stack_n = push_array(temp.arena, U32Node, 1);
-                stack_n->data = associated_n->data;
+                stack_n->data = assoc_sn;
                 SLLStackPush(stack, stack_n);
               }
 
-              COFF_SectionFlags   section_flags  = ref_symbol.obj->section_flags[section_number-1];
+              COFF_SectionFlags   section_flags  = walk_obj->section_flags[section_number-1];
 
               // on first section visit, set live flag and enqueue section
-              U8 was_visited = ins_atomic_u8_eval_assign(&is_live[ref_symbol.obj->input_idx][section_number], 1);
+              U8 was_visited = ins_atomic_u8_eval_assign(&is_live[walk_obj->input_idx][section_number], 1);
               if (was_visited) { continue; }
 
               // is section eligible for walking?
@@ -2686,8 +2805,8 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
               if (section_flags & LNK_SECTION_FLAG_DEBUG)     { continue; }
 
               LNK_RelocRefs refs = {0};
-              refs.obj    = ref_symbol.obj;
-              refs.relocs = lnk_coff_reloc_info_from_section_number(ref_symbol.obj, section_number);
+              refs.obj         = walk_obj;
+              refs.relocs = lnk_coff_reloc_info_from_section_number(walk_obj, section_number);
 
               // get a batch node
               LNK_RelocRefsBatch *batch = last_batch;
@@ -2793,6 +2912,1611 @@ THREAD_POOL_TASK_FUNC(lnk_walk_relocs_and_mark_ref_sections_task)
   ProfEnd();
 }
 
+// One FNV-1a-like step: xor `x` into `h`, multiply by the 64-bit FNV prime, then fold the high
+// bits down with a 29-bit xorshift. Returns the mixed state.
+internal U64
+lnk_icf_mix(U64 h, U64 x)
+{
+  U64 mixed = (h ^ x) * 0x100000001b3ull;
+  return mixed ^ (mixed >> 29);
+}
+
+// Flat open-addressing U64 -> U64 map used by /OPT:ICF. The tree-based HashMap is far too slow
+// for the millions of per-round dense-id lookups; this linear-probing table keeps the hot loops
+// cache-friendly. Empty slots hold key == LNK_ICF_EMPTY, so that value must never be used as a key.
+#define LNK_ICF_EMPTY 0xffffffffffffffffull
+typedef struct LNK_ICFMap
+{
+  U64 *keys; // slot keys; LNK_ICF_EMPTY marks an unused slot
+  U64 *vals; // slot values, parallel to keys[]; only meaningful where keys[slot] != LNK_ICF_EMPTY
+  U64  mask; // slot count - 1 (slot count is a power of two); wraps probe indices
+} LNK_ICFMap;
+
+// 64-bit avalanche step (the constants and 33-bit shifts are those of MurmurHash3's fmix64
+// finalizer). Structured keys such as (input_idx<<32)|section_number repeat their low bits across
+// objects; scrambling spreads them over slots instead of clustering into long probe chains.
+internal U64
+lnk_icf_scramble(U64 x)
+{
+  U64 v = x;
+  v = (v ^ (v >> 33)) * 0xff51afd7ed558ccdull;
+  v = (v ^ (v >> 33)) * 0xc4ceb9fe1a85ec53ull;
+  v = v ^ (v >> 33);
+  return v;
+}
+
+// Creates an empty map on `arena` able to hold `capacity` keys at <= 50% load: the slot count is the
+// smallest power of two >= 2*capacity. keys[] is filled with LNK_ICF_EMPTY; vals[] is left unset.
+internal LNK_ICFMap
+lnk_icf_map_make(Arena *arena, U64 capacity)
+{
+  U64 slot_count = 1;
+  for (U64 wanted = capacity*2; slot_count < wanted; slot_count <<= 1) {}
+  U64 *key_slots = push_array_no_zero(arena, U64, slot_count);
+  U64 *val_slots = push_array_no_zero(arena, U64, slot_count);
+  // LNK_ICF_EMPTY is all-0xFF bytes, so a single byte fill marks every slot empty. The table can be
+  // 32M+ entries (256MB) on the monolithic link; a scalar per-U64 store loop was a page-fault sink.
+  MemorySet(key_slots, 0xff, slot_count*sizeof(U64));
+  LNK_ICFMap map = {0}; map.keys = key_slots; map.vals = val_slots; map.mask = slot_count - 1;
+  return map;
+}
+
+// Linear-probe lookup: returns the value stored for `key`, or `fallback` once an empty slot is hit.
+internal U64
+lnk_icf_map_get(LNK_ICFMap *m, U64 key, U64 fallback)
+{
+  U64 result = fallback;
+  for (U64 slot = lnk_icf_scramble(key) & m->mask; ; slot = (slot + 1) & m->mask) {
+    U64 occupant = m->keys[slot];
+    if (occupant == key)           { result = m->vals[slot]; break; }
+    if (occupant == LNK_ICF_EMPTY) { break; }
+  }
+  return result;
+}
+
+// Non-atomic insert/overwrite: probe to `key`'s slot or to the first empty slot, then store there.
+internal void
+lnk_icf_map_put(LNK_ICFMap *m, U64 key, U64 val)
+{
+  U64 slot = lnk_icf_scramble(key) & m->mask;
+  // step past slots owned by other keys
+  while (m->keys[slot] != LNK_ICF_EMPTY && m->keys[slot] != key) { slot = (slot + 1) & m->mask; }
+  m->keys[slot] = key;
+  m->vals[slot] = val;
+}
+
+// Thread-safe insert for the cand_map build (lnk_opt_icf): the keys are UNIQUE (one entry per
+// (input_idx, sn) candidate -- a 1:1 map), so each insert claims a distinct empty slot. Claim it
+// with an atomic CAS EMPTY->key; only the CAS winner writes vals[slot]. Because keys are unique no
+// two writers ever target the same key, so the CAS is the sole arbiter of slot ownership and the
+// vals[] store has exactly one writer. There is no k == key check here: a repeated key would simply
+// take a second slot. The map is only READ later (lnk_icf_map_get, after the build barrier), and a
+// by-key probe finds its key whichever slot collisions placed it in, so results are order-independent.
+internal void
+lnk_icf_map_put_atomic(LNK_ICFMap *m, U64 key, U64 val)
+{
+  U64 slot = lnk_icf_scramble(key) & m->mask;
+  for (;;) {
+    U64 k = m->keys[slot];
+    if (k == LNK_ICF_EMPTY) {
+      // CAS returns the prior value; if it was EMPTY we won the slot.
+      if (ins_atomic_u64_eval_cond_assign(&m->keys[slot], key, LNK_ICF_EMPTY) == LNK_ICF_EMPTY) {
+        m->vals[slot] = val; // sole writer of this slot's value (keys are unique -> one CAS winner)
+        return;
+      }
+      // lost the race: another thread's key now owns this slot. Loop without advancing so the
+      // re-read above observes the occupied slot and the probe steps forward on the next pass.
+      continue;
+    }
+    slot = (slot + 1) & m->mask;
+  }
+}
+
+typedef struct LNK_ICFCand
+{
+  LNK_Obj *obj;         // object file that contains the candidate section
+  U32      sn;          // section number
+  U32      reloc_first; // index into flattened reloc-target arrays
+  U32      reloc_count; // number of entries in the flattened arrays, starting at reloc_first
+  U64      key0;        // round-0 content key
+  B8       is_static;   // static (internal-linkage) COMDAT: folded via icf_fold, not symbol redirect
+  // NOTE: per-candidate equivalence-class "color" lives in a SEPARATE dense U32 colors[] array
+  // (see LNK_ICFRefineTask.colors), not here -- the refine/scan loops gather colors by candidate
+  // index millions of times; a 4B dense array keeps them cache-dense vs pulling this whole struct.
+} LNK_ICFCand;
+
+typedef struct LNK_ICFCandMapPutTask
+{
+  Rng1U64     *ranges;   // per-task candidate index range, indexed by task_id
+  LNK_ICFCand *cands;    // candidates whose (obj->input_idx, sn) keys are inserted
+  LNK_ICFMap  *cand_map; // destination map; receives key -> candidate index + 1
+} LNK_ICFCandMapPutTask;
+
+// Parallel build of the (input_idx, sn) -> cand_idx+1 lookup; each task inserts its own candidate
+// range through lnk_icf_map_put_atomic (the serial build was bound by per-insert cache misses).
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_candmap_put_task)
+{
+  LNK_ICFCandMapPutTask *task = raw_task;
+  for EachInRange(cand_idx, task->ranges[task_id]) {
+    LNK_ICFCand *cand = &task->cands[cand_idx];
+    U64          key  = Compose64Bit(cand->obj->input_idx, cand->sn);
+    lnk_icf_map_put_atomic(task->cand_map, key, cand_idx + 1);
+  }
+}
+
+typedef struct LNK_ICFHashTask
+{
+  Rng1U64         *ranges;    // per-task candidate index range, indexed by task_id
+  LNK_ICFCand     *cands;     // candidates; the task writes each one's key0
+  LNK_ICFMap      *cand_map;  // (input_idx, sn) -> candidate index + 1; a lookup result of 0 means absent
+  LNK_SymbolTable *symtab;    // used to resolve undefined/weak reloc symbols
+  U8              *rt_iscand; // out, per reloc: 1 when the target section is itself a candidate
+  U64             *rt_target; // out, per reloc: candidate index when rt_iscand, else a hashed target id
+} LNK_ICFHashTask;
+
+// per-candidate content hash (key0) + relocation-target resolution (parallel; each candidate writes
+// its own disjoint rt_iscand/rt_target slice and key0, all reads are of immutable structures)
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_hash_task)
+{
+  LNK_ICFHashTask *task = raw_task;
+  for EachInRange(ci, task->ranges[task_id]) {
+    LNK_ICFCand        *c      = &task->cands[ci];
+    COFF_SectionHeader *header = lnk_coff_section_header_from_section_number(c->obj, c->sn);
+    COFF_RelocArray     relocs = lnk_coff_relocs_from_section_header(c->obj, header);
+    String8             data   = str8_substr(c->obj->data, rng_1u64(header->foff, header->foff + header->fsize));
+
+    blake3_hasher h; blake3_hasher_init(&h);
+    U32 flags_for_hash = c->obj->section_flags[c->sn - 1] & ~(COFF_SectionFlag_LnkCOMDAT | COFF_SectionFlag_LnkRemove);
+    blake3_hasher_update(&h, &flags_for_hash, sizeof(flags_for_hash));
+    blake3_hasher_update(&h, &header->fsize, sizeof(header->fsize));
+    blake3_hasher_update(&h, data.str, data.size);
+
+    for EachIndex(ri, relocs.count) {
+      COFF_Reloc *reloc = &relocs.v[ri];
+      blake3_hasher_update(&h, &reloc->type, sizeof(reloc->type));
+      blake3_hasher_update(&h, &reloc->apply_off, sizeof(reloc->apply_off));
+
+      COFF_ParsedSymbol          tp   = lnk_parsed_symbol_from_coff_symbol_idx(c->obj, reloc->isymbol);
+      COFF_SymbolValueInterpType ti   = coff_interp_from_parsed_symbol(tp);
+      LNK_ObjSymbolRef           tref = { c->obj, reloc->isymbol };
+      if (ti == COFF_SymbolValueInterp_Undefined || ti == COFF_SymbolValueInterp_Weak) {
+        LNK_ObjSymbolRef resolved = {0};
+        if (lnk_resolve_symbol(task->symtab, tref, &resolved)) {
+          tref = resolved;
+          tp   = lnk_parsed_symbol_from_coff_symbol_idx(tref.obj, tref.symbol_idx);
+          ti   = coff_interp_from_parsed_symbol(tp);
+        }
+      }
+
+      U8  iscand = 0;
+      U64 target = 0;
+      if (ti == COFF_SymbolValueInterp_Regular && tref.obj != 0) {
+        // Canonicalize a COMDAT definition to its selected leader so the SAME logical symbol keys
+        // identically regardless of which obj's local copy this reloc happens to name. Two byte-
+        // identical functions that each reference their own copy of a shared COMDAT (writable
+        // static guards, vtables, selectany globals) must otherwise get distinct per-obj keys and
+        // never fold. The symlink leader is the post-resolution canonical definition (built before
+        // ICF, read-only here), so keying by it is strictly more correct than per-obj.
+        // Keep the reloc's own target offset (tp.value): canonicalize only the SECTION IDENTITY
+        // (obj,sn) to the leader, never the offset, so distinct offsets into the same section stay
+        // distinct. The leader symbol itself always has value 0 (the symlink picks the value==0
+        // definition), but a reloc may name an interior offset.
+        LNK_Obj *kobj = tref.obj;
+        U64      ksn  = tp.section_number;
+        if (ksn >= 1 && ksn <= kobj->header.section_count_no_null &&
+            (kobj->section_flags[ksn - 1] & COFF_SectionFlag_LnkCOMDAT)) {
+          LNK_Symbol *leader = lnk_obj_get_comdat_symlink(kobj, ksn);
+          if (leader) {
+            LNK_ObjSymbolRef  lref = lnk_ref_from_symbol(leader);
+            // only section_number is needed here; skip the string-table name decode that the
+            // full lnk_parsed_from_symbol does per COMDAT reloc (millions of relocs in this phase).
+            if (lref.obj != 0) {
+              COFF_ParsedSymbol lp = lnk_parsed_symbol_from_coff_symbol_idx_no_name(lref.obj, lref.symbol_idx);
+              kobj = lref.obj; ksn = lp.section_number;
+            }
+          }
+        }
+        U64 cv = lnk_icf_map_get(task->cand_map, Compose64Bit(kobj->input_idx, ksn), 0);
+        if (cv) { iscand = 1; target = cv - 1; }
+        else    { target = lnk_icf_mix(Compose64Bit(kobj->input_idx, ksn), tp.value); }
+      } else {
+        U64 nh = 14695981039346656037ull;
+        for (U64 i = 0; i < tp.name.size; i += 1) { nh = lnk_icf_mix(nh, tp.name.str[i]); }
+        target = lnk_icf_mix(nh, tp.value);
+      }
+
+      U64 idx = (U64)c->reloc_first + ri;
+      task->rt_iscand[idx] = iscand; task->rt_target[idx] = target;
+      if (!iscand) { blake3_hasher_update(&h, &target, sizeof(target)); }
+      else         { U64 toff = tp.value; blake3_hasher_update(&h, &toff, sizeof(toff)); } // candidate identity is refined via colors later; the offset into it must still split key0
+    }
+
+    U64 out[2]; // U64-typed digest buffer: the halves are read without casting a U8 array to U64*
+    blake3_hasher_finalize(&h, (U8 *)out, sizeof(out));
+    c->key0 = lnk_icf_mix(out[0], out[1]);
+  }
+}
+
+// Split active[0,n) into worker_count contiguous ranges of ~equal RELOC WEIGHT (refine/verify cost
+// per candidate ~= its reloc_count + 1), instead of equal candidate count, so a worker holding
+// high-reloc candidates does not run long while others idle. Ranges are a pure function of
+// reloc_count[active[]] -> deterministic. Returns a worker_count+1 array whose last entry is the
+// empty tail [n,n); worker_count == 0 yields just that tail. active==0 weights cands[i] over [0,n).
+internal Rng1U64 *
+lnk_icf_divide_by_reloc(Arena *arena, U32 *active, U64 n, U32 worker_count, LNK_ICFCand *cands)
+{
+  Rng1U64 *ranges = push_array_no_zero(arena, Rng1U64, worker_count + 1);
+  U64 total_w = 0;
+  for EachIndex(i, n) { total_w += cands[active ? active[i] : i].reloc_count + 1; } // +1 so zero-reloc cands still count
+  U64 per_w = (total_w + worker_count - 1) / (worker_count ? worker_count : 1);
+  U64 cursor = 0, acc = 0, w = 0;
+  for (; w < worker_count; w += 1) {
+    U64 begin = cursor;
+    U64 target = (w + 1) * per_w;
+    while (cursor < n && acc < target) { acc += cands[active ? active[cursor] : cursor].reloc_count + 1; cursor += 1; }
+    ranges[w] = rng_1u64(begin, cursor);
+  }
+  // any remainder from rounding -> last worker (skipped with no workers: worker_count - 1 would wrap)
+  if (cursor < n && worker_count != 0) { ranges[worker_count - 1].max = n; }
+  ranges[worker_count] = rng_1u64(n, n);
+  return ranges;
+}
+
+typedef struct LNK_ICFRefineTask
+{
+  Rng1U64     *ranges; // per-task index range, indexed by task_id
+  LNK_ICFCand *cands;  // candidates; reloc_first/reloc_count locate each one's reloc slice
+  U32         *colors; // dense per-candidate color array, U32 (color ids < color_base < 2^32);
+                       // separate from the 40B LNK_ICFCand so the hot color gathers touch 4B/elem.
+  U8          *rt_iscand; // per reloc: nonzero when rt_target is a candidate index
+  U64         *rt_target; // per reloc: candidate index (its color is folded in) or a fixed target id
+  U64         *newkey; // out: refinement key, written at the candidate's own index
+  U32         *active; // when set, ranges index into active[] rather than cands[] directly
+} LNK_ICFRefineTask;
+
+// Refinement step: for every candidate in this task's range, fold its own color and then each
+// reloc target (the target's current color when it is a candidate, else its fixed id) into newkey.
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_refine_task)
+{
+  LNK_ICFRefineTask *task = raw_task;
+  for EachInRange(pos, task->ranges[task_id]) {
+    U64 cand_idx = pos;
+    if (task->active) { cand_idx = task->active[pos]; }
+    LNK_ICFCand *cand = &task->cands[cand_idx];
+    U64 key = lnk_icf_mix(0x9e3779b97f4a7c15ull, task->colors[cand_idx]);
+    for (U64 slot = cand->reloc_first, end = slot + cand->reloc_count; slot < end; slot += 1) {
+      U64 target = task->rt_target[slot];
+      if (task->rt_iscand[slot]) { target = task->colors[target]; }
+      key = lnk_icf_mix(key, target);
+    }
+    task->newkey[cand_idx] = key;
+  }
+}
+
+// NOTE(review): this first paragraph documents the parallel ICF fold step, not the struct below;
+// it appears to have been separated from its function -- confirm and move it next to the fold task.
+// Parallel ICF fold: groups of same-colored candidates are independent (members are distinct (obj,sn)
+// sections -> distinct symlink nodes and icf_fold slots), so leader election, byte/reloc verification
+// (the per-follower str8_match/memcmp was the largest serial sink) and the fold writes run
+// concurrently across groups. Deterministic: each group elects the same leader (smallest
+// (is_static,input_idx,sn)) and writes the same disjoint slots; fold_count is accumulated per-worker.
+// Parallel helpers for lnk_icf_dense_colors_active: the per-round gather (sk[i]=newkey[active[i]])
+// and the final scatter (colors[active[sv[k]]] = color_at[k]) are random-access over the candidate
+// arrays (cache-miss bound). Spreading them across workers hides the memory latency. The boundary
+// counting between them is the mark/apply scan (per-chunk parallel + a serial prefix over chunk totals).
+typedef struct LNK_ICFDenseTask
+{
+  Rng1U64     *ranges;
+  U32         *active;
+  U64         *newkey;
+  U64         *sk;
+  U32         *sv;
+  U32         *color_at; // per-sorted-position final color (written by the apply pass, then scattered)
+  U32         *colors;   // dense per-candidate color array (scatter target)
+  U8          *keep;     // per-sorted-position: 1 if its key-run has size>=2 (survives to next round)
+  U32         *out_slot; // per-sorted-position: exclusive prefix of keep[] -> deterministic emit slot
+  U32         *next_active; // emit target (survivors, in sorted-color order)
+  // parallel prefix-sum group scan (replaces the per-round serial scan over sk[] -- the refine-window
+  // sawtooth bottleneck, lnk_link_image self). boundary (run start) and keep (run size>=2 survivor)
+  // are per-position bits reading only sk[k], sk[k-1], sk[k+1] -> chunk-local. mark: write keep[] +
+  // per-chunk local boundary/keep/surviving-class counts. serial: tiny exclusive prefix over the
+  // worker_count chunk totals (NOT over n). apply: each chunk re-derives color_at[]/out_slot[] from
+  // its prefixed base -> identical values to the old serial running counter regardless of chunk split.
+  U64         *chunk_nc;    // [worker_count] in: local boundary count; serial-prefix -> chunk's exclusive base
+  U64         *chunk_keep;  // [worker_count] in: local keep count;     serial-prefix -> chunk's exclusive base
+  U64         *chunk_scls;  // [worker_count] local surviving-class count (boundary && keep), summed serially
+  U64          base;        // color id base for this round
+  U64          n_total;     // n (for the k+1==N end-of-run test; spans chunk edges read-only)
+} LNK_ICFDenseTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_dense_gather_task)
+{
+  LNK_ICFDenseTask *dense = raw_task; // seeds the sort input: key = newkey of active[pos], value = pos
+  for EachInRange(pos, dense->ranges[task_id]) { U32 cand = dense->active[pos]; dense->sk[pos] = dense->newkey[cand]; dense->sv[pos] = (U32)pos; }
+}
+
+// mark pass over one chunk of sorted positions. A position starts a run when its key differs from
+// the previous key (or it is position 0) and ends a run when it differs from the next key (or it is
+// the last position); a position that does both is a singleton and is dropped (keep = 0). Writes
+// keep[] and this chunk's counts of run starts, kept positions and kept run starts for task_id.
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_scan_mark_task)
+{
+  LNK_ICFDenseTask *scan  = raw_task;
+  Rng1U64           chunk = scan->ranges[task_id];
+  U64              *keys  = scan->sk;
+  U64 run_starts = 0, kept = 0, kept_runs = 0;
+  for (U64 pos = chunk.min; pos < chunk.max; pos += 1) {
+    B32 starts_run = (pos == 0                  || keys[pos] != keys[pos - 1]);
+    B32 ends_run   = (pos + 1 == scan->n_total || keys[pos + 1] != keys[pos]);
+    U8  survive    = (U8)(!starts_run || !ends_run); // De Morgan form of !(starts_run && ends_run)
+    scan->keep[pos] = survive;
+    run_starts += starts_run ? 1 : 0;
+    kept       += survive;
+    kept_runs  += (starts_run && survive) ? 1 : 0; // a kept run start == one surviving class
+  }
+  scan->chunk_nc[task_id]   = run_starts;
+  scan->chunk_keep[task_id] = kept;
+  scan->chunk_scls[task_id] = kept_runs;
+}
+
+// apply pass over one chunk: by now chunk_nc[task_id]/chunk_keep[task_id] hold the number of run
+// starts / kept positions in all earlier chunks. Walk the chunk with those as running counters and
+// write color_at[pos] = base + (index of pos's run) and out_slot[pos] = kept positions before pos.
+// NOTE(review): reads keep[] as written by lnk_icf_scan_mark_task; assumes the same ranges were used.
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_scan_apply_task)
+{
+  LNK_ICFDenseTask *scan      = raw_task;
+  Rng1U64           chunk     = scan->ranges[task_id];
+  U64               runs_seen = scan->chunk_nc[task_id];   // run starts before this chunk
+  U64               emit_slot = scan->chunk_keep[task_id]; // kept positions before this chunk
+  for (U64 pos = chunk.min; pos < chunk.max; pos += 1) {
+    B32 starts_run = (pos == 0 || scan->sk[pos] != scan->sk[pos - 1]);
+    if (starts_run) { runs_seen += 1; }
+    scan->color_at[pos] = (U32)(scan->base + (runs_seen - 1));
+    scan->out_slot[pos] = (U32)emit_slot;
+    emit_slot += scan->keep[pos];
+  }
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_dense_scatter_task)
+{
+  LNK_ICFDenseTask *dense = raw_task; // writes each sorted position's class id back to its candidate
+  for EachInRange(pos, dense->ranges[task_id]) { U32 cand = dense->active[dense->sv[pos]]; dense->colors[cand] = dense->color_at[pos]; }
+}
+
+// parallel survivor emit: every kept sorted position copies its candidate (active[sv[pos]]) into
+// next_active[] at the slot precomputed in out_slot[pos]. Each kept position has its own slot, so
+// the emitted order does not depend on which worker handles it; dropped positions write nothing.
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_dense_emit_task)
+{
+  LNK_ICFDenseTask *dense = raw_task;
+  for EachInRange(pos, dense->ranges[task_id]) {
+    if (!dense->keep[pos]) { continue; }
+    dense->next_active[dense->out_slot[pos]] = dense->active[dense->sv[pos]];
+  }
+}
+
+// Densify the newkey[] values of the active subset into class ids drawn from `base` upward (so they
+// never collide with the finalized colors of candidates that already dropped out as singletons),
+// and write them to colors[]. In the same sorted pass it also emits the compacted next-round active
+// set (members of classes that still have size >= 2) into next_active[].
+//   active / n - candidate indices taking part in this round
+//   out_next_count / out_next_class_count - optional; receive the number of surviving members and
+//                the number of surviving classes
+// Determinism: equal keys land in one class regardless of sort tie order, and next_active is
+// emitted in sorted-position order. Memory taken from `arena` is released before returning.
+// Returns the number of distinct classes among the active set.
+internal U64
+lnk_icf_dense_colors_active(TP_Context *tp, Arena *arena, U32 *active, U64 n, U64 *newkey, U32 *colors, U64 base,
+                            U32 *next_active, U64 *out_next_count, U64 *out_next_class_count)
+{
+  Temp t  = temp_begin(arena);
+  U64 *sk       = push_array_no_zero(t.arena, U64, n ? n : 1);
+  U32 *sv       = push_array_no_zero(t.arena, U32, n ? n : 1); // index into active[]
+  U32 *color_at = push_array_no_zero(t.arena, U32, n ? n : 1); // per-sorted-position color
+
+  LNK_ICFDenseTask dt = {0};
+  dt.active = active; dt.newkey = newkey; dt.sk = sk; dt.sv = sv; dt.color_at = color_at; dt.colors = colors;
+
+  // parallel gather: random-access newkey[active[i]] -> sk[i], spread across workers. NOTE: the
+  // ranges are rebuilt after lnk_radix_sort_u64_pairs (which is handed t.arena) instead of reused;
+  // per the original author, reusing the pre-sort ranges is what made the parallel path wrong.
+  dt.ranges = tp_divide_work(t.arena, n, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_gather_task, &dt);
+  lnk_radix_sort_u64_pairs(tp, t.arena, n, sk, sv);
+  dt.ranges = tp_divide_work(t.arena, n, tp->worker_count); // rebuilt AFTER radix for scatter
+
+  // PARALLEL prefix-sum group scan over the SORTED keys (replaces the per-round serial scan that was
+  // the refine-window sawtooth bottleneck). mark: per-position keep bit + per-chunk local counts ->
+  // serial: tiny exclusive prefix over the worker_count chunk totals (NOT over n) -> apply: each chunk
+  // re-derives color_at[]/out_slot[] from its prefixed base. The result is a pure prefix of per-position
+  // bits, byte-identical to the old serial running counter regardless of chunk split -> deterministic.
+  U8  *keep      = push_array_no_zero(t.arena, U8,  n ? n : 1);
+  U32 *out_slot  = push_array_no_zero(t.arena, U32, n ? n : 1);
+  U64 W          = tp->worker_count;
+  U64 *chunk_nc  = push_array_no_zero(t.arena, U64, W ? W : 1);
+  U64 *chunk_kp  = push_array_no_zero(t.arena, U64, W ? W : 1);
+  U64 *chunk_sc  = push_array_no_zero(t.arena, U64, W ? W : 1);
+
+  dt.keep = keep; dt.out_slot = out_slot; dt.next_active = next_active;
+  dt.chunk_nc = chunk_nc; dt.chunk_keep = chunk_kp; dt.chunk_scls = chunk_sc;
+  dt.base = base; dt.n_total = n;
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_scan_mark_task, &dt);
+
+  // exclusive prefix over the W chunk totals -> chunk_nc[w]/chunk_kp[w] become the running class id /
+  // emit slot at chunk w's first position (the exclusive base apply counts from); sum surviving classes.
+  U64 nc = 0, next_count = 0, next_class_count = 0;
+  for EachIndex(w, W) {
+    U64 cn = chunk_nc[w], ck = chunk_kp[w], cs = chunk_sc[w];
+    chunk_nc[w] = nc; chunk_kp[w] = next_count;
+    nc += cn; next_count += ck; next_class_count += cs;
+  }
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_scan_apply_task, &dt);
+
+#if defined(ICF_SCAN_SELFCHECK)
+  { // SELF-CHECK (debug-only, gated): recompute the serial scan into shadow arrays, log any mismatch
+    Temp tc = temp_begin(t.arena);
+    U8  *xkeep = push_array_no_zero(tc.arena, U8,  n ? n : 1);
+    U32 *xslot = push_array_no_zero(tc.arena, U32, n ? n : 1);
+    U32 *xcol  = push_array_no_zero(tc.arena, U32, n ? n : 1);
+    U64 xnc=0, xnext=0, xncls=0, run0=0, mism=0;
+    for EachIndex(k, n) {
+      B32 b = (k == 0 || sk[k] != sk[k - 1]);
+      if (b && k != 0) { U8 s=(k-run0>=2)?1:0; if(s)xncls++; for(U64 m=run0;m<k;m++){xkeep[m]=s;xslot[m]=(U32)xnext;xnext+=s;} run0=k; }
+      if (b) xnc++;
+      xcol[k]=(U32)(base+(xnc-1));
+    }
+    { U8 s=(n-run0>=2)?1:0; if(s)xncls++; for(U64 m=run0;m<n;m++){xkeep[m]=s;xslot[m]=(U32)xnext;xnext+=s;} }
+    for EachIndex(k,n){ if(xkeep[k]!=keep[k]||xslot[k]!=out_slot[k]||xcol[k]!=color_at[k]) mism++; }
+    if(mism||xnc!=nc||xnext!=next_count||xncls!=next_class_count){
+      lnk_log(LNK_Log_Debug,"ICF_SCAN_SELFCHECK MISMATCH n=%llu mism=%llu nc(p=%llu s=%llu) next(p=%llu s=%llu) ncls(p=%llu s=%llu)",n,mism,nc,xnc,next_count,xnext,next_class_count,xncls);
+    }
+    temp_end(tc);
+  }
+#endif
+
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_scatter_task, &dt); // parallel color scatter
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_dense_emit_task, &dt);    // parallel survivor emit
+
+  temp_end(t);
+  if (out_next_count)       { *out_next_count       = next_count; }
+  if (out_next_class_count) { *out_next_class_count = next_class_count; }
+  return nc;
+}
+
+// radix digit width (also defined at the radix-sort section below; hoisted here so the persistent
+// refine region can size its histogram). Keep both in sync. Both macros sit under the BITS guard.
+#if !defined(LNK_RADIX_BITS)
+#define LNK_RADIX_BITS 8
+#define LNK_RADIX_SIZE (1 << LNK_RADIX_BITS)
+#endif
+
+// ============================================================================================
+// PERSISTENT-WORKER ICF REFINE REGION
+//
+// The per-round refine loop (refine -> dense_colors_active{gather,radix,scan,scatter,emit}) used to
+// re-enter the thread pool ~10x per round via tp_for_parallel, ~18 rounds => ~126 fork-join cycles.
+// Each phase is tiny, so the pool wake->work->sleep latency dominated lnk_link_image self time (the
+// sawtooth). lnk_icf_refine_region collapses ALL rounds into ONE tp_for_parallel of worker_count
+// participants; every old phase boundary becomes a barrier_wait(tp->barrier) inside the region. The
+// tiny serial glue (radix per-pass 256xW prefix, group-scan W-chunk prefix, convergence/buffer swap)
+// runs on worker 0 behind a barrier while the others wait -- same work, no pool re-entry.
+//
+// ORDER PRESERVATION: every phase keeps its math. ranges are rebuilt each round by worker 0 into
+// preallocated buffers (lnk_icf_divide_by_reloc_into / lnk_tp_divide_work_into) and read by all
+// workers after a barrier. The refine split there is count-based, not reloc-weighted; that only
+// changes which worker computes a key, never the key. The radix sort, scan_mark/apply, scatter and
+// emit bodies are the SAME computations as the standalone tasks (barrier instead of tp_for_parallel),
+// so the result is byte-identical to the per-round path (ICF_SCAN_SELFCHECK + the canonical DLL cmp).
+// ============================================================================================
+
+typedef struct LNK_ICFRegion
+{
+  // immutable inputs
+  TP_Context  *tp;           // thread pool; its barrier separates the phases inside the region
+  LNK_ICFCand *cands;
+  U8          *rt_iscand;    // per reloc: nonzero when rt_target is a candidate index
+  U64         *rt_target;    // per reloc: candidate index or fixed target id
+  U64          cand_count;
+  U32          worker_count;
+
+  // ping-pong active sets + colors/keys (driver owns the buffers; pointers swap each round on worker 0)
+  U32         *active;       // current active set (refine reads)
+  U32         *active2;      // next active set (emit writes)
+  U32         *colors;       // dense per-candidate color (scatter target)
+  U64         *newkey;       // refine writes; gather reads
+
+  // per-round scratch (preallocated to cand_count once; reused every round)
+  U64         *sk;           // sorted keys
+  U32         *sv;           // sorted values (index into active[])
+  U32         *color_at;     // per-sorted-position color
+  U8          *keep;         // per-sorted-position survivor bit
+  U32         *out_slot;     // per-sorted-position emit slot
+  U64         *kbuf;         // radix double-buffer (keys)
+  U32         *vbuf;         // radix double-buffer (vals)
+  U32         *hist;         // radix histogram [worker_count * LNK_RADIX_SIZE]
+  U64         *chunk_nc;     // [worker_count] group-scan boundary counts
+  U64         *chunk_kp;     // [worker_count] group-scan keep counts
+  U64         *chunk_sc;     // [worker_count] group-scan surviving-class counts
+  U64         *chunk_max;    // [worker_count] radix per-pass max-key reduction
+  Rng1U64     *refine_ranges;// [worker_count+1] refine split (lnk_icf_divide_by_reloc_into: count-based)
+  Rng1U64     *work_ranges;  // [worker_count+1] even split over n (gather/scan/scatter/emit/radix)
+
+  // round-loop control (written by worker 0, broadcast via barrier)
+  U64          active_count;
+  U64          active_class_count;
+  U64          color_base;
+  U64          round_n;          // active_count snapshot for the current round's phases
+  U64          radix_pass_count; // passes for current round's radix sort
+  U64         *radix_ksrc;       // current radix source/dest (swap on worker 0 each pass)
+  U32         *radix_vsrc;
+  U64         *radix_kdst;
+  U32         *radix_vdst;
+  B32          converged;        // set by worker 0 -> all break together
+  U64          max_rounds;       // hard cap on rounds (region stops, may hand off to worklist tail)
+} LNK_ICFRegion;
+
+// in-place refine split: fills preallocated ranges[0..worker_count]; count-based, NOT reloc-weighted
+internal void
+lnk_icf_divide_by_reloc_into(Rng1U64 *ranges, U32 *active, U64 n, U32 worker_count, LNK_ICFCand *cands)
+{
+  // Count-based split: O(worker_count), no scan, no random gather. The old reloc-weighted
+  // split did two O(n) cache-miss gathers (cands[active[i]].reloc_count) per round x ~18
+  // rounds = ~5.6s serial. Work-split only -> output independent of it.
+  U64 W = worker_count ? worker_count : 1;
+  for (U64 w = 0; w < worker_count; w += 1) {
+    ranges[w] = rng_1u64((w * n) / W, ((w + 1) * n) / W);
+  }
+  ranges[worker_count] = rng_1u64(n, n);
+  (void)active; (void)cands; // unused: parameters left over from the reloc-weighted version
+}
+
+// in-place even division (mirrors tp_divide_work; writes into preallocated ranges[0..worker_count])
+internal void
+lnk_tp_divide_work_into(Rng1U64 *ranges, U64 item_count, U32 worker_count)
+{
+  U64 W = worker_count ? worker_count : 1; U64 per_count = CeilIntegerDiv(item_count, W); // W avoids /0, as in lnk_icf_divide_by_reloc_into
+  for (U64 i = 0; i < worker_count; i += 1) {
+    ranges[i] = rng_1u64(Min(item_count, i * per_count), Min(item_count, i * per_count + per_count));
+  }
+  ranges[worker_count] = rng_1u64(item_count, item_count);
+}
+
+// One persistent parallel region spanning ALL refine rounds: worker_count participants, barriers at phase boundaries,
+// serial glue on worker 0, worker w owns refine_ranges[w] / work_ranges[w]. Leaves when rs->converged is set or after rs->max_rounds.
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_refine_region_task)
+{
+  LNK_ICFRegion *rs = raw_task;
+  U64 wid = worker_id; // 1:1 worker<->task: every participant blocks on the first barrier before any
+                       // can steal a second task, so task_id == worker_id throughout (see note below).
+  U64 W   = rs->worker_count;
+
+  for (U64 round = 0; round < rs->max_rounds; round += 1) {
+    // -------- round setup (worker 0): build ranges, snapshot active_count --------
+    if (wid == 0) {
+      rs->round_n = rs->active_count;
+      lnk_icf_divide_by_reloc_into(rs->refine_ranges, rs->active, rs->active_count, (U32)W, rs->cands);
+    }
+    barrier_wait(rs->tp->barrier);
+    U64 n = rs->round_n; // size of the active set for this whole round (active_count only changes at the end of the round)
+
+    // -------- PHASE: refine (reloc-weighted ranges) --------
+    {
+      Rng1U64 r = rs->refine_ranges[wid];
+      for EachInRange(ai, r) {
+        U64          ci = rs->active[ai];
+        LNK_ICFCand *c  = &rs->cands[ci];
+        U64 k = lnk_icf_mix(0x9e3779b97f4a7c15ull, rs->colors[ci]); // key starts from the candidate's own current color
+        for EachIndex(j, c->reloc_count) {
+          U64 idx = (U64)c->reloc_first + j;
+          U64 t   = rs->rt_iscand[idx] ? rs->colors[rs->rt_target[idx]] : rs->rt_target[idx]; // candidate target -> its color, else the raw rt_target value
+          k = lnk_icf_mix(k, t);
+        }
+        rs->newkey[ci] = k; // colors[] is only read in this phase; it is rewritten in the color-scatter phase
+      }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- build even-split ranges (worker 0) for gather --------
+    if (wid == 0) { lnk_tp_divide_work_into(rs->work_ranges, n, (U32)W); }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- PHASE: gather  sk[i]=newkey[active[i]], sv[i]=i --------
+    {
+      Rng1U64 r = rs->work_ranges[wid];
+      for EachInRange(i, r) { rs->sk[i] = rs->newkey[rs->active[i]]; rs->sv[i] = (U32)i; }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- PHASE: parallel LSD radix sort of (sk, sv) over n --------
+    // worker 0 establishes pass_count from a parallel max reduction; each pass = parallel hist ->
+    // serial 256xW prefix (worker 0) -> parallel scatter -> pointer swap (worker 0). Identical to
+    // lnk_radix_sort_u64_pairs, just barrier-driven (no temp/scratch alloc inside the region).
+    {
+      // parallel max-key reduction into chunk_max[wid]
+      {
+        Rng1U64 r = rs->work_ranges[wid];
+        U64 m = 0;
+        for EachInRange(i, r) { if (rs->sk[i] > m) m = rs->sk[i]; }
+        rs->chunk_max[wid] = m;
+      }
+      barrier_wait(rs->tp->barrier);
+      if (wid == 0) {
+        U64 max_key = 0;
+        for EachIndex(w, W) { if (rs->chunk_max[w] > max_key) max_key = rs->chunk_max[w]; }
+        U64 pass_count;
+        if (max_key != 0) {
+          U64 sig_bits   = 64 - clz64(max_key);
+          U64 sig_passes = (sig_bits + LNK_RADIX_BITS - 1) / LNK_RADIX_BITS;
+          pass_count = sig_passes + (sig_passes & 1); // round up to even: the src/dst swap per pass then ends with data in sk/sv
+        } else {
+          pass_count = 0;
+        }
+        if (n < 2) { pass_count = 0; } // 0 or 1 keys: nothing to sort
+        rs->radix_pass_count = pass_count;
+        rs->radix_ksrc = rs->sk;   rs->radix_vsrc = rs->sv;
+        rs->radix_kdst = rs->kbuf; rs->radix_vdst = rs->vbuf;
+      }
+      barrier_wait(rs->tp->barrier);
+
+      U64 pass_count = rs->radix_pass_count; // same value on every worker, so all run the same number of barriers
+      for (U64 pass = 0; pass < pass_count; pass += 1) {
+        U64  shift = pass * LNK_RADIX_BITS;
+        U64 *ksrc  = rs->radix_ksrc; U32 *vsrc = rs->radix_vsrc;
+        U64 *kdst  = rs->radix_kdst; U32 *vdst = rs->radix_vdst;
+
+        // hist (parallel): zero own row, count digits over own range
+        {
+          U32 *h = rs->hist + wid * LNK_RADIX_SIZE;
+          MemoryZero(h, sizeof(U32) * LNK_RADIX_SIZE);
+          Rng1U64 r = rs->work_ranges[wid];
+          for EachInRange(i, r) { h[(ksrc[i] >> shift) & (LNK_RADIX_SIZE - 1)] += 1; }
+        }
+        barrier_wait(rs->tp->barrier);
+
+        // exclusive prefix across (bucket, worker) -- serial, worker 0 (identical order to the task path)
+        if (wid == 0) {
+          U64 running = 0;
+          for EachIndex(bucket, LNK_RADIX_SIZE) {
+            for EachIndex(w, W) {
+              U32 *slot = &rs->hist[w * LNK_RADIX_SIZE + bucket];
+              U32  c    = *slot;
+              *slot     = (U32)running;
+              running  += c;
+            }
+          }
+        }
+        barrier_wait(rs->tp->barrier);
+
+        // scatter (parallel): each worker writes its disjoint contiguous run
+        {
+          U32 *h = rs->hist + wid * LNK_RADIX_SIZE;
+          Rng1U64 r = rs->work_ranges[wid];
+          for EachInRange(i, r) {
+            U64 d   = (ksrc[i] >> shift) & (LNK_RADIX_SIZE - 1);
+            U32 pos = h[d]++;
+            kdst[pos] = ksrc[i];
+            vdst[pos] = vsrc[i];
+          }
+        }
+        barrier_wait(rs->tp->barrier);
+
+        // swap src/dst (worker 0); even pass_count leaves sorted data back in sk/sv
+        if (wid == 0) {
+          rs->radix_ksrc = kdst; rs->radix_vsrc = vdst;
+          rs->radix_kdst = ksrc; rs->radix_vdst = vsrc;
+        }
+        barrier_wait(rs->tp->barrier);
+      }
+      // ranges unchanged across radix; work_ranges already an even split over n for scan/scatter/emit
+    }
+
+    // -------- PHASE: scan mark (per-position keep bit + per-chunk local counts) --------
+    {
+      Rng1U64 r = rs->work_ranges[wid];
+      U64 N = n;
+      U64 nloc = 0, kloc = 0, sloc = 0; // class starts / surviving positions / surviving class starts within this chunk
+      for (U64 k = r.min; k < r.max; k += 1) {
+        B32 boundary      = (k == 0     || rs->sk[k] != rs->sk[k - 1]);
+        B32 next_boundary = (k + 1 == N || rs->sk[k + 1] != rs->sk[k]);
+        U8  survive       = (U8)!(boundary && next_boundary); // a run of length 1 (singleton class) does not survive
+        rs->keep[k] = survive;
+        if (boundary)            { nloc += 1; }
+        if (survive)             { kloc += 1; }
+        if (boundary && survive) { sloc += 1; }
+      }
+      rs->chunk_nc[wid] = nloc; rs->chunk_kp[wid] = kloc; rs->chunk_sc[wid] = sloc;
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- serial: exclusive prefix over W chunk totals (worker 0) --------
+    if (wid == 0) {
+      U64 nc = 0, next_count = 0, next_class_count = 0;
+      for EachIndex(w, W) {
+        U64 cn = rs->chunk_nc[w], ck = rs->chunk_kp[w], cs = rs->chunk_sc[w];
+        rs->chunk_nc[w] = nc; rs->chunk_kp[w] = next_count; // chunk_sc is only totalled, never turned into a prefix
+        nc += cn; next_count += ck; next_class_count += cs;
+      }
+      // stash round totals in chunk_max[0..2] (free scratch; radix max-reduce reuses it fresh next
+      // round, and the convergence block below reads it within this same round before the swap).
+      rs->chunk_max[0] = nc; rs->chunk_max[1] = next_count; rs->chunk_max[2] = next_class_count; // NOTE(review): needs chunk_max to hold >= 3 entries even when W < 3 -- confirm at the allocation site
+    }
+    barrier_wait(rs->tp->barrier);
+
+    U64 base = rs->color_base; // read after the barrier; worker 0 only advances color_base in the convergence block below
+
+    // -------- PHASE: scan apply (re-derive color_at[]/out_slot[] from chunk's exclusive base) --------
+    {
+      Rng1U64 r = rs->work_ranges[wid];
+      U64 nc   = rs->chunk_nc[wid];
+      U64 slot = rs->chunk_kp[wid];
+      for (U64 k = r.min; k < r.max; k += 1) {
+        B32 boundary = (k == 0 || rs->sk[k] != rs->sk[k - 1]);
+        if (boundary) { nc += 1; }
+        rs->color_at[k] = (U32)(base + (nc - 1)); // id = base + (class starts up to and including k) - 1
+        rs->out_slot[k] = (U32)slot;
+        slot += rs->keep[k];
+      }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- PHASE: color scatter --------
+    {
+      Rng1U64 r = rs->work_ranges[wid];
+      for EachInRange(k, r) { rs->colors[rs->active[rs->sv[k]]] = rs->color_at[k]; }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- PHASE: survivor emit --------
+    {
+      Rng1U64 r = rs->work_ranges[wid];
+      for EachInRange(k, r) {
+        if (rs->keep[k]) { rs->active2[rs->out_slot[k]] = rs->active[rs->sv[k]]; }
+      }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    // -------- serial: convergence + buffer swap (worker 0), broadcast via barrier --------
+    if (wid == 0) {
+      U64 nc               = rs->chunk_max[0];
+      U64 next_count       = rs->chunk_max[1];
+      U64 next_class_count = rs->chunk_max[2];
+      rs->color_base += nc;
+      if (nc == rs->active_class_count) { // class count over the active set unchanged since the stored value -> treated as fixpoint
+        rs->converged = 1;
+      } else {
+        U32 *tmp = rs->active; rs->active = rs->active2; rs->active2 = tmp;
+        rs->active_count       = next_count;
+        rs->active_class_count = next_class_count;
+        if (rs->active_count == 0) { rs->converged = 1; }
+      }
+    }
+    barrier_wait(rs->tp->barrier);
+
+    if (rs->converged) { break; } // read after the barrier, so every worker sees the same flag and leaves together
+  }
+}
+
+// Classify object section `sect_idx` (0-based; its section number is sect_idx + 1) for ICF.
+// Returns 0 = not a candidate, 1 = COMDAT whose symlink symbol is defined by this obj at offset 0 of
+// this section, 2 = COMDAT with no symlink symbol (only reported when include_static is set).
+internal U32
+lnk_icf_section_kind(LNK_Obj *obj, U64 sect_idx, B32 include_static)
+{
+  COFF_SectionFlags sect_flags = obj->section_flags[sect_idx];
+  B32 is_comdat  = (sect_flags & COFF_SectionFlag_LnkCOMDAT) == COFF_SectionFlag_LnkCOMDAT;
+  B32 is_removed = (sect_flags & COFF_SectionFlag_LnkRemove) != 0;
+  if (!is_comdat || is_removed) { return 0; }
+  // eligible contents: code, or initialized data that is not writable (mutable data never folds)
+  B32 has_code    = (sect_flags & COFF_SectionFlag_CntCode) != 0;
+  B32 has_data    = (sect_flags & COFF_SectionFlag_CntInitializedData) != 0;
+  B32 is_writable = (sect_flags & COFF_SectionFlag_MemWrite) != 0;
+  if (!(has_code || (has_data && !is_writable))) { return 0; }
+  U32 sect_number = (U32)sect_idx + 1;
+  COFF_SectionHeader *sect_header = lnk_coff_section_header_from_section_number(obj, sect_number);
+  if (sect_header->fsize == 0) { return 0; } // no file bytes to compare
+  LNK_Symbol *symlink = lnk_obj_get_comdat_symlink(obj, sect_number);
+  if (symlink == 0) { if (include_static) { return 2; } return 0; } // static COMDAT: opt-in /OPT:ICFSTATIC, folded via icf_fold + dead-strip
+  LNK_ObjSymbolRef owner = lnk_ref_from_symbol(symlink);
+  if (owner.obj != obj) { return 0; } // symbol resolves to another obj: same-name follower (already removed)
+  COFF_ParsedSymbol parsed = lnk_parsed_from_symbol(symlink);
+  B32 at_section_start = (parsed.section_number == sect_number && parsed.value == 0);
+  return at_section_start ? 1 : 0; // 1 = external leader
+}
+
+typedef struct LNK_ICFCollectTask
+{
+  LNK_Obj     **objs;           // input objects; each task handles objs[task_id]
+  U64          *counts;         // per-obj candidate count, written by lnk_icf_count_task
+  U64          *offsets;        // per-obj first index into cands, read by lnk_icf_fill_task
+  LNK_ICFCand  *cands;          // flat candidate array, written by lnk_icf_fill_task
+  B32           include_static; // forwarded to lnk_icf_section_kind
+} LNK_ICFCollectTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_count_task)
+{
+  // collection pass 1: counts[task_id] = number of sections of objs[task_id] that lnk_icf_section_kind accepts
+  LNK_ICFCollectTask *task = raw_task;
+  LNK_Obj *o = task->objs[task_id]; U64 cand_n = 0;
+  for EachIndex(s, o->header.section_count_no_null) { cand_n += (lnk_icf_section_kind(o, s, task->include_static) != 0); }
+  task->counts[task_id] = cand_n;
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_fill_task)
+{
+  // collection pass 2: write the candidates of objs[task_id] into cands[], starting at offsets[task_id]
+  LNK_ICFCollectTask *task   = raw_task;
+  LNK_Obj            *o      = task->objs[task_id];
+  U64                 cursor = task->offsets[task_id];
+  for EachIndex(s, o->header.section_count_no_null) {
+    U32 kind = lnk_icf_section_kind(o, s, task->include_static);
+    if (kind == 0) { continue; }
+    LNK_ICFCand *out = &task->cands[cursor];
+    cursor += 1;
+    out->obj = o; out->sn = (U32)s + 1;
+    out->is_static = (kind == 2);
+    // relocs are counted here, in the parallel pass, while the section is in hand; reloc_first is
+    // left 0 for the caller to assign (per the original note: a serial prefix sum in lnk_opt_icf).
+    COFF_SectionHeader *sect_header = lnk_coff_section_header_from_section_number(o, out->sn);
+    out->reloc_first = 0; out->reloc_count = (U32)lnk_coff_relocs_from_section_header(o, sect_header).count;
+    out->key0 = 0;
+  }
+}
+
+// --- parallel LSD radix sort (U64 key + U32 value), up to 8 x 8-bit passes --------------------
+// 8-bit digits keep the serial per-pass offset prefix tiny (256*workers, not 65536*workers);
+// the extra passes are parallel histogram+scatter, so they cost little.
+#if !defined(LNK_RADIX_BITS) // guarded: code earlier in the file already uses these names, so they are presumably defined there too
+#define LNK_RADIX_BITS 8
+#define LNK_RADIX_SIZE (1 << LNK_RADIX_BITS)
+#endif
+typedef struct LNK_RadixSortTask
+{
+  Rng1U64 *ranges;          // per-task index range into ksrc/vsrc (indexed by task_id)
+  U64     *ksrc; U32 *vsrc; // read
+  U64     *kdst; U32 *vdst; // write (scatter)
+  U32     *hist;            // [worker_count * LNK_RADIX_SIZE], per-worker digit offsets
+  U64      shift;           // bit offset of the digit handled by the current pass
+} LNK_RadixSortTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_radix_hist_task)
+{
+  LNK_RadixSortTask *task = raw_task; // counts the current digit of ksrc over ranges[task_id] into this task's hist row
+  U32 *row = &task->hist[(U64)task_id * LNK_RADIX_SIZE]; U64 digit_mask = LNK_RADIX_SIZE - 1;
+  for EachInRange(idx, task->ranges[task_id]) { U64 digit = (task->ksrc[idx] >> task->shift) & digit_mask; row[digit] += 1; }
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_radix_scatter_task)
+{
+  // this task's hist row holds its next write position per digit; each pair is moved there and the position bumped
+  LNK_RadixSortTask *task = raw_task;
+  U32 *next_pos = &task->hist[(U64)task_id * LNK_RADIX_SIZE];
+  for EachInRange(idx, task->ranges[task_id]) {
+    U64 key = task->ksrc[idx];
+    U32 dst = next_pos[(key >> task->shift) & (LNK_RADIX_SIZE - 1)]++;
+    task->kdst[dst] = key; task->vdst[dst] = task->vsrc[idx];
+  }
+}
+
+// Sort the pairs (keys[i], vals[i]) ascending by key; the sorted pairs end up back in keys/vals.
+// Working memory (ranges, per-worker histograms, second key/value buffers) is pushed on a scratch
+// arena obtained with `arena` passed as the conflict, and is released before returning.
+// Scatter offsets are kept as U32, so n is assumed to fit in 32 bits.
+internal void
+lnk_radix_sort_u64_pairs(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *vals)
+{
+  if (n < 2) { return; }
+  U64 worker_n = tp->worker_count;
+
+  // Digits above the highest set bit of the largest key are zero in every key, so those passes
+  // are skipped (each one would be a full hist+scatter over n pairs). The buffers trade roles
+  // after every pass, hence the count is rounded up to even: the final pass then writes into
+  // the caller's arrays.
+  U64 key_max = 0;
+  for EachIndex(i, n) { key_max = (keys[i] > key_max) ? keys[i] : key_max; }
+  if (key_max == 0) { return; } // all keys are 0: nothing to reorder
+  U64 used_bits = 64 - clz64(key_max);
+  U64 passes    = (used_bits + LNK_RADIX_BITS - 1) / LNK_RADIX_BITS;
+  passes += (passes & 1);
+
+  Temp scratch = scratch_begin(&arena, 1);
+  LNK_RadixSortTask task = {0};
+  task.ranges = tp_divide_work(scratch.arena, n, worker_n);
+  task.hist   = push_array_no_zero(scratch.arena, U32, worker_n * LNK_RADIX_SIZE);
+  task.ksrc   = keys; task.kdst = push_array_no_zero(scratch.arena, U64, n);
+  task.vsrc   = vals; task.vdst = push_array_no_zero(scratch.arena, U32, n);
+
+  for (U64 pass = 0; pass < passes; pass += 1) {
+    task.shift = pass * LNK_RADIX_BITS;
+
+    // 1) every task counts the digits of its own range into its own histogram row
+    MemoryZero(task.hist, sizeof(U32) * worker_n * LNK_RADIX_SIZE);
+    tp_for_parallel(tp, 0, worker_n, lnk_radix_hist_task, &task);
+
+    // 2) counts -> exclusive start offsets, walked digit-major then worker, so the runs written
+    //    by different (digit, worker) cells are disjoint and contiguous
+    U64 offset = 0;
+    for EachIndex(digit, LNK_RADIX_SIZE) {
+      for EachIndex(w, worker_n) {
+        U32 *cell  = &task.hist[w * LNK_RADIX_SIZE + digit];
+        U32  count = *cell;
+        *cell      = (U32)offset;
+        offset    += count;
+      }
+    }
+
+    // 3) every task moves its pairs to their offsets in the destination buffers
+    tp_for_parallel(tp, 0, worker_n, lnk_radix_scatter_task, &task);
+
+    // 4) the destination becomes the source of the next pass
+    { U64 *k = task.ksrc; task.ksrc = task.kdst; task.kdst = k; }
+    { U32 *v = task.vsrc; task.vsrc = task.vdst; task.vdst = v; }
+  }
+
+  // `passes` is even, so the last pass wrote into keys/vals
+  scratch_end(scratch);
+}
+
+// Give every key a dense class id: colors[i] == colors[j] exactly when keys[i] == keys[j], with ids
+// numbered from 0 in ascending key order. Returns the number of distinct keys (0 when n == 0).
+internal U64
+lnk_icf_dense_colors(TP_Context *tp, Arena *arena, U64 n, U64 *keys, U32 *colors)
+{
+  Temp tmp = temp_begin(arena);
+  U64 *sorted_key = push_array_no_zero(tmp.arena, U64, n ? n : 1);
+  U32 *sorted_idx = push_array_no_zero(tmp.arena, U32, n ? n : 1);
+  for EachIndex(i, n) { sorted_key[i] = keys[i]; sorted_idx[i] = (U32)i; }
+  lnk_radix_sort_u64_pairs(tp, tmp.arena, n, sorted_key, sorted_idx);
+  U64 class_n = 0; // classes opened so far while walking the sorted keys
+  for EachIndex(pos, n) {
+    B32 starts_class = (pos == 0) || (sorted_key[pos] != sorted_key[pos - 1]);
+    class_n += starts_class ? 1 : 0; colors[sorted_idx[pos]] = (U32)(class_n - 1);
+  }
+  temp_end(tmp);
+  return class_n;
+}
+
+// ICF fold VERIFY (parallel, read-only). Per color group: elect the leader, then byte+reloc-verify
+// each follower against it. The dominant cost is the str8_match/memcmp byte compare over millions of
+// candidate sections -- read-only, so it parallelizes across groups with no shared writes. Output:
+// leader_sci[k] = sci[leader position] (the elected leader's index into cands) for sorted position k IF k is a verified follower
+// that folds, else max_U32 (the task only writes verified folds, so the caller presumably pre-fills max_U32). The serial apply pass (in lnk_opt_icf) consumes this in group order, so
+// the symlink/icf_fold writes stay serial+deterministic (cross-class symlink chains must not race).
+typedef struct LNK_ICFFoldVerifyTask
+{
+  Rng1U64     *ranges;      // group-index ranges per worker
+  U32         *group_first; // group gi covers sorted positions [group_first[gi], group_first[gi + 1])
+  U32         *sci;         // sorted position -> index into cands
+  LNK_ICFCand *cands;       // candidate array (read-only here)
+  U8          *rt_iscand;   // per flattened reloc: nonzero when rt_target is a candidate index
+  U64         *rt_target;   // per flattened reloc: candidate index (compared via colors[]) or a raw value compared as-is
+  U32         *colors;      // candidate index -> class id (read-only here)
+  U32         *leader_sci;  // out: per sorted position -> leader's sci index, or max_U32
+  U32         *group_leader_oi; // out: per group -> elected leader sorted-position (for apply)
+} LNK_ICFFoldVerifyTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_icf_fold_verify_task)
+{
+  LNK_ICFFoldVerifyTask *fv = raw_task;
+  for EachInRange(group, fv->ranges[task_id]) {
+    U64 lo = fv->group_first[group];
+    U64 hi = fv->group_first[group + 1];
+    fv->group_leader_oi[group] = (U32)lo;
+    if (hi - lo < 2) { continue; } // fewer than two members: nothing to fold
+
+    // leader election: a non-static candidate beats a static one; ties go to the smaller (input_idx, sn)
+    U64          lead_pos = lo;
+    LNK_ICFCand *lead     = &fv->cands[fv->sci[lo]];
+    for (U64 pos = lo + 1; pos < hi; pos += 1) {
+      LNK_ICFCand *cand = &fv->cands[fv->sci[pos]];
+      B32 wins;
+      if (cand->is_static != lead->is_static) { wins = (cand->is_static < lead->is_static); }
+      else { wins = (Compose64Bit(cand->obj->input_idx, cand->sn) < Compose64Bit(lead->obj->input_idx, lead->sn)); }
+      if (wins) { lead_pos = pos; lead = cand; }
+    }
+    fv->group_leader_oi[group] = (U32)lead_pos;
+
+    // the leader's section bytes, sliced once per group
+    COFF_SectionHeader *lead_header = lnk_coff_section_header_from_section_number(lead->obj, lead->sn);
+    String8             lead_data   = str8_substr(lead->obj->data, rng_1u64(lead_header->foff, lead_header->foff + lead_header->fsize));
+
+    // verify: same size, same reloc count, same bytes, pairwise-equivalent reloc targets. NOTE(review): reloc offsets/types are not compared here -- presumably covered by the initial key; confirm.
+    for (U64 pos = lo; pos < hi; pos += 1) {
+      if (pos == lead_pos) { continue; }
+      LNK_ICFCand        *cand        = &fv->cands[fv->sci[pos]];
+      COFF_SectionHeader *cand_header = lnk_coff_section_header_from_section_number(cand->obj, cand->sn);
+      if (cand_header->fsize != lead_header->fsize || cand->reloc_count != lead->reloc_count) { continue; }
+      String8 cand_data = str8_substr(cand->obj->data, rng_1u64(cand_header->foff, cand_header->foff + cand_header->fsize));
+      if (!str8_match(lead_data, cand_data, 0)) { continue; }
+      U64 t = 0; // stops at the first reloc whose targets differ; reaching reloc_count means all matched
+      for (; t < lead->reloc_count; t += 1) {
+        U64 li = lead->reloc_first + t, fi = cand->reloc_first + t;
+        U64 lt = fv->rt_iscand[li] ? fv->colors[fv->rt_target[li]] : fv->rt_target[li];
+        U64 ft = fv->rt_iscand[fi] ? fv->colors[fv->rt_target[fi]] : fv->rt_target[fi];
+        if (lt != ft) { break; }
+      }
+      if (t == lead->reloc_count) { fv->leader_sci[pos] = fv->sci[lead_pos]; } // verified fold
+    }
+  }
+}
+
+// ============================================================================================
+// ICF DIRTY-CLASS WORKLIST (TAIL ACCELERATOR)
+//
+// The persistent refine region re-sorts the ENTIRE active set (~10.65M candidates) every round until a
+// round splits nothing. Measured churn (live UE link): the partition is ~99.99% stable by round ~7; in
+// the last ~11 rounds only tens-to-hundreds of candidates land in a class that actually SPLITS, yet each
+// round still pays the full ~10.65M-key sort. The work is fixed by the sort size, NOT by the (tiny)
+// amount of real refinement left. That tail is ~3-4s of pure waste.
+//
+// A candidate's refine key can only change if its OWN color changed (its class split) OR one of its
+// reloc TARGETS changed color (a target's class split). So once a class is stable AND none of the
+// classes it points at split, it can NEVER split again -- re-keying/re-sorting it is provably a no-op.
+//
+// The worklist tracks exactly the classes that can still split (DIRTY classes). Each round it re-keys
+// only members of dirty classes, against a FROZEN colors[] snapshot (Jacobi: all keys this round read
+// pre-round colors; new colors commit only at the round barrier -- reading mid-round colors would
+// over-refine past the unique coarsest stable partition and may never terminate). When a class splits,
+// every candidate that REFERENCES a member of the split class is enqueued next round (reverse
+// target->referrers index). Dirty set empty => fixpoint.
+//
+// This reaches the IDENTICAL coarsest stable partition as the full region (same refinement operator; we
+// only skip recomputation that is provably a no-op). The absolute color-id VALUES differ (ids from a
+// separate running counter), but EVERY downstream consumer -- fold grouping (sorts by color, groups =
+// equal-color runs), leader election (by candidate identity, not id), fold-verify (tests
+// colors[a]==colors[b]) -- depends ONLY on the equivalence relation, never on id values or class
+// ordering. So an id-renamed but partition-identical colors[] is byte-identical output. (Asserted every
+// round by ICF_WORKLIST_SELFCHECK against the full region partition over the whole UE link.)
+// ============================================================================================
+
+// lnk_icf_build_reverse_index (defined below, after lnk_icf_small_sort) builds the CSR reverse adjacency: for target
+// candidate T, the referrers (candidates C with a reloc C->T) are rev_adj[rev_off[T] .. rev_off[T+1]). Deterministic (cands
+// iterated in order -> referrers sorted by referrer index). Self-edges (C references its own class) are kept; harmless (re-keys C's own class).
+// lnk_icf_small_sort: serial, in-place Shell sort (gaps n/2, n/4, .., 1) of the pairs (keys[i], vals[i])
+// into ascending (key, then val) order. It uses no thread pool and no allocation, so the worklist can
+// call it once per dirty class; the tie-break on val gives a deterministic order. Intended for small
+// classes (a few..hundreds of members); n is bounded by the largest class passed in.
+internal void
+lnk_icf_small_sort(U64 *keys, U32 *vals, U64 n)
+{
+  U64 gap = n / 2;
+  while (gap != 0) {
+    for (U64 at = gap; at < n; at += 1) {
+      U64 cur_key = keys[at]; U32 cur_val = vals[at]; U64 hole = at;
+      for (; hole >= gap; hole -= gap) {
+        U64 prev_key = keys[hole - gap]; U32 prev_val = vals[hole - gap];
+        B32 in_order = (prev_key < cur_key) || (prev_key == cur_key && prev_val <= cur_val);
+        if (in_order) { break; }
+        keys[hole] = prev_key; vals[hole] = prev_val;
+      }
+      keys[hole] = cur_key; vals[hole] = cur_val;
+    }
+    gap /= 2;
+  }
+}
+
+// Build the reverse (target -> referrers) index in CSR form: the referrers of candidate T are
+// rev_adj[rev_off[T] .. rev_off[T+1]), in ascending referrer index. Only relocs flagged in rt_iscand count.
+internal void
+lnk_icf_build_reverse_index(Arena *arena, U64 cand_count, LNK_ICFCand *cands, U8 *rt_iscand, U64 *rt_target,
+                            U32 **out_rev_off, U32 **out_rev_adj)
+{
+  // pass 1: per-target referrer counts, stored one slot to the right, then summed into start offsets
+  U32 *offsets = push_array(arena, U32, cand_count + 1);
+  for EachIndex(src, cand_count) {
+    U64 reloc_lo = (U64)cands[src].reloc_first, reloc_hi = reloc_lo + cands[src].reloc_count;
+    for (U64 r = reloc_lo; r < reloc_hi; r += 1) {
+      if (rt_iscand[r]) { offsets[(U32)rt_target[r] + 1] += 1; }
+    }
+  }
+  for EachIndex(tgt, cand_count) { offsets[tgt + 1] += offsets[tgt]; }
+  U64 edge_n = offsets[cand_count];
+  U32 *edges = push_array_no_zero(arena, U32, edge_n ? edge_n : 1);
+
+  // pass 2: drop each referrer into its target's run; filled[T] = entries already written for T
+  Temp tmp = temp_begin(arena);
+  U32 *filled = push_array(tmp.arena, U32, cand_count ? cand_count : 1);
+  for EachIndex(src, cand_count) {
+    U64 reloc_lo = (U64)cands[src].reloc_first, reloc_hi = reloc_lo + cands[src].reloc_count;
+    for (U64 r = reloc_lo; r < reloc_hi; r += 1) {
+      if (!rt_iscand[r]) { continue; }
+      U32 tgt = (U32)rt_target[r];
+      edges[offsets[tgt] + filled[tgt]] = (U32)src; filled[tgt] += 1;
+    }
+  }
+  temp_end(tmp);
+  *out_rev_off = offsets; *out_rev_adj = edges;
+}
+
+// Dirty-class worklist tail. Precondition: colors[] holds a stable-so-far partition; `active`/
+// `active_count` is the current active set (members of classes of size>=2); color_base = next free id.
+// Runs the worklist to fixpoint, updating colors[] in place. Returns the round count via *out_rounds and
+// 1 on success / 0 if the hang-guard tripped (caller must NOT ship -- it is a bug). On 0, colors[] already holds the commits of the rounds that completed.
+//
+// Data model: the work unit is a CLASS (a color). Per-color member lists live in one flat slab `mem[]`;
+// a class's members are mem[slot_first[s] .. slot_first[s]+slot_count[s]). `slot_of_color` maps a live
+// color -> slot+1. A split replaces the parent slot's mem range, in sorted (newkey, cand-idx) order, with
+// its sub-runs: the first sub-run KEEPS the parent slot & color; later sub-runs get fresh slots & fresh
+// color ids. Singletons are dropped (final). Determinism: members in ascending cand index within a class;
+// sub-runs sub-sorted by cand index; new ids from a deterministic running counter.
+internal B32
+lnk_icf_worklist_refine(TP_Context *tp, Arena *arena, U64 cand_count, LNK_ICFCand *cands,
+                        U32 *colors, U8 *rt_iscand, U64 *rt_target,
+                        U32 *rev_off, U32 *rev_adj,
+                        U32 *active, U64 active_count, U64 color_base, U64 *out_rounds)
+{
+  (void)tp; (void)cand_count; // NOTE(review): tp IS used below (initial radix sort); only cand_count is unused
+  Temp t = temp_begin(arena);
+
+  // capacities: members <= active_count; every split adds >=1 new slot but total live members never
+  // exceeds active_count, and a class of size m can split into at most m sub-classes, so total slots
+  // ever created <= active_count. Size all slot-indexed arrays to active_count (+slack).
+  U64 cap = active_count + 16;
+  U32 *mem        = push_array_no_zero(t.arena, U32, cap);
+  U32 *slot_first = push_array_no_zero(t.arena, U32, cap);
+  U32 *slot_count = push_array_no_zero(t.arena, U32, cap);
+  U32 *slot_color = push_array_no_zero(t.arena, U32, cap);
+  U64  slot_n = 0, mem_n = 0;
+  LNK_ICFMap slot_of_color = lnk_icf_map_make(t.arena, cap); // color -> slot+1; lookups below pass 0 as the "absent" default
+
+  // initial slots: group the active set by current color (one sort). Active members are already only in
+  // classes of size>=2, so every produced slot has count>=2.
+  {
+    U64 *sk = push_array_no_zero(t.arena, U64, active_count ? active_count : 1);
+    U32 *sv = push_array_no_zero(t.arena, U32, active_count ? active_count : 1);
+    for EachIndex(i, active_count) { sk[i] = (U64)colors[active[i]]; sv[i] = active[i]; }
+    lnk_radix_sort_u64_pairs(tp, t.arena, active_count, sk, sv); // by color only; NOTE(review): order within a color follows active[] -- ascending cand idx only if active[] is
+    for (U64 a = 0; a < active_count; ) {
+      U32 col = (U32)sk[a];
+      U64 b = a; while (b < active_count && (U32)sk[b] == col) { b += 1; }
+      U32 first = (U32)mem_n;
+      for (U64 q = a; q < b; q += 1) { mem[mem_n++] = sv[q]; }
+      slot_first[slot_n] = first; slot_count[slot_n] = (U32)(b - a); slot_color[slot_n] = col;
+      lnk_icf_map_put(&slot_of_color, col, slot_n + 1);
+      slot_n += 1;
+      a = b;
+    }
+  }
+
+  // dirty worklist = slots to re-densify this round. Seed with all slots.
+  U32 *dirty      = push_array_no_zero(t.arena, U32, cap);
+  U32 *next_dirty = push_array_no_zero(t.arena, U32, cap);
+  U8  *in_dirty   = push_array(t.arena, U8, cap); // dedup membership for next_dirty (cleared after swap)
+  U64 dirty_n = 0;
+  for EachIndex(s, slot_n) { dirty[dirty_n++] = (U32)s; }
+
+  // staged this round (committed only at the round barrier so re-keying reads FROZEN colors[]):
+  //   - new color assignments (ci -> color); applied to colors[] after the round
+  //   - newly created slots (from 2nd+ sub-runs of splits)
+  // The parent slot's mem range is rewritten in place immediately (disjoint from other slots' ranges and
+  // independent of colors[]), so member order is settled during the dirty loop; only colors[]/slot
+  // registration is deferred.
+  U32 *stage_ci    = push_array_no_zero(t.arena, U32, active_count ? active_count : 1);
+  U32 *stage_col   = push_array_no_zero(t.arena, U32, active_count ? active_count : 1);
+  U32 *split_mem   = push_array_no_zero(t.arena, U32, active_count ? active_count : 1); // members of split classes
+  U32 *ns_slot     = push_array_no_zero(t.arena, U32, cap); // staged new slot indices
+  U32 *ns_first    = push_array_no_zero(t.arena, U32, cap);
+  U32 *ns_count    = push_array_no_zero(t.arena, U32, cap);
+  U32 *ns_color    = push_array_no_zero(t.arena, U32, cap);
+
+  // hang-guard: hard round cap AND non-progress detection. The worklist tail converges in <~20 cheap
+  // rounds; if dirty_n fails to strictly shrink for too many consecutive rounds, it is a re-dirty bug.
+  enum { LNK_ICF_WL_MAX_ROUNDS = 40, LNK_ICF_WL_STALL_LIMIT = 8 };
+  U64 rounds = 0, stall = 0, prev_dirty_n = max_U64; // max_U64 so the first round can never count as a stall
+  B32 ok = 1;
+
+  while (dirty_n > 0) {
+    if (rounds >= LNK_ICF_WL_MAX_ROUNDS) { ok = 0; break; }
+    if (dirty_n >= prev_dirty_n) { stall += 1; if (stall >= LNK_ICF_WL_STALL_LIMIT) { ok = 0; break; } }
+    else { stall = 0; }
+    prev_dirty_n = dirty_n;
+    rounds += 1;
+
+    U64 stage_n = 0, split_n = 0, ns_n = 0;
+
+    for EachIndex(di, dirty_n) {
+      U32 slot  = dirty[di];
+      U32 m     = slot_count[slot];
+      if (m < 2) { continue; } // a single member cannot split
+      U32 first = slot_first[slot];
+
+      // re-key this class's members against the FROZEN colors[] snapshot (colors[] not yet mutated)
+      Temp tk = temp_begin(t.arena);
+      U64 *kk = push_array_no_zero(tk.arena, U64, m);
+      U32 *vv = push_array_no_zero(tk.arena, U32, m);
+      for EachIndex(r, m) {
+        U32 ci = mem[first + r];
+        LNK_ICFCand *c = &cands[ci];
+        U64 k = lnk_icf_mix(0x9e3779b97f4a7c15ull, colors[ci]);
+        for EachIndex(j, c->reloc_count) {
+          U64 idx = (U64)c->reloc_first + j;
+          U64 tt  = rt_iscand[idx] ? colors[rt_target[idx]] : rt_target[idx];
+          k = lnk_icf_mix(k, tt);
+        }
+        kk[r] = k; vv[r] = ci;
+      }
+      lnk_icf_small_sort(kk, vv, m); // serial, pool-free: group members by new key (ties by cand idx)
+
+      U64 sub = 0; // number of distinct keys (= sub-runs) among this class's members
+      for EachIndex(r, m) { if (r == 0 || kk[r] != kk[r - 1]) { sub += 1; } }
+      if (sub == 1) { temp_end(tk); continue; } // class did not split
+
+      // class splits. Rewrite parent mem range in sorted vv order (settles member order now). The 1st
+      // sub-run keeps slot/color; later sub-runs are staged as new slots with fresh ids. colors[] writes
+      // are STAGED (deferred to commit) so other dirty classes this round still read frozen colors.
+      for EachIndex(r, m) { mem[first + r] = vv[r]; }
+      U64 run0 = 0; B32 first_run = 1;
+      for (U64 r = 1; r <= m; r += 1) {
+        B32 boundary = (r == m) || (kk[r] != kk[r - 1]);
+        if (!boundary) { continue; }
+        U64 rn = r - run0;
+        U32 run_first = (U32)(first + run0);
+        U32 run_color;
+        if (first_run) {
+          run_color = slot_color[slot]; first_run = 0;
+          // parent slot's count shrinks to the 1st sub-run; staged so dirty loop still sees old count.
+          ns_slot[ns_n] = slot; // reuse parent slot
+        } else {
+          run_color = (U32)color_base; color_base += 1; // NOTE(review): the U64 counter is truncated to a U32 color id here
+          ns_slot[ns_n] = max_U32;  // allocate a fresh slot at commit
+        }
+        ns_first[ns_n] = run_first; ns_count[ns_n] = (U32)rn; ns_color[ns_n] = run_color; ns_n += 1;
+        for (U64 q = run0; q < r; q += 1) {
+          U32 ci = mem[first + q];
+          stage_ci[stage_n] = ci; stage_col[stage_n] = run_color; stage_n += 1;
+          split_mem[split_n++] = ci; // every member of a split class is recorded, including the 1st sub-run that keeps its color
+        }
+        run0 = r;
+      }
+      temp_end(tk);
+    }
+
+    if (stage_n == 0) { break; } // nothing split -> fixpoint
+
+    // COMMIT (round barrier): register slots, then apply colors[]. (Order matters only that all reads in
+    // the dirty loop above already finished -- they did.)
+    for EachIndex(s, ns_n) {
+      U32 rslot = ns_slot[s];
+      if (rslot == max_U32) { rslot = (U32)slot_n; slot_n += 1; }
+      slot_first[rslot] = ns_first[s]; slot_count[rslot] = ns_count[s]; slot_color[rslot] = ns_color[s];
+      lnk_icf_map_put(&slot_of_color, (U64)ns_color[s], rslot + 1); // the parent color is re-put with its existing slot; NOTE(review): assumes map_put tolerates a key that is already present
+    }
+    for EachIndex(i, stage_n) { colors[stage_ci[i]] = stage_col[i]; }
+
+    // next dirty set: every referrer of a split-class member, mapped to its CURRENT class slot (size>=2).
+    U64 next_n = 0;
+    for EachIndex(i, split_n) {
+      U32 mem_ci = split_mem[i];
+      for (U32 e = rev_off[mem_ci]; e < rev_off[mem_ci + 1]; e += 1) {
+        U32 ref = rev_adj[e];
+        U64 sc  = lnk_icf_map_get(&slot_of_color, (U64)colors[ref], 0);
+        if (sc) {
+          U32 rslot = (U32)(sc - 1);
+          if (slot_count[rslot] >= 2 && !in_dirty[rslot]) { in_dirty[rslot] = 1; next_dirty[next_n++] = rslot; }
+        }
+      }
+    }
+    U32 *tmp = dirty; dirty = next_dirty; next_dirty = tmp;
+    dirty_n = next_n;
+    for EachIndex(i, dirty_n) { in_dirty[dirty[i]] = 0; } // clears exactly the flags set above, so in_dirty is all-zero again for the next round
+
+    if (lnk_get_log_status(LNK_Log_Debug)) {
+      lnk_log(LNK_Log_Debug, "/OPT:ICF worklist round %llu: split_members=%llu next_dirty=%llu slots=%llu",
+              rounds, split_n, dirty_n, slot_n);
+    }
+  }
+
+  temp_end(t);
+  if (out_rounds) { *out_rounds = rounds; }
+  return ok;
+}
+
+// Identical COMDAT folding (/OPT:ICF): groups candidate sections whose contents match and whose relocations point at equivalent targets (iteratively), then folds each group's followers
+// into a single leader by routing them through the existing COMDAT symlink machinery and
+// letting /OPT:REF garbage-collect the now-unreferenced follower sections (and their
+// associated .pdata/.xdata/.debug$S). Mirrors link.exe /OPT:ICF.
+internal void
+lnk_opt_icf(TP_Context *tp, Arena *perm, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs_list)
+{
+  if (config->opt_icf != LNK_SwitchState_Yes) { return; } // ICF is strictly opt-in
+
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&perm, 1);
+  Arena *arena = scratch.arena;
+
+  U64        objs_count = objs_list.count;
+  LNK_Obj  **objs       = lnk_array_from_obj_list(arena, objs_list);
+
+  // allocate the per-section static-fold map on every obj (sn-1 indexed, zeroed = not folded). It
+  // lives on the obj (not scratch) because /OPT:REF and the final section map read it after ICF.
+  for EachIndex(oi, objs_count) {
+    objs[oi]->icf_fold = push_array(perm, LNK_ICFFold, objs[oi]->header.section_count_no_null ? objs[oi]->header.section_count_no_null : 1);
+  }
+
+  // collect candidate sections (parallel per obj): count, exact-size, then fill at per-obj
+  // offsets. Counting first avoids sizing the array to the total section count (which would be
+  // ~10x larger and waste ~1GB on UE-scale inputs).
+  LNK_ICFCollectTask col = {0};
+  col.include_static = (config->opt_icf_static == LNK_SwitchState_Yes);
+  col.objs    = objs;
+  col.counts  = push_array(arena, U64, objs_count ? objs_count : 1);
+  tp_for_parallel(tp, 0, objs_count, lnk_icf_count_task, &col);
+  col.offsets = offsets_from_counts_array_u64(arena, col.counts, objs_count);
+  U64          cand_count = sum_array_u64(objs_count, col.counts);
+  LNK_ICFCand *cands      = push_array_no_zero(arena, LNK_ICFCand, cand_count ? cand_count : 1);
+  col.cands   = cands;
+  tp_for_parallel(tp, 0, objs_count, lnk_icf_fill_task, &col);
+
+  if (cand_count < 2) { goto done; } // fewer than two candidates: nothing can fold
+
+  // build target lookup (input_idx, section_number) -> cand_idx+1, sized to the candidate count
+  LNK_ICFMap cand_map = lnk_icf_map_make(arena, cand_count);
+  {
+    LNK_ICFCandMapPutTask put_task = {0};
+    put_task.ranges   = tp_divide_work(arena, cand_count, tp->worker_count);
+    put_task.cands    = cands;
+    put_task.cand_map = &cand_map;
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_candmap_put_task, &put_task);
+  }
+
+  // assign each candidate a disjoint slice in the flattened reloc-target arrays. reloc_count was filled
+  // by lnk_icf_fill_task, so this is a serial prefix sum; the checked cast traps a >U32 start offset.
+  U64 total_relocs = 0;
+  for EachIndex(ci, cand_count) {
+    cands[ci].reloc_first = safe_cast_u32(total_relocs);
+    total_relocs += cands[ci].reloc_count;
+  }
+  U8  *rt_iscand = push_array_no_zero(arena, U8,  total_relocs ? total_relocs : 1);
+  U64 *rt_target = push_array_no_zero(arena, U64, total_relocs ? total_relocs : 1);
+
+  {
+    LNK_ICFHashTask hash_task = {0};
+    hash_task.ranges    = tp_divide_work(arena, cand_count, tp->worker_count);
+    hash_task.cands     = cands;
+    hash_task.cand_map  = &cand_map;
+    hash_task.symtab    = symtab;
+    hash_task.rt_iscand = rt_iscand;
+    hash_task.rt_target = rt_target;
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_hash_task, &hash_task);
+  }
+
+  // dense per-candidate equivalence-class colors (separate 8B array; hot refine/scan loops gather
+  // these by index millions of times -- keeping them out of the 40B LNK_ICFCand keeps the gathers
+  // cache-dense). zero-init: round 0 assigns real colors.
+  U32 *colors = push_array(arena, U32, cand_count ? cand_count : 1);
+
+  // assign initial colors from content key (parallel sort + group scan)
+  U64 *newkey = push_array_no_zero(arena, U64, cand_count ? cand_count : 1);
+  for EachIndex(ci, cand_count) { newkey[ci] = cands[ci].key0; }
+  U64 class_count = lnk_icf_dense_colors(tp, arena, cand_count, newkey, colors);
+  U64 color_base  = class_count; // next free class id; ids only ever grow, so finalized colors stick
+
+  // Active set = candidates still sharing a class with another. A singleton class can never split
+  // or merge, so once a candidate is alone its color is final and it leaves refinement. Each round
+  // re-densifies only the active set, shrinking the per-round sort from all ~N candidates down to
+  // just those that still have a content+reloc twin.
+  // ping-pong active buffers: refine reads `active`, densify writes compacted survivors into `active2`.
+  // They must be distinct (densify reads active[sv[m]] while writing next_active[]). Swap each round.
+  U32 *active  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+  U32 *active2 = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+  U64  active_count = 0, active_class_count = 0;
+  {
+    Temp t = temp_begin(arena);
+    U32 *cls_size = push_array(t.arena, U32, class_count ? class_count : 1);
+    for EachIndex(ci, cand_count) { cls_size[colors[ci]] += 1; }
+    for EachIndex(ci, cand_count) { if (cls_size[colors[ci]] > 1) { active[active_count++] = (U32)ci; } }
+    for EachIndex(c, class_count)  { if (cls_size[c] > 1) { active_class_count += 1; } }
+    temp_end(t);
+  }
+
+  // iteratively refine classes by relocation-target classes until no class splits.
+  // PERSISTENT-WORKER REGION: collapse the ~18 rounds x ~10 phases of fork-join into ONE parallel
+  // region (lnk_icf_refine_region_task) where every phase boundary is a barrier and the serial glue
+  // (radix prefixes, group-scan prefix, convergence/swap) runs on worker 0. Byte-identical to the
+  // per-round path (preserves chunk boundaries, processing order, and serial-prefix math).
+  if (active_count > 0) {
+    LNK_ICFRegion rs = {0};
+    rs.tp           = tp;
+    rs.cands        = cands;
+    rs.rt_iscand    = rt_iscand;
+    rs.rt_target    = rt_target;
+    rs.cand_count   = cand_count;
+    rs.worker_count = tp->worker_count;
+    rs.active       = active;
+    rs.active2      = active2;
+    rs.colors       = colors;
+    rs.newkey       = newkey;
+
+    // per-round scratch, preallocated once to the max possible size (cand_count); reused every round.
+    U64 W = tp->worker_count;
+    rs.sk        = push_array_no_zero(arena, U64, cand_count ? cand_count : 1);
+    rs.sv        = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+    rs.color_at  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+    rs.keep      = push_array_no_zero(arena, U8,  cand_count ? cand_count : 1);
+    rs.out_slot  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+    rs.kbuf      = push_array_no_zero(arena, U64, cand_count ? cand_count : 1);
+    rs.vbuf      = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+    rs.hist      = push_array_no_zero(arena, U32, W * LNK_RADIX_SIZE);
+    rs.chunk_nc  = push_array_no_zero(arena, U64, W ? W : 1);
+    rs.chunk_kp  = push_array_no_zero(arena, U64, W ? W : 1);
+    rs.chunk_sc  = push_array_no_zero(arena, U64, W ? W : 1);
+    rs.chunk_max = push_array_no_zero(arena, U64, (W > 3 ? W : 3)); // max(W, 3) is never zero
+    rs.refine_ranges = push_array_no_zero(arena, Rng1U64, W + 1);
+    rs.work_ranges   = push_array_no_zero(arena, Rng1U64, W + 1);
+
+    rs.active_count       = active_count;
+    rs.active_class_count = active_class_count;
+    rs.color_base         = color_base;
+    rs.converged          = 0;
+
+    // TAIL ACCELERATOR: run the persistent region for a bounded number of warm-up rounds (churn is large
+    // here, the full parallel sort wins), then hand the still-active set to the dirty-class worklist for
+    // the long near-converged tail (rounds 8..18 re-sort ~10.65M to resolve a few hundred splits). Both
+    // compute the unique coarsest stable partition, so the final colors[] partition is identical; the
+    // worklist just reaches it in tail work ~= churn instead of ~= active_count.
+    // Worklist tail DISABLED by default: its CSR reverse-index (rev_adj = U32 x candidate-edge-count)
+    // costs ~10GB peak on the UE link, which hurts the page-fault-bound critical path more than the
+    // ~3-4s of tail re-sorts it saves. region_cap=64 lets the persistent region run to the true fixpoint
+    // (~19 rounds), so `converged` is set and the worklist handoff below is skipped (no reverse-index).
+    // Lower this back to 8 to re-enable the worklist on memory-rich machines.
+    U64 region_cap = 64; // was 8 (worklist warm-up)
+
+#if defined(ICF_WORKLIST_SELFCHECK)
+    // REFERENCE: clone pre-region state and run the region UNCAPPED to the true fixpoint into shadow
+    // colors[]; later assert the cap+worklist partition is identical every link. Reuses rs scratch
+    // buffers (sequential, not concurrent). active2/colors are restored to pre-region values first.
+    U32 *ref_colors  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+    MemoryCopy(ref_colors, colors, sizeof(U32) * cand_count);
+    {
+      U32 *ref_active  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+      U32 *ref_active2 = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+      MemoryCopy(ref_active, active, sizeof(U32) * active_count);
+      LNK_ICFRegion ref = rs;            // copy all scratch pointers + sizes
+      ref.active  = ref_active;
+      ref.active2 = ref_active2;
+      ref.colors  = ref_colors;
+      ref.active_count = active_count; ref.active_class_count = active_class_count;
+      ref.color_base = color_base; ref.converged = 0;
+      ref.max_rounds = 64;             // uncapped (true fixpoint)
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_refine_region_task, &ref);
+    }
+    // colors[] was untouched (ref ran on ref_colors); active/active2 untouched (ref used clones).
+#endif
+
+    rs.max_rounds = region_cap;
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_refine_region_task, &rs);
+
+    // mirror final state back (active/colors already mutated in place; pointers may have swapped)
+    active             = rs.active;
+    active2            = rs.active2;
+    color_base         = rs.color_base;
+    active_count       = rs.active_count;
+    active_class_count = rs.active_class_count;
+
+    // hand off the tail to the dirty-class worklist if the region stopped at the cap (not yet converged)
+    if (!rs.converged && active_count > 0) {
+      U32 *rev_off = 0, *rev_adj = 0;
+      lnk_icf_build_reverse_index(arena, cand_count, cands, rt_iscand, rt_target, &rev_off, &rev_adj);
+      U64 wrounds = 0;
+      B32 wl_ok = lnk_icf_worklist_refine(tp, arena, cand_count, cands, colors, rt_iscand, rt_target,
+                                          rev_off, rev_adj, active, active_count, color_base, &wrounds);
+      AssertAlways(wl_ok); // hang-guard tripped (round cap / non-progress) -> BUG, do not ship
+      if (lnk_get_log_status(LNK_Log_Debug)) {
+        lnk_log(LNK_Log_Debug, "/OPT:ICF worklist tail: %llu rounds (active=%llu at handoff)", wrounds, active_count);
+      }
+    }
+
+#if defined(ICF_WORKLIST_SELFCHECK)
+    // ASSERT: the cap+worklist colors[] induce the SAME partition as the uncapped region (ref_colors).
+    // Partition equality = the map colors[ci] -> ref_colors[ci] is a consistent bijection (both ways).
+    {
+      Temp tc = temp_begin(arena);
+      LNK_ICFMap fwd = lnk_icf_map_make(tc.arena, cand_count); // colors -> ref_colors
+      LNK_ICFMap bwd = lnk_icf_map_make(tc.arena, cand_count); // ref_colors -> colors
+      U64 mism = 0;
+      for EachIndex(ci, cand_count) {
+        U64 a = (U64)colors[ci], b = (U64)ref_colors[ci];
+        U64 fa = lnk_icf_map_get(&fwd, a, max_U64);
+        if (fa == max_U64) { lnk_icf_map_put(&fwd, a, b); } else if (fa != b) { mism += 1; }
+        U64 fb = lnk_icf_map_get(&bwd, b, max_U64);
+        if (fb == max_U64) { lnk_icf_map_put(&bwd, b, a); } else if (fb != a) { mism += 1; }
+      }
+      if (mism) {
+        lnk_log(LNK_Log_Debug, "ICF_WORKLIST_SELFCHECK PARTITION MISMATCH: %llu cands disagree", mism);
+      }
+      AssertAlways(mism == 0); // worklist partition != full-region partition -> WRONG OUTPUT, do not ship
+      temp_end(tc);
+    }
+#endif
+  }
+  (void)active2; (void)color_base; (void)active_count; (void)active_class_count;
+
+  // group candidates by final color (parallel sort) and fold followers into a leader
+  U64 *keys = newkey; // reuse
+  U32 *sci  = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+  for EachIndex(ci, cand_count) { keys[ci] = colors[ci]; sci[ci] = (U32)ci; }
+  lnk_radix_sort_u64_pairs(tp, arena, cand_count, keys, sci);
+
+  // serial pass: find group boundaries (cheap O(cand_count) scan over the sorted color keys). Each
+  // group [group_first[g], group_first[g+1]) shares one color. Verification + folding runs in
+  // parallel across groups (groups touch disjoint sections/symlink nodes -> race-free, deterministic).
+  U32 *group_first = push_array_no_zero(arena, U32, cand_count + 1);
+  U64  group_count = 0;
+  for (U64 i = 0; i < cand_count; ) {
+    group_first[group_count++] = (U32)i;
+    U64 color = keys[i];
+    U64 j = i + 1;
+    while (j < cand_count && keys[j] == color) { j += 1; }
+    i = j;
+  }
+  group_first[group_count] = (U32)cand_count;
+
+  // PARALLEL verify (read-only byte+reloc compare across groups) -> SERIAL apply (deterministic,
+  // group-ordered symlink/icf_fold writes; cross-class symlink chains must not race, so writes stay
+  // serial). leader_sci[k] = leader's sci index if sorted position k folds, else max_U32.
+  U32 *leader_sci      = push_array_no_zero(arena, U32, cand_count ? cand_count : 1);
+  for EachIndex(k, cand_count) { leader_sci[k] = max_U32; }
+  U32 *group_leader_oi = push_array_no_zero(arena, U32, group_count ? group_count : 1);
+  if (group_count) {
+    LNK_ICFFoldVerifyTask vt = {0};
+    vt.ranges          = tp_divide_work(arena, group_count, tp->worker_count);
+    vt.group_first     = group_first;
+    vt.sci             = sci;
+    vt.cands           = cands;
+    vt.rt_iscand       = rt_iscand;
+    vt.rt_target       = rt_target;
+    vt.colors          = colors;
+    vt.leader_sci      = leader_sci;
+    vt.group_leader_oi = group_leader_oi;
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_icf_fold_verify_task, &vt);
+  }
+
+  U64 fold_count = 0;
+  for EachIndex(gi, group_count) {
+    U64 i = group_first[gi];
+    U64 j = group_first[gi + 1];
+    if (j - i < 2) { continue; } // singleton group: no follower to fold
+    U64                 leader_oi = group_leader_oi[gi];
+    LNK_ICFCand        *L         = &cands[sci[leader_oi]];
+    LNK_SymbolHashTrie *Lnode     = L->obj->symlinks[L->sn];
+    for (U64 k = i; k < j; k += 1) {
+      if (leader_sci[k] == max_U32) { continue; } // not a verified follower
+      LNK_ICFCand        *F     = &cands[sci[k]];
+      LNK_SymbolHashTrie *Fnode = F->obj->symlinks[F->sn];
+      if (Fnode) {
+        if (Lnode && Fnode != Lnode) { Fnode->symbol = Lnode->symbol; fold_count += 1; }
+      } else {
+        F->obj->icf_fold[F->sn - 1].leader_obj_idx = L->obj->input_idx;
+        F->obj->icf_fold[F->sn - 1].leader_sn      = L->sn;
+        F->obj->icf_fold[F->sn - 1].set            = 1;
+        fold_count += 1;
+      }
+    }
+  }
+
+  if (lnk_get_log_status(LNK_Log_Debug)) { // report the final (post-refinement) class count, not the round-0 class_count
+    lnk_log(LNK_Log_Debug, "/OPT:ICF folded %llu of %llu code COMDATs into %llu classes", fold_count, cand_count, group_count);
+  }
+
+  done:;
+  scratch_end(scratch);
+  ProfEnd();
+}
+
 internal void
 lnk_opt_ref(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs)
 {
@@ -2926,6 +4650,28 @@ THREAD_POOL_TASK_FUNC(lnk_set_comdat_leaders_contribs_task)
   ProfEnd();
 }
 
+// /OPT:ICF: a folded static-COMDAT follower has no external symbol that could be redirected, so its
+// section-map slot is aliased to the leader's contrib instead. Any residual reloc against a static
+// symbol in the (dead-stripped) follower then resolves through sect_map[obj][sn-1] -> leader contrib.
+internal
+THREAD_POOL_TASK_FUNC(lnk_set_icf_static_leader_contribs_task)
+{
+  LNK_BuildImageTask *task = raw_task;
+  LNK_Obj            *obj  = task->objs[task_id];
+
+  if (obj->icf_fold != 0) {
+    ProfBeginV("Set ICF Static Leader Contribs [%S]", obj->path);
+    for EachIndex(sect_idx, obj->header.section_count_no_null) {
+      LNK_ICFFold *fold = &obj->icf_fold[sect_idx];
+      if (fold->set) {
+        // leader_sn is stored as a section number, so the section-map column is leader_sn - 1
+        task->sect_map[task_id][sect_idx] = task->sect_map[fold->leader_obj_idx][fold->leader_sn - 1];
+      }
+    }
+    ProfEnd();
+  }
+}
+
 internal
 THREAD_POOL_TASK_FUNC(lnk_flag_debug_symbols_task)
 {
@@ -2981,15 +4727,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_comdat_leaders_task)
             task->u.patch_symtabs.was_symbol_patched[obj_idx][symbol_idx] = 1;
           }
 
-          if (obj->header.is_big_obj) {
-            COFF_Symbol32 *symbol32  = symbol.raw_symbol;
-            symbol32->section_number = section_number;
-            symbol32->value          = value;
-          } else {
-            COFF_Symbol16 *symbol16  = symbol.raw_symbol;
-            symbol16->section_number = (U16)section_number;
-            symbol16->value          = value;
-          }
+          obj->parsed_symbols[symbol_idx].section_number = section_number;
+          obj->parsed_symbols[symbol_idx].value          = value;
         }
       }
     }
@@ -3010,11 +4749,33 @@ lnk_section_contrib_ptr_is_before(void *raw_a, void *raw_b)
   return u64_compar_is_before(&input_idx_a, &input_idx_b);
 }
 
+// chunks at/above this size are sorted by the parallel radix (all threads) before the per-chunk
+// task pass, so one giant section (e.g. merged .text) can't serialize the whole sort on one thread.
+#define LNK_SORT_CONTRIBS_RADIX_MIN (64u*1024u)
+
+// Reorders one chunk's contribs by the 64-bit key Compose64Bit(obj_idx, obj_sect_idx) via the parallel
+// radix sort. Keys are expected to be unique (one obj/section pair per contrib), matching radsort's order.
+internal void
+lnk_sort_contribs_chunk_radix(TP_Context *tp, Arena *arena, LNK_SectionContribChunk *chunk)
+{
+  Temp  temp  = temp_begin(arena);
+  U64   count = chunk->count;
+  U64  *keys  = push_array_no_zero(temp.arena, U64, count);
+  U32  *order = push_array_no_zero(temp.arena, U32, count);
+  for EachIndex(i, count) { LNK_SectionContrib *sc = chunk->v[i]; keys[i] = Compose64Bit(sc->u.obj_idx, sc->u.obj_sect_idx); order[i] = (U32)i; }
+  lnk_radix_sort_u64_pairs(tp, temp.arena, count, keys, order);
+  LNK_SectionContrib **permuted = push_array_no_zero(temp.arena, LNK_SectionContrib *, count);
+  for EachIndex(i, count) { permuted[i] = chunk->v[order[i]]; }
+  MemoryCopy(chunk->v, permuted, count * sizeof(permuted[0]));
+  temp_end(temp);
+}
+
 internal
 THREAD_POOL_TASK_FUNC(lnk_sort_contribs_task)
 {
   LNK_BuildImageTask *task = raw_task;
   LNK_SectionContribChunk *chunk = task->u.sort_contribs.chunks[task_id];
+  if (chunk->count >= LNK_SORT_CONTRIBS_RADIX_MIN) { return; } // big chunks done via parallel radix
   ProfBeginV("[%llu]", chunk->count);
   radsort(chunk->v, chunk->count, lnk_section_contrib_ptr_is_before);
   ProfEnd();
@@ -3053,15 +4814,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_common_block_leaders_task)
     COFF_ParsedSymbol       parsed_symbol  = lnk_parsed_from_symbol(symbol);
     U64                     section_number = task->u.patch_symtabs.common_block_sect->sect_idx + 1;
 
-    if (symbol_ref.obj->header.is_big_obj) {
-      COFF_Symbol32 *symbol32 = parsed_symbol.raw_symbol;
-      symbol32->value          = contrib->u.offset;
-      symbol32->section_number = safe_cast_u32(section_number);
-    } else {
-      COFF_Symbol16 *symbol16 = parsed_symbol.raw_symbol;
-      symbol16->value          = contrib->u.offset;
-      symbol16->section_number = safe_cast_u16(section_number);
-    }
+    symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].value          = contrib->u.offset;
+    symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].section_number = safe_cast_u32(section_number);
 
     task->u.patch_symtabs.was_symbol_patched[symbol_ref.obj->input_idx][symbol_ref.symbol_idx] = 1;
   }
@@ -3086,17 +4840,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_common_block_symbols_task)
       COFF_ParsedSymbol defn_parsed = lnk_parsed_from_symbol(defn);
       Assert(lnk_interp_from_symbol(defn) == COFF_SymbolValueInterp_Regular);
       if (defn) {
-        if (obj->header.is_big_obj) {
-          COFF_Symbol32 *symbol32  = symbol.raw_symbol;
-          symbol32->section_number = defn_parsed.section_number;
-          symbol32->value          = safe_cast_u32(defn_parsed.value);
-          symbol32->storage_class  = COFF_SymStorageClass_Static;
-        } else {
-          COFF_Symbol16 *symbol16  = symbol.raw_symbol;
-          symbol16->section_number = safe_cast_u16(defn_parsed.section_number);
-          symbol16->value          = safe_cast_u32(defn_parsed.value);
-          symbol16->storage_class  = COFF_SymStorageClass_Static;
-        }
+        obj->parsed_symbols[symbol_idx].section_number = defn_parsed.section_number;
+        obj->parsed_symbols[symbol_idx].value          = defn_parsed.value;
+        obj->parsed_symbols[symbol_idx].storage_class  = COFF_SymStorageClass_Static;
       }
     }
   }
@@ -3134,15 +4880,8 @@ THREAD_POOL_TASK_FUNC(lnk_patch_regular_symbols_task)
         value          = sc->u.off + symbol.value;
       }
 
-      if (obj->header.is_big_obj) {
-        COFF_Symbol32 *symbol32  = symbol.raw_symbol;
-        symbol32->section_number = section_number;
-        symbol32->value          = value;
-      } else {
-        COFF_Symbol16 *symbol16  = symbol.raw_symbol;
-        symbol16->section_number = safe_cast_u16(section_number);
-        symbol16->value          = value;
-      }
+      obj->parsed_symbols[symbol_idx].section_number = section_number;
+      obj->parsed_symbols[symbol_idx].value          = value;
     }
   }
   ProfEnd();
@@ -3179,17 +4918,9 @@ lnk_patch_obj_symtab(LNK_SymbolTable *symtab, LNK_Obj *obj, B8 *was_symbol_patch
         value          = fixup_src.value;
       }
 
-      if (obj->header.is_big_obj) {
-        COFF_Symbol32 *symbol32  = fixup_dst.raw_symbol;
-        symbol32->section_number = section_number;
-        symbol32->value          = value;
-        symbol32->storage_class  = COFF_SymStorageClass_Static;
-      } else {
-        COFF_Symbol16 *symbol16  = fixup_dst.raw_symbol;
-        symbol16->section_number = (U16)section_number;
-        symbol16->value          = value;
-        symbol16->storage_class  = COFF_SymStorageClass_Static;
-      }
+      obj->parsed_symbols[symbol_idx].section_number = section_number;
+      obj->parsed_symbols[symbol_idx].value          = value;
+      obj->parsed_symbols[symbol_idx].storage_class  = COFF_SymStorageClass_Static;
 
       was_symbol_patched[symbol_idx] = 1;
     }
@@ -3235,6 +4966,14 @@ THREAD_POOL_TASK_FUNC(lnk_image_fill_task)
   for EachNode(n, LNK_ImageFillNode, task->u.image_fill.fill_nodes[task_id]) {
     for EachIndex(i, n->sc_count) {
       LNK_SectionContrib *sc = n->sc[i];
+      // fast-path: the vast majority of contribs are a single data-node -> one direct copy, skipping
+      // the list-walk + cursor bookkeeping on the hot 739MB image-write loop.
+      if (sc->first_data_node.next == 0) {
+        U64 image_off = sc->u.off + n->base_foff;
+        Assert(image_off + sc->first_data_node.string.size <= image_data.size);
+        MemoryCopyStr8(image_data.str + image_off, sc->first_data_node.string);
+        continue;
+      }
       U64 cursor = 0;
       for EachNode(data_n, String8Node, &sc->first_data_node) {
         U64 image_off = sc->u.off + n->base_foff + cursor;
@@ -3870,17 +5609,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_section_symbols_task)
         if (sect && (~sect->flags & COFF_SectionFlag_LnkRemove)) {
           if (~sect->flags & COFF_SectionFlag_MemDiscardable) {
             LNK_SectionContrib *first_sc = lnk_get_first_section_contrib(sect);
-            if (obj->header.is_big_obj) {
-              COFF_Symbol32 *symbol32 = symbol.raw_symbol;
-              symbol32->section_number = safe_cast_u32(first_sc->u.sect_idx + 1);
-              symbol32->value          = first_sc->u.off;
-              symbol32->storage_class  = COFF_SymStorageClass_Static;
-            } else {
-              COFF_Symbol16 *symbol16 = symbol.raw_symbol;
-              symbol16->section_number = safe_cast_u16(first_sc->u.sect_idx + 1);
-              symbol16->value          = first_sc->u.off;
-              symbol16->storage_class  = COFF_SymStorageClass_Static;
-            }
+            obj->parsed_symbols[symbol_idx].section_number = safe_cast_u32(first_sc->u.sect_idx + 1);
+            obj->parsed_symbols[symbol_idx].value          = first_sc->u.off;
+            obj->parsed_symbols[symbol_idx].storage_class  = COFF_SymStorageClass_Static;
           } else {
             lnk_error_obj(LNK_Error_SectRefsDiscardedMemory, obj, "symbol %S (No. 0x%llx) references section with discard flag", symbol.name, symbol_idx);
           }
@@ -3898,17 +5629,9 @@ THREAD_POOL_TASK_FUNC(lnk_patch_section_symbols_task)
           LNK_Section *fallback_sect = task->image_sects.v[task->image_sects.count-1];
           U32 fallback_section_number = safe_cast_u32(fallback_sect->sect_idx + 1);
           U32 fallback_section_offset = safe_cast_u32(fallback_voff - fallback_sect->voff);
-          if (obj->header.is_big_obj) {
-            COFF_Symbol32 *symbol32 = symbol.raw_symbol;
-            symbol32->section_number = fallback_section_number;
-            symbol32->value          = fallback_section_offset;
-            symbol32->storage_class  = COFF_SymStorageClass_Static;
-          } else {
-            COFF_Symbol16 *symbol16 = symbol.raw_symbol;
-            symbol16->section_number = safe_cast_u16(fallback_section_number);
-            symbol16->value          = fallback_section_offset;
-            symbol16->storage_class  = COFF_SymStorageClass_Static;
-          }
+          obj->parsed_symbols[symbol_idx].section_number = fallback_section_number;
+          obj->parsed_symbols[symbol_idx].value          = fallback_section_offset;
+          obj->parsed_symbols[symbol_idx].storage_class  = COFF_SymStorageClass_Static;
 
           lnk_error_obj(LNK_Warning_UndefinedSectionSymbol, obj, "undefined section symbol %S (No. 0x%llx) refers to an image section that doesn't exist; patching to %#llx", symbol.name, symbol_idx, fallback_voff);
         }
@@ -4563,6 +6286,14 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT
         Assert(cursor == total_chunk_count);
       }
 
+      // big chunks (e.g. merged .text) first, each sorted across all threads via parallel radix --
+      // otherwise a single huge chunk serializes on one worker. Small chunks then go one-per-task.
+      for EachIndex(ci, total_chunk_count) {
+        LNK_SectionContribChunk *chunk = task.u.sort_contribs.chunks[ci];
+        if (chunk->count >= LNK_SORT_CONTRIBS_RADIX_MIN) {
+          lnk_sort_contribs_chunk_radix(tp, scratch.arena, chunk);
+        }
+      }
       tp_for_parallel(tp, 0, total_chunk_count, lnk_sort_contribs_task, &task);
 
       ProfEnd();
@@ -4570,6 +6301,9 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT
 
     tp_for_parallel_prof(tp, 0, objs_count, lnk_set_comdat_leaders_contribs_task, &task, "Update Section Map With COMDAT Leader Contribs");
 
+    // /OPT:ICF: redirect folded static-COMDAT followers' section-map entries to their leader contrib
+    tp_for_parallel_prof(tp, 0, objs_count, lnk_set_icf_static_leader_contribs_task, &task, "Update Section Map With ICF Static Leader Contribs");
+
     // build common block
     //
     // TODO: build common block in .bss and merge with .data
@@ -4774,7 +6508,11 @@ lnk_build_image(TP_Arena *arena, TP_Context *tp, LNK_Config *config, LNK_SymbolT
 
     ProfBeginV("Alloc Image Buffer [%M]", lnk_section_table_total_fsize(sectab));
     image_data.size = lnk_section_table_total_fsize(sectab) + image_string_table.total_size;
-    image_data.str  = push_array_no_zero(arena->v[0], U8, image_data.size);
+    // Standalone reservation (not the shared link arena) so it can be released the instant the image
+    // is written to disk -- VirtualFree returns fast and the kernel zeroes this ~1GB on its background
+    // thread, overlapping the rest of the run, instead of in the single-threaded exit rundown.
+    image_data.str  = reserve_memory(image_data.size);
+    commit_memory(image_data.str, image_data.size);
     ProfEnd();
 
     ProfBegin("Fill Align Bytes");
@@ -5293,6 +7031,7 @@ lnk_write_thread(void *raw_ctx)
   ProfEnd();
 }
 
+
 internal void
 lnk_log_timers(void)
 {
@@ -5322,10 +7061,39 @@ lnk_log_timers(void)
   StringJoin new_line_join = { str8_lit_comp(""), str8_lit_comp("\n"), str8_lit_comp("") };
   String8 output = str8_list_join(scratch.arena, &output_list, &new_line_join);
   lnk_log(LNK_Log_Timers, "%S\n", output);
-  
+
+  // Diagnostic: when RADLINK_PHASE_LOG is set, also write machine-parseable raw
+  // per-phase micros to that file (for automated perf A/B). Env-unset -> no-op,
+  // so normal/validation links are byte-identical; this never touches DLL/PDB bytes.
+  char *phase_log_path = getenv("RADLINK_PHASE_LOG");
+  if (phase_log_path != 0 && phase_log_path[0] != 0) {
+    String8List raw_list = {0};
+    for (U64 i = 0; i < LNK_Timer_Count; ++i) {
+      str8_list_pushf(scratch.arena, &raw_list, "%S %llu\n", lnk_string_from_timer_type(i), g_timers[i].end - g_timers[i].begin);
+    }
+    str8_list_pushf(scratch.arena, &raw_list, "TOTAL %llu\n", total_build_time_micro);
+    String8 raw_str = str8_list_join(scratch.arena, &raw_list, 0);
+    lnk_write_data_to_file_path(str8_cstring(phase_log_path), str8_zero(), raw_str);
+  }
+
   scratch_end(scratch);
 }
 
+internal
+THREAD_POOL_TASK_FUNC(lnk_scratch_decommit_worker)
+{
+  // Each worker decommits the committed-but-unused pages of its OWN equipped tctx scratch arenas
+  // (per the helper's name; tctx_scratch_decommit is defined elsewhere). No arena is passed in,
+  // so the call operates on whatever thread context is current on the executing thread.
+  //
+  // The barrier (dispatched with task_count == worker_count) is meant to make every worker run
+  // the body exactly once -- otherwise one fast worker could grab several tasks and leave other
+  // workers' scratch committed. NOTE(review): this relies on all worker_count threads taking a
+  // task and on tp->barrier being sized for exactly that many waiters; otherwise it would hang.
+  tctx_scratch_decommit();
+  barrier_wait(tp->barrier);
+}
+
 internal
 THREAD_POOL_TASK_FUNC(lnk_p2r_worker)
 {
@@ -5381,6 +7149,52 @@ lnk_debug_filter_objs(Arena *arena, LNK_Obj **objs, U64 objs_count, U64 *count_o
   return debug_info_objs;
 }
 
+// Parallel release of memory-mapped input file views.
+// Inputs are mapped copy-on-write (PAGE_WRITECOPY/FILE_MAP_COPY); pages touched
+// during linking become private-dirty and are reclaimed by the kernel in
+// single-threaded process rundown at exit (~3s for a large link). Unmapping them
+// in parallel before exit moves that reclaim off the serial post-exit path.
+typedef struct LNK_UnmapViewTask
+{
+  String8 *views;
+} LNK_UnmapViewTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_unmap_view_task)
+{
+  // one task per view: task_id picks the mapping to release; unmap results are intentionally ignored
+  String8 *mapped = &((LNK_UnmapViewTask *)raw_task)->views[task_id];
+#if OS_WINDOWS
+  UnmapViewOfFile(mapped->str);
+#elif OS_LINUX
+  munmap(mapped->str, mapped->size);
+#endif
+}
+
+internal void
+lnk_release_input_views(TP_Context *tp, LNK_Inputer *inputer)
+{
+  Temp scratch = scratch_begin(0, 0);
+
+  // gather the data range of every thin, non-empty input (objs first, then libs); non-thin inputs
+  // (lib-member substrings, linkgen arena data) are skipped. NOTE(review): no de-duplication is done.
+  LNK_Input *heads[2] = { inputer->objs.first, inputer->libs.first };
+  U64        view_cap = inputer->objs.count + inputer->libs.count;
+  String8   *views    = push_array_no_zero(scratch.arena, String8, view_cap);
+  U64        view_n   = 0;
+  for EachIndex(li, 2) {
+    for EachNode(in, LNK_Input, heads[li]) { if (in->is_thin && in->data.size) { views[view_n++] = in->data; } }
+  }
+  if (view_n > 0) {
+    U64               begin_us = now_time_us();
+    LNK_UnmapViewTask task     = { .views = views };
+    tp_for_parallel(tp, 0, view_n, lnk_unmap_view_task, &task);
+    U64               elapsed  = now_time_us() - begin_us;
+    lnk_log(LNK_Log_Timers, "Released %llu input views in %.2f ms", view_n, (F64)elapsed / 1000.0);
+  }
+  scratch_end(scratch);
+}
+
 internal void
 lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config)
 {
@@ -5399,7 +7213,7 @@ lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config)
   LNK_SymbolTable *symtab = lnk_symbol_table_init(arena);
 
   //
-  // Link Image
+  // Link Image (group digests, if any, are synthesized + consumed inside lnk_link_image)
   //
   LNK_LinkResult link = lnk_link_image(tp, arena, config, inputer, symtab);
 
@@ -5458,6 +7272,27 @@ lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config)
     LNK_CodeViewInput cv        = lnk_make_code_view_input(tp, arena, config, debug_info_objs_count, debug_info_objs, rrt_input);
     LNK_MergedTypes   cv_types  = lnk_merge_types(tp, arena, &cv, 0);
 
+    // prune merged types not reachable from any surviving symbol (PDB-size win). OFF by default:
+    // it removes types that a debugger can still legitimately cast to in the watch window
+    // (reachable-from-symbols is a subset of castable-types). Opt in with /OPT:GCTYPES.
+    if (config->opt_gc_types == LNK_SwitchState_Yes) {
+      lnk_gc_types(tp, arena->v[0], &cv, &cv_types);
+    }
+
+    // merge-types reached the scratch high-water (~9GB of per-thread tctx scratch
+    // stays committed but idle). Release those unused scratch pages back to the OS
+    // before the PDB build re-grows, dropping the recorded peak working set. Each
+    // worker decommits its own scratch; do the main thread's scratch too. Only
+    // pages strictly above each arena's live `pos` are touched, so output stays
+    // byte-identical and the push path re-commits on demand during PDB build.
+    {
+      ProfBegin("Decommit Scratch");
+      // task_count == worker_count + the in-worker barrier => every worker
+      // (worker 0 IS the main thread) runs exactly once, covering main's scratch.
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_scratch_decommit_worker, 0);
+      ProfEnd();
+    }
+
     //
     // Debug Info
     //
@@ -5576,11 +7411,28 @@ lnk_run_linker(TP_Context *tp, TP_Arena *arena, LNK_Config *config)
   // wait for the thread to finish writing image to disk
   thread_join(image_write_thread, -1);
 
+  // image is on disk and no longer read by anyone -- release its ~1GB now so the kernel reclaims it
+  // concurrently with the remaining work + exit, not single-threaded in the process rundown.
+  release_memory(image_ctx.image_data.str, image_ctx.image_data.size);
+
+  // outputs are written and inputs are no longer read; release the copy-on-write
+  // input views in parallel so their dirty pages are reclaimed here (multi-threaded)
+  // instead of in single-threaded process rundown at exit. Only safe for the CoW
+  // (read-only) mapping mode; read-write-shared would flush dirty pages back to the
+  // input files on unmap.
+  if ((config->io_flags & LNK_IO_Flags_MemoryMapFilesReadOnly) &&
+      !(config->io_flags & LNK_IO_Flags_MemoryMapFilesReadWrite)) {
+    lnk_release_input_views(tp, inputer);
+  }
+
   //
   // Timers
   //
-  if (lnk_get_log_status(LNK_Log_Timers)) {
-    lnk_log_timers();
+  {
+    char *phase_log_env = getenv("RADLINK_PHASE_LOG");
+    if (lnk_get_log_status(LNK_Log_Timers) || (phase_log_env != 0 && phase_log_env[0] != 0)) {
+      lnk_log_timers();
+    }
   }
   
   scratch_end(scratch);
@@ -5870,8 +7722,8 @@ entry_point(CmdLine *cmdline)
   }
 
   switch (config->boot_mode) {
-  case LNK_BootMode_Linker:     lnk_run_linker     (tp, tp_arena, config); break;
-  case LNK_BootMode_TypeServer: lnk_run_type_server(tp, tp_arena, config); break;
+  case LNK_BootMode_Linker:      lnk_run_linker       (tp, tp_arena, config); break;
+  case LNK_BootMode_TypeServer:  lnk_run_type_server  (tp, tp_arena, config); break;
   }
 
   lnk_log_end();
diff --git a/src/linker/lnk.h b/src/linker/lnk.h
index d6388d0a0..f7207f5dc 100644
--- a/src/linker/lnk.h
+++ b/src/linker/lnk.h
@@ -204,6 +204,7 @@ typedef struct LNK_BaseRelocPageArray
 typedef struct
 {
   B32                   search_anti_deps;
+  B32                   reset_cursor; // anti-dep mode flipped: rescan from list start
   LNK_Link             *link;
   HashMap              *imports_hm;
   LNK_SymbolTable      *symtab;
@@ -391,6 +392,7 @@ internal LNK_LinkResult lnk_link_image (TP_Context *tp, TP_Arena *arena, LNK_Con
 // --- Optimizations -----------------------------------------------------------
 
 internal void lnk_opt_ref(TP_Context *tp, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs);
+internal void lnk_opt_icf(TP_Context *tp, Arena *perm, LNK_SymbolTable *symtab, LNK_Config *config, LNK_ObjList objs);
 
 // --- Win32 Image -------------------------------------------------------------
 
diff --git a/src/linker/lnk_config.c b/src/linker/lnk_config.c
index ac624c0ed..4b988d3f8 100644
--- a/src/linker/lnk_config.c
+++ b/src/linker/lnk_config.c
@@ -106,6 +106,9 @@ global read_only LNK_CmdSwitch g_cmd_switch_map[] =
 
   { LNK_CmdSwitch_RadTypeServer,                   0, "RAD_TYPE_SERVER", ":FILENAME", "Merge types and store them in the specified file. The filename must have the .rrt extension." },
 
+  { LNK_CmdSwitch_IfcMap,          1, "IFCMAP",          ":FILENAME", "Map a header-unit module interface (.ifc) for debug-record resolution (TOML)." },
+  { LNK_CmdSwitch_IfcDebugRecords, 0, "IFCDEBUGRECORDS", "[:NO]",     "Resolve MSVC header-unit IFC debug records into real CodeView types." },
+
   { LNK_CmdSwitch_Help, 0, "HELP", "", "" },
   { LNK_CmdSwitch_Help, 0, "?",    "", "" },
 };
@@ -1675,10 +1678,18 @@ lnk_apply_cmd_option_to_config(LNK_Config *config, String8 cmd_name, String8List
         config->opt_icf = LNK_SwitchState_Yes;
       } else if (str8_match_lit("noicf", param, StringMatchFlag_CaseInsensitive)) {
         config->opt_icf = LNK_SwitchState_No;
+      } else if (str8_match_lit("icfstatic", param, StringMatchFlag_CaseInsensitive)) {
+        config->opt_icf_static = LNK_SwitchState_Yes;
+      } else if (str8_match_lit("noicfstatic", param, StringMatchFlag_CaseInsensitive)) {
+        config->opt_icf_static = LNK_SwitchState_No;
       } else if (str8_match_lit("lbr", param, StringMatchFlag_CaseInsensitive)) {
         config->opt_lbr = LNK_SwitchState_Yes;
       } else if (str8_match_lit("nolibr", param, StringMatchFlag_CaseInsensitive)) {
         config->opt_lbr = LNK_SwitchState_No;
+      } else if (str8_match_lit("gctypes", param, StringMatchFlag_CaseInsensitive)) {
+        config->opt_gc_types = LNK_SwitchState_Yes;
+      } else if (str8_match_lit("nogctypes", param, StringMatchFlag_CaseInsensitive)) {
+        config->opt_gc_types = LNK_SwitchState_No;
       } else {
         lnk_error_cmd_switch(LNK_Error_Cmdl, obj, cmd_switch, "unknown option \"%S\"", param);
       }
@@ -2179,6 +2190,19 @@ lnk_apply_cmd_option_to_config(LNK_Config *config, String8 cmd_name, String8List
       lnk_error_cmd_switch(LNK_Error_Cmdl, obj, cmd_switch, "missing type server file path");
     }
   } break;
+
+  case LNK_CmdSwitch_IfcMap: {
+    // collect .toml paths (header-unit -> .ifc); parsed lazily during debug-info build
+    String8List copy = str8_list_copy(config->arena, &value_strings);
+    str8_list_concat_in_place(&config->ifc_map_list, &copy);
+  } break;
+
+  case LNK_CmdSwitch_IfcDebugRecords: {
+    LNK_SwitchState state = LNK_SwitchState_Null;
+    if (lnk_cmd_switch_parse_flag(obj, cmd_switch, value_strings, &state)) {
+      config->ifc_debug_records = state;
+    }
+  } break;
   }
 
   scratch_end(scratch);
diff --git a/src/linker/lnk_config.h b/src/linker/lnk_config.h
index a781d664f..d9d10c248 100644
--- a/src/linker/lnk_config.h
+++ b/src/linker/lnk_config.h
@@ -143,6 +143,9 @@ typedef enum
   LNK_CmdSwitch_RadTypeServer,
   LNK_CmdSwitch_RadTypeServer_MatchObj,
 
+  LNK_CmdSwitch_IfcMap,
+  LNK_CmdSwitch_IfcDebugRecords,
+
   LNK_CmdSwitch_Help,
 
   LNK_CmdSwitch_Count
@@ -289,7 +292,9 @@ typedef struct LNK_Config
   B32                         ghash;
   LNK_SwitchState             opt_ref;
   LNK_SwitchState             opt_icf;
+  LNK_SwitchState             opt_icf_static; // /OPT:ICFSTATIC -- also fold static (internal-linkage) COMDATs. Default OFF: runtime-unvalidated, can shrink .text substantially.
   LNK_SwitchState             opt_lbr;
+  LNK_SwitchState             opt_gc_types; // /OPT:GCTYPES -- prune unreferenced CodeView types. Default OFF: shrinks PDB but a pruned type can't be cast-to in the debugger watch window.
   U64                         opt_iter_count;
   LNK_SwitchState             import_table_emit_biat;
   LNK_SwitchState             import_table_emit_uiat;
@@ -387,6 +392,8 @@ typedef struct LNK_Config
   String8                     type_server_name;
   LNK_SwitchState             type_server;
   LNK_SwitchState             sort_imports;
+  LNK_SwitchState             ifc_debug_records; // resolve LF_IFC_RECORD (0x1522) into real CodeView types
+  String8List                 ifc_map_list;      // .toml paths from /ifcMap (header-unit -> .ifc)
 } LNK_Config;
 
 // --- MSVC Error Codes --------------------------------------------------------
diff --git a/src/linker/lnk_debug_info.c b/src/linker/lnk_debug_info.c
index fe125133c..3e977d5df 100644
--- a/src/linker/lnk_debug_info.c
+++ b/src/linker/lnk_debug_info.c
@@ -614,6 +614,681 @@ lnk_rrt_array_from_config(Arena *arena, LNK_Config *config)
   return rrt_arr;
 }
 
+////////////////////////////////
+// IFC header-unit debug-record resolution
+
+typedef struct LNK_IfcMapEntry // value stored in the /ifcMap hash map, keyed by header-unit basename
+{
+  String8 ifc_path; // .ifc path copied from the TOML `ifc = "..."` value (expected to be absolute; the parser does not check)
+} LNK_IfcMapEntry;
+
+// Parse the trivial /ifcMap TOML by hand (line-oriented; this is not a general TOML parser):
+//   [[header-unit]]
+//   name = ["quote", '<header-unit-path>']
+//   ifc  = "<abs .ifc path>"
+// Registers basename(header-unit-path) -> .ifc path in `hm` (hash_map of path->raw LNK_IfcMapEntry*). Key, entry and path are copied into `arena`.
+internal void
+lnk_parse_ifc_map_toml(Arena *arena, HashMap *hm, String8 toml_data)
+{
+  U64 cursor = 0;
+  String8 cur_name = {0}; // most recent `name` path; points into toml_data until consumed by an `ifc` line
+  while (cursor < toml_data.size) {
+    // read a line: [cursor, line_end) up to the next newline or end of data, with surrounding whitespace trimmed
+    U64 line_end = cursor;
+    while (line_end < toml_data.size && toml_data.str[line_end] != '\n') { line_end += 1; }
+    String8 line = str8_skip_chop_whitespace(str8_substr(toml_data, r1u64(cursor, line_end)));
+    cursor = line_end + 1; // step past the newline
+
+    if (line.size == 0 || line.str[0] == '#') { continue; } // blank lines and '#' comment lines
+
+    if (str8_match(str8_prefix(line, 4), str8_lit("name"), 0)) {
+      // name = ["quote", '<path>']  -- extract the FIRST single-quoted token (the path in the layout above); prefix-only key match
+      U64 q0 = str8_find_needle(line, 0, str8_lit("'"), 0);
+      if (q0 < line.size) {
+        U64 q1 = str8_find_needle(line, q0 + 1, str8_lit("'"), 0);
+        if (q1 < line.size) {
+          cur_name = str8_substr(line, r1u64(q0 + 1, q1)); // text between the quotes, quotes excluded
+        }
+      }
+    } else if (str8_match(str8_prefix(line, 3), str8_lit("ifc"), 0)) {
+      U64 q0 = str8_find_needle(line, 0, str8_lit("\""), 0);
+      if (q0 < line.size && cur_name.size) {
+        U64 q1 = str8_find_needle(line, q0 + 1, str8_lit("\""), 0);
+        if (q1 < line.size) {
+          String8 ifc_path = str8_substr(line, r1u64(q0 + 1, q1));
+          // key by basename of the header-unit path (matches LF_IFC_RECORD header_unit_path basename)
+          String8 base = str8_skip_last_slash(cur_name);
+          // header-unit paths use backslashes; normalize to last path component (same normalization as lnk_ifc_scan_task)
+          U64 bs = str8_find_needle_reverse(base, 0, str8_lit("\\"), 0);
+          if (bs) { base = str8_skip(base, bs); }
+          LNK_IfcMapEntry *e = push_array(arena, LNK_IfcMapEntry, 1);
+          e->ifc_path = push_str8_copy(arena, ifc_path);
+          hash_map_push_string_raw(arena, hm, push_str8_copy(arena, base), e);
+        }
+      }
+      cur_name = str8_zero(); // every `ifc` line consumes the pending name, whether or not an entry was registered
+    }
+  }
+}
+
+// Reads every /ifcMap toml, materializes the union of header-unit basename -> .ifc path. Map contents are allocated in `arena`.
+internal HashMap
+lnk_build_ifc_map(Arena *arena, LNK_Config *config)
+{
+  HashMap hm = {0};
+  Temp scratch = scratch_begin(&arena, 1); // `arena` is passed as the conflict so the scratch is a different arena
+  for EachNode(n, String8Node, config->ifc_map_list.first) {
+    String8 toml = lnk_read_data_from_file_path(scratch.arena, 0, n->string); // raw file bytes are temporary
+    if (toml.size == 0) { // an empty result is reported as unreadable (so an empty file is reported too)
+      lnk_error(LNK_Error_Cmdl, "/ifcMap: unable to read TOML '%S'", n->string);
+      continue;
+    }
+    lnk_parse_ifc_map_toml(arena, &hm, toml); // the parser copies what it keeps into `arena`
+  }
+  scratch_end(scratch);
+  return hm;
+}
+
+// LF_IFC_RECORD (0x1522) body layout (header {len,kind} already stripped from leaf.data):
+//   u16 version (==2); u32 ifc_type_index X; u8[16] guid; u8[16] hash; char[] header_unit_path NUL
+typedef struct LNK_IfcRecord // decoded form produced by lnk_parse_ifc_record; the version field is not kept
+{
+  U32     ifc_type_index;  // X: TI into the .ifc debug-records blob (base 0x1000)
+  U8      guid[16]; // compared against bytes 0..15 of IFC_File.content_hash in lnk_ifc_scan_task
+  U8      hash[16]; // compared against bytes 16..31 of IFC_File.content_hash in lnk_ifc_scan_task
+  String8 header_unit_path; // points into the leaf data (not copied)
+  B32     is_valid; // 1 only when the leaf body was large enough to hold all fixed-size fields
+} LNK_IfcRecord;
+
+internal LNK_IfcRecord // decodes an LF_IFC_RECORD body; returns a zeroed record (is_valid == 0) when the body is too short
+lnk_parse_ifc_record(String8 leaf_data)
+{
+  LNK_IfcRecord rec = {0};
+  if (leaf_data.size < 2 + 4 + 16 + 16) { return rec; } // version + type index + guid + hash
+  U64 off = 0;
+  U16 version; off += str8_deserial_read_struct(leaf_data, off, &version); // read only to advance `off`; NOTE(review): the value is never validated
+  off += str8_deserial_read_struct(leaf_data, off, &rec.ifc_type_index);
+  MemoryCopy(rec.guid, leaf_data.str + off, 16); off += 16;
+  MemoryCopy(rec.hash, leaf_data.str + off, 16); off += 16;
+  rec.header_unit_path = str8_cstring_capped(leaf_data.str + off, leaf_data.str + leaf_data.size); // path scan is bounded by the end of the leaf body
+  rec.is_valid = 1;
+  return rec;
+}
+
+// [Describes lnk_ifc_close_blob_task, defined further below] Per-blob closure + NOTYPE-prune (third pass of lnk_apply_ifc_debug_records). Each blob is fully
+// independent: it reads/writes only its own ref_bits[blob_i] and its own blob DebugT leaves, so the
+// work parallelizes across the ~13 blobs with no shared state. The worklist scratch comes from the
+// per-worker arena. Closure counts are written per-blob and summed afterward (order-independent).
+// [Describes LNK_U64Set] Open-addressing U64 hash set keyed by unique_name hash. Used to record which UDT unique_names
+// already have a COMPLETE definition in a non-blob (consuming) obj, so the blob prune can keep a
+// blob's complete definition only for names that NO normal obj completes (blob-only types). cap is
+// a power of two; 0-hash is reserved as the empty sentinel (we OR in a bit so a real 0 can't occur).
+typedef struct LNK_U64Set { U64 *slots; U64 cap; } LNK_U64Set; // slots[i] == 0 means empty; only hashes are stored, not the names
+
+internal U64 // hashes a unique_name for LNK_U64Set; the result always has bit 0 set
+lnk_uname_hash(String8 s)
+{
+  U64 h = 5381; // seed
+  for EachIndex(c, s.size) { h = ((h << 5) + h) ^ (U64)s.str[c]; } // h = h*33 XOR next byte
+  return h | 1; // never 0 (0 is the empty sentinel); hashes that differ only in bit 0 therefore collapse to one key
+}
+
+internal void // inserts h with plain (non-atomic) stores; no-op when h is already present. h must be nonzero.
+lnk_u64set_add(LNK_U64Set *set, U64 h)
+{
+  U64 i = h & (set->cap - 1); // home slot; the mask requires cap to be a power of two
+  for (;;) { // linear probe; never terminates if the table is full and h is absent
+    if (set->slots[i] == 0)  { set->slots[i] = h; return; }
+    if (set->slots[i] == h)  { return; }
+    i = (i + 1) & (set->cap - 1); // wrap around
+  }
+}
+
+// Thread-safe insert mirroring lnk_icf_map_put_atomic (lnk.c). The empty sentinel is 0 (not
+// LNK_ICF_EMPTY); h is guaranteed nonzero by lnk_uname_hash (`| 1`) so a real key can never be 0.
+// Claim an empty slot with an atomic CAS 0->h; the CAS winner owns it. On a lost race re-read the
+// same slot (it may now hold our h via a duplicate, or another key) before advancing. Duplicate
+// keys across objs are legal and idempotent (a slot already holding h returns), so any insertion
+// order yields the identical final membership -- the set is read-only (lnk_u64set_has) afterward.
+internal void
+lnk_u64set_add_atomic(LNK_U64Set *set, U64 h)
+{
+  U64 i = h & (set->cap - 1); // home slot; the mask requires cap to be a power of two
+  for (;;) { // linear probe; never terminates if the table is full and h is absent
+    if (set->slots[i] == 0) {
+      if (ins_atomic_u64_eval_cond_assign(&set->slots[i], h, 0) == 0) { return; } // assumes the helper returns the previous slot value, so 0 means this thread claimed it -- TODO confirm
+      continue; // lost the race: re-examine the same slot without advancing
+    }
+    if (set->slots[i] == h) { return; }
+    i = (i + 1) & (set->cap - 1); // wrap around
+  }
+}
+
+internal B32 // returns 1 when hash h is stored in the set, 0 otherwise; membership is by hash value only
+lnk_u64set_has(LNK_U64Set *set, U64 h)
+{
+  U64 i = h & (set->cap - 1); // same home slot as the insert functions
+  for (;;) { // linear probe; never terminates if the table is full and h is absent
+    if (set->slots[i] == 0) { return 0; } // reached an empty slot: h was never inserted
+    if (set->slots[i] == h) { return 1; }
+    i = (i + 1) & (set->cap - 1);
+  }
+}
+
+// Parallel collect of complete-definition unique_name hashes per non-blob obj. Each obj is scanned
+// independently (read-only over its .debug$T leaves) and emits its hashes into a per-obj list; the
+// lists are merged into the shared open-addressing set afterwards (presumably by lnk_ifc_set_merge_task). Moves the ~1.25s serial cv_get_udt_info
+// scan off the main thread. Determinism: set membership is order-independent, so any merge order yields the same set.
+typedef struct LNK_IfcCompleteScanTask
+{
+  LNK_CodeViewInput *input; // source of debug_t_arr; the task indexes it by task_id
+  U64              **out_hashes;  // per-obj hash array (allocated by task)
+  U64               *out_counts;  // per-obj count
+} LNK_IfcCompleteScanTask;
+
+internal // thread-pool task: task_id is an obj index; collects hashes of that obj's complete (non-forward-ref) uniquely named UDT leaves
+THREAD_POOL_TASK_FUNC(lnk_ifc_complete_scan_task)
+{
+  LNK_IfcCompleteScanTask *task = raw_task;
+  U64        obj_idx = task_id;
+  CV_DebugT *dt      = &task->input->debug_t_arr[obj_idx];
+  U64       *hashes  = push_array_no_zero(arena, U64, dt->count ? dt->count : 1); // at most one hash per leaf; `arena` is supplied by THREAD_POOL_TASK_FUNC
+  U64        n       = 0; // number of hashes written
+  for EachIndex(leaf_idx, dt->count) {
+    CV_Leaf    leaf = cv_debug_t_get_leaf(dt, leaf_idx);
+    CV_UDTInfo ui   = cv_get_udt_info(leaf.kind, leaf.data);
+    if (!(ui.props & CV_TypeProp_HasUniqueName) || ui.unique_name.size == 0) { continue; } // needs a non-empty unique name
+    if (ui.props & CV_TypeProp_FwdRef) { continue; } // forward references do not count as complete definitions
+    hashes[n++] = lnk_uname_hash(ui.unique_name); // duplicates within an obj are kept; the set insert ignores them
+  }
+  task->out_hashes[obj_idx] = hashes; // each task writes only its own obj slot
+  task->out_counts[obj_idx] = n;
+}
+
+// Parallel merge of the per-obj complete-def hash lists into the shared set. Replaces the serial
+// lnk_u64set_add loop (the ~914ms hotspot -- 801ms of it first-touch KiPageFault on a single thread
+// faulting a 128MB+ set). lnk_u64set_add_atomic spreads both the random-scatter probes AND the
+// page faults across the pool. Determinism: keys may legitimately duplicate across objs, but the
+// insert is idempotent and the set is read-only afterward (lnk_u64set_has), so insertion order
+// cannot change the final membership -- output is bit-identical to the serial merge.
+typedef struct LNK_IfcSetMergeTask
+{
+  Rng1U64    *ranges; // ranges[task_id] is the span of obj indices merged by that task
+  U64       **out_hashes; // per-obj hash arrays, as filled by lnk_ifc_complete_scan_task
+  U64        *out_counts; // per-obj hash counts
+  LNK_U64Set *set; // shared destination; written only through lnk_u64set_add_atomic
+  U64         nonblob_count; // not read by lnk_ifc_set_merge_task
+} LNK_IfcSetMergeTask;
+
+internal // thread-pool task: inserts every hash of the objs in ranges[task_id] into the shared set
+THREAD_POOL_TASK_FUNC(lnk_ifc_set_merge_task)
+{
+  LNK_IfcSetMergeTask *task = raw_task;
+  for EachInRange(obj_idx, task->ranges[task_id]) {
+    U64 *h = task->out_hashes[obj_idx]; // hash list of this obj
+    U64  n = task->out_counts[obj_idx]; // number of hashes in the list
+    for EachIndex(t, n) { lnk_u64set_add_atomic(task->set, h[t]); } // atomic insert: other tasks write the same set concurrently
+  }
+}
+
+// Parallel second-pass scan (lnk_ifc_scan_task): each consuming obj's .debug$T is scanned independently for 0x1522
+// (LF_IFC_RECORD) leaves. A worker (a) rewrites every 0x1522 leaf to NOTYPE in-place (per-obj
+// disjoint, safe) and (b) for each leaf that resolves to a .ifc blob leaf, emits an ordered
+// redirect record. The shared redirect hash map / ref_bits seed / redirect_count are NOT touched
+// here; the serial merge replays the per-obj records in ascending obj_idx, then ascending leaf_idx
+// (exactly the original serial order) so the hash-map push order -- and thus byte-for-byte output --
+// is identical to the serial scan. All blob/.ifc state (ifc_map_hm, ifc_path_to_blobidx, ifc_files,
+// debug_t_arr blob ranges) is read-only after pass 1, so worker reads are race-free.
+typedef struct LNK_IfcRedirectRec // one resolved placeholder -> blob-leaf redirect
+{
+  CV_TypeIndex K;             // consuming obj's local placeholder TI
+  U64          blob_obj_idx;  // injected blob obj index
+  U64          blob_leaf_idx; // leaf inside the blob
+  U64          blob_i;        // blob slot (for ref_bits seed)
+} LNK_IfcRedirectRec;
+
+typedef struct LNK_IfcScanTask // shared payload for lnk_ifc_scan_task; the task writes only the out_* slot of its own obj
+{
+  LNK_CodeViewInput  *input; // debug_t_arr and ifc_obj_range are read from here
+  HashMap            *ifc_map_hm; // header-unit basename -> LNK_IfcMapEntry*
+  HashMap            *ifc_path_to_blobidx; // .ifc path -> blob slot index + 1
+  IFC_File           *ifc_files; // indexed by blob slot
+  LNK_IfcRedirectRec **out_recs;    // per-obj ordered redirect records (allocated by task)
+  U64                *out_counts;   // per-obj record count
+} LNK_IfcScanTask;
+
+internal // thread-pool task: task_id is an obj index; resolves that obj's 0x1522 leaves to blob leaves and rewrites them to NOTYPE
+THREAD_POOL_TASK_FUNC(lnk_ifc_scan_task)
+{
+  LNK_IfcScanTask   *task    = raw_task;
+  U64                obj_idx = task_id;
+  LNK_CodeViewInput *input   = task->input;
+  CV_DebugT         *debug_t = &input->debug_t_arr[obj_idx];
+
+  // count 0x1522 leaves first to size the per-obj record array
+  U64 ifc_leaf_count = 0;
+  for EachIndex(leaf_idx, debug_t->count) {
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+    if (hdr->kind == 0x1522) { ifc_leaf_count += 1; }
+  }
+  if (ifc_leaf_count == 0) { task->out_recs[obj_idx] = 0; task->out_counts[obj_idx] = 0; return; } // nothing to resolve or rewrite
+
+  LNK_IfcRedirectRec *recs = push_array_no_zero(arena, LNK_IfcRedirectRec, ifc_leaf_count); // upper bound; only the first n entries are filled
+  U64                 n    = 0; // number of records emitted
+
+  for EachIndex(leaf_idx, debug_t->count) { // ascending leaf order, so records come out ordered by leaf_idx
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+    if (hdr->kind != 0x1522) { continue; }
+
+    CV_Leaf       leaf = cv_debug_t_get_leaf(debug_t, leaf_idx);
+    LNK_IfcRecord rec  = lnk_parse_ifc_record(leaf.data);
+
+    CV_TypeIndex K = cv_ti_from_leaf_idx(debug_t, CV_TypeIndexSource_TPI, leaf_idx); // local type index of the placeholder leaf
+
+    if (rec.is_valid) {
+      String8 base = str8_skip_last_slash(rec.header_unit_path); // reduce to basename, the key used by lnk_parse_ifc_map_toml
+      U64 bs = str8_find_needle_reverse(base, 0, str8_lit("\\"), 0);
+      if (bs) { base = str8_skip(base, bs); }
+      LNK_IfcMapEntry *e = hash_map_search_string_raw(task->ifc_map_hm, base);
+      if (e) {
+        U64 *slot = hash_map_search_string_u64(task->ifc_path_to_blobidx, e->ifc_path);
+        if (slot) {
+          U64        blob_i       = *slot - 1; // the map stores slot index + 1
+          IFC_File  *f            = &task->ifc_files[blob_i];
+          U64        blob_obj_idx = input->ifc_obj_range.min + blob_i; // blob objs are laid out from ifc_obj_range.min in slot order
+          CV_DebugT *bdt          = &input->debug_t_arr[blob_obj_idx];
+
+          B32 hash_ok = MemoryMatch(rec.guid, f->content_hash, 16) && // record guid vs bytes 0..15 of the file content hash
+                        MemoryMatch(rec.hash, f->content_hash + 16, 16); // record hash vs bytes 16..31
+
+          if (f->is_valid && hash_ok) { // a mismatch or an unreadable .ifc emits no redirect; the leaf is still rewritten below
+            U64 blob_leaf_idx = cv_leaf_idx_from_ti(bdt, CV_TypeIndexSource_TPI, rec.ifc_type_index);
+            if (blob_leaf_idx < bdt->count) { // drop type indices that fall outside the blob
+              recs[n].K             = K;
+              recs[n].blob_obj_idx  = blob_obj_idx;
+              recs[n].blob_leaf_idx = blob_leaf_idx;
+              recs[n].blob_i        = blob_i;
+              n += 1;
+            }
+          }
+        }
+      }
+    }
+
+    // exclude the placeholder leaf from output regardless: rewrite to NOTYPE.
+    // per-obj disjoint write -- safe in parallel.
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, kind), (U16)CV_LeafKind_NOTYPE); // only `kind` changes here; header `size` is left untouched
+  }
+
+  task->out_recs[obj_idx]   = recs;
+  task->out_counts[obj_idx] = n;
+}
+
+typedef struct LNK_IfcCloseTask // shared payload for lnk_ifc_close_blob_task; task_id is the blob slot
+{
+  LNK_CodeViewInput *input; // debug_t_arr and ifc_obj_range are read from here
+  U8               **ref_bits; // per-blob bitset, one bit per leaf index; set bits are roots on entry and the full closure on exit
+  U64               *closure_leaves;  // per-blob output: # leaves surviving in closure
+  LNK_U64Set        *nonblob_complete; // unique_name hashes completed by some non-blob obj
+} LNK_IfcCloseTask;
+
+internal // thread-pool task: for one blob, add blob-only complete UDTs as roots, close the root set over sub-type references, then NOTYPE every unreached leaf
+THREAD_POOL_TASK_FUNC(lnk_ifc_close_blob_task)
+{
+  LNK_IfcCloseTask  *task    = raw_task;
+  U64                blob_i  = task_id;
+  LNK_CodeViewInput *input   = task->input;
+  U64        blob_obj_idx = input->ifc_obj_range.min + blob_i;
+  CV_DebugT *bdt          = &input->debug_t_arr[blob_obj_idx];
+  U8        *bits         = task->ref_bits[blob_i]; // bit leaf_idx lives in byte leaf_idx >> 3 at position leaf_idx & 7
+  U64        closure      = 0; // number of leaves kept
+  if (bdt->count == 0) { task->closure_leaves[blob_i] = 0; return; } // empty blob: nothing to close or prune
+
+  Temp  wtemp    = temp_begin(arena); // worklist and per-leaf temporaries are released before the prune pass
+
+  // Extra closure roots for forward-ref completion: in CodeView a forward-ref UDT is completed by
+  // ANY same-unique_name complete definition in the PDB. A consuming obj typically emits only a
+  // forward-ref of a header-unit type; the full-merge build incidentally kept the matching complete
+  // definition from the .ifc blob, so the debugger could complete it. On-demand would drop that
+  // definition (nothing references it by TI), leaving the type incomplete vs full-merge. To preserve
+  // fidelity WITHOUT dragging the whole blob, root every blob complete-def UDT whose unique_name has
+  // NO complete definition in any non-blob obj (i.e. blob-only types -- trait/delegate marker structs
+  // etc.). Common types (FString, FGuid, ...) are completed by normal objs, so their redundant blob
+  // copies stay pruned. Members of the kept defs are pulled by the closure walk below.
+  for EachIndex(leaf_idx, bdt->count) {
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { continue; } // already a root
+    CV_Leaf    leaf = cv_debug_t_get_leaf(bdt, leaf_idx);
+    CV_UDTInfo ui   = cv_get_udt_info(leaf.kind, leaf.data);
+    if (!(ui.props & CV_TypeProp_HasUniqueName) || ui.unique_name.size == 0) { continue; } // needs a non-empty unique name
+    if (ui.props & CV_TypeProp_FwdRef) { continue; }      // only complete definitions
+    U64 h = lnk_uname_hash(ui.unique_name);
+    if (lnk_u64set_has(task->nonblob_complete, h)) { continue; } // a normal obj already completes it
+    bits[leaf_idx >> 3] |= (U8)(1u << (leaf_idx & 7)); // promote to root
+  }
+
+  U64  *worklist = push_array_no_zero(wtemp.arena, U64, bdt->count); // each leaf is pushed at most once (its bit is set before the push), so bdt->count slots suffice
+  U64   wl_count = 0;
+  for EachIndex(leaf_idx, bdt->count) { // seed the worklist with every root
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { worklist[wl_count++] = leaf_idx; }
+  }
+
+  while (wl_count) { // LIFO traversal over type-index references
+    U64     leaf_idx = worklist[--wl_count];
+    CV_Leaf leaf     = cv_debug_t_get_leaf(bdt, leaf_idx);
+    Temp    itemp    = temp_begin(wtemp.arena); // bounds the offset list of this one leaf
+    CV_TypeIndexInfoList ti_list = cv_get_leaf_type_index_offsets(itemp.arena, leaf.kind, leaf.data);
+    for EachNode(ti_info, CV_TypeIndexInfo, ti_list.first) {
+      CV_TypeIndex *ti_ptr = str8_deserial_get_raw_ptr(leaf.data, ti_info->offset, sizeof(*ti_ptr));
+      if (ti_ptr == 0) { continue; } // no pointer for this offset: skip the reference
+      CV_TypeIndex sub_ti = memory_read32(ti_ptr);
+      if (sub_ti < bdt->ti_ranges[ti_info->source].min ||
+          sub_ti >= bdt->ti_ranges[ti_info->source].max) { continue; } // outside the type-index range of this blob
+      U64 sub_leaf_idx = cv_leaf_idx_from_ti(bdt, ti_info->source, sub_ti);
+      if (sub_leaf_idx >= bdt->count) { continue; }
+      if (bits[sub_leaf_idx >> 3] & (1u << (sub_leaf_idx & 7))) { continue; } // already reached
+      bits[sub_leaf_idx >> 3] |= (U8)(1u << (sub_leaf_idx & 7)); // mark before pushing so it is queued only once
+      worklist[wl_count++] = sub_leaf_idx;
+    }
+    temp_end(itemp);
+  }
+  temp_end(wtemp);
+
+  for EachIndex(leaf_idx, bdt->count) { // prune pass: count kept leaves, neutralize the rest
+    if (bits[leaf_idx >> 3] & (1u << (leaf_idx & 7))) { closure += 1; continue; }
+    CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(bdt, leaf_idx);
+    if (hdr->kind == CV_LeafKind_NOTYPE) { continue; } // already neutral
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, kind), (U16)CV_LeafKind_NOTYPE);
+    memory_write16(MemberFromPtr(CV_LeafHeader, hdr, size), (U16)sizeof(CV_LeafKind)); // header size now covers only the kind field
+  }
+  task->closure_leaves[blob_i] = closure; // each task writes only its own blob slot
+}
+
+// Injects referenced .ifc debug-records blobs as extra "objs" in `input`, scans every
+// consuming obj's .debug$T for LF_IFC_RECORD (0x1522) leaves, registers each placeholder
+// local TI -> blob leaf redirect, and rewrites the 0x1522 leaf to NOTYPE so it is excluded
+// from the output TPI. Must run after .debug$T is parsed and before min-type-index / symbol
+// setup (which iterate input->count).
+internal void
+lnk_apply_ifc_debug_records(TP_Context *tp, TP_Arena *tp_arena, LNK_CodeViewInput *input, LNK_Config *config)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&tp_arena->v[0], 1);
+  Arena *arena = tp_arena->v[0];
+
+  // basename -> .ifc path
+  HashMap ifc_map_hm = lnk_build_ifc_map(scratch.arena, config);
+
+  // first pass: discover which .ifc files are actually referenced and by which records.
+  // de-dup .ifc reads by path; parse blob to a CV_DebugT once per .ifc.
+  HashMap   ifc_path_to_blobidx = {0}; // path -> (blob slot index + 1)
+  IFC_File *ifc_files           = push_array(scratch.arena, IFC_File, 256);
+  U64       ifc_file_count      = 0;
+  CV_DebugT blob_debug_t[256]   = {0};
+
+  // scan all consuming objs for 0x1522 leaves; collect the .ifc paths needed
+  for EachIndex(obj_idx, input->obj_count) {
+    CV_DebugT *debug_t = &input->debug_t_arr[obj_idx];
+    for EachIndex(leaf_idx, debug_t->count) {
+      CV_LeafHeader *hdr = cv_debug_t_get_leaf_header(debug_t, leaf_idx);
+      if (hdr->kind != 0x1522) { continue; }
+
+      CV_Leaf       leaf = cv_debug_t_get_leaf(debug_t, leaf_idx);
+      LNK_IfcRecord rec  = lnk_parse_ifc_record(leaf.data);
+      if (!rec.is_valid) { continue; }
+
+      String8 base = str8_skip_last_slash(rec.header_unit_path);
+      U64 bs = str8_find_needle_reverse(base, 0, str8_lit("\\"), 0);
+      if (bs) { base = str8_skip(base, bs); }
+
+      LNK_IfcMapEntry *e = hash_map_search_string_raw(&ifc_map_hm, base);
+      if (e == 0) { continue; }
+
+      if (hash_map_search_string_u64(&ifc_path_to_blobidx, e->ifc_path) == 0) {
+        if (ifc_file_count >= 256) { continue; }
+        String8  err = {0};
+        IFC_File f   = ifc_file_read(arena, e->ifc_path, &err);
+        if (!f.is_valid) {
+          lnk_error(LNK_Error_Cmdl, "/ifcDebugRecords: %S", err);
+          // still reserve a slot so we don't retry
+          hash_map_push_string_u64(scratch.arena, &ifc_path_to_blobidx, e->ifc_path, ifc_file_count + 1);
+          ifc_files[ifc_file_count] = f;
+          blob_debug_t[ifc_file_count] = (CV_DebugT){0};
+          ifc_file_count += 1;
+          continue;
+        }
+        // parse the raw CV leaf stream (no signature, TI base 0x1000)
+        CV_DebugT dt = cv_debug_t_from_data(arena, f.debug_records, 1);
+        hash_map_push_string_u64(scratch.arena, &ifc_path_to_blobidx, e->ifc_path, ifc_file_count + 1);
+        ifc_files[ifc_file_count]    = f;
+        blob_debug_t[ifc_file_count] = dt;
+        ifc_file_count += 1;
+      }
+    }
+  }
+
+  if (ifc_file_count == 0) { goto done; }
+
+  // --- inject blob objs into the parallel arrays (like type servers, but in ifc_obj_range) ---
+  U64 prev_count = input->count;
+  U64 new_count  = prev_count + ifc_file_count;
+
+  LNK_Obj  **obj_arr2     = push_array(arena, LNK_Obj *, new_count);
+  CV_DebugS *debug_s_arr2 = push_array(arena, CV_DebugS, new_count);
+  CV_DebugT *debug_t_arr2 = push_array(arena, CV_DebugT, new_count);
+  CV_DebugH *debug_h_arr2 = push_array(arena, CV_DebugH, new_count);
+  U64       *obj_to_ts2   = push_array(arena, U64,       new_count);
+
+  MemoryCopyTyped(obj_arr2,     input->obj_arr,     prev_count);
+  MemoryCopyTyped(debug_s_arr2, input->debug_s_arr, prev_count);
+  MemoryCopyTyped(debug_t_arr2, input->debug_t_arr, prev_count);
+  MemoryCopyTyped(debug_h_arr2, input->debug_h_arr, prev_count);
+  MemoryCopyTyped(obj_to_ts2,   input->obj_to_ts,   prev_count);
+  MemorySet(obj_to_ts2 + prev_count, 0xff, ifc_file_count * sizeof(U64)); // blobs are not type servers
+
+  // blob obj indices + index list for hash-deep / dedup
+  U32Array ifc_indices = { .v = push_array(arena, U32, ifc_file_count) };
+  for EachIndex(i, ifc_file_count) {
+    U64 blob_obj_idx = prev_count + i;
+    LNK_Obj *blob_obj = push_array(arena, LNK_Obj, 1);
+    blob_obj->path = ifc_files[i].path;
+    obj_arr2[blob_obj_idx]     = blob_obj;
+    debug_t_arr2[blob_obj_idx] = blob_debug_t[i];
+    ifc_indices.v[ifc_indices.count++] = (U32)blob_obj_idx;
+  }
+
+  input->count       = new_count;
+  input->obj_arr     = obj_arr2;
+  input->debug_s_arr = debug_s_arr2;
+  input->debug_t_arr = debug_t_arr2;
+  input->debug_h_arr = debug_h_arr2;
+  input->obj_to_ts   = obj_to_ts2;
+  input->ifc_obj_range = r1u64(prev_count, new_count);
+  input->ifc_indices   = ifc_indices; // hashed + deduped before int objs (see lnk_merge_types)
+
+  // --- on-demand pruning state: per blob, a "referenced" bitset of leaf indices that
+  // are reachable from some consuming obj's 0x1522 redirect (the closure roots). Only these
+  // + their transitive blob-internal deps get merged; the rest are rewritten to NOTYPE so the
+  // hash/dedup pipeline skips ~all of the ~1.5M blob leaves that nothing references. ---
+  U8 **ref_bits = push_array(scratch.arena, U8 *, ifc_file_count);
+  for EachIndex(i, ifc_file_count) {
+    U64 c = blob_debug_t[i].count;
+    ref_bits[i] = push_array(scratch.arena, U8, (c + 7) / 8); // zero-init -> nothing referenced yet
+  }
+
+  // --- second pass: register redirects + NOTYPE the 0x1522 leaves + seed closure roots ---
+  // Parallel per-obj scan (the ~1.86s serial hotspot): each worker NOTYPE-rewrites its own 0x1522
+  // leaves and emits an ordered list of resolved redirects. A serial merge then replays them in
+  // ascending obj_idx, then ascending leaf_idx -- the exact original serial order -- so the redirect
+  // hash-map push order, ref_bits seeding, and redirect_count are bit-for-bit identical to serial.
+  input->has_ifc_redirects = 1;
+  U64 redirect_count = 0;
+  {
+    LNK_IfcScanTask scan      = {0};
+    scan.input                = input;
+    scan.ifc_map_hm           = &ifc_map_hm;
+    scan.ifc_path_to_blobidx  = &ifc_path_to_blobidx;
+    scan.ifc_files            = ifc_files;
+    scan.out_recs             = push_array(scratch.arena, LNK_IfcRedirectRec *, input->obj_count);
+    scan.out_counts           = push_array(scratch.arena, U64,                  input->obj_count);
+    tp_for_parallel(tp, tp_arena, input->obj_count, lnk_ifc_scan_task, &scan);
+
+    // deterministic ordered merge: ascending obj_idx, and within an obj the records are already in
+    // ascending leaf_idx order (the worker scans leaf_idx ascending).
+    for EachIndex(obj_idx, input->obj_count) {
+      LNK_IfcRedirectRec *recs = scan.out_recs[obj_idx];
+      U64                 n    = scan.out_counts[obj_idx];
+      for EachIndex(t, n) {
+        LNK_IfcRedirectRec *r = &recs[t];
+        hash_map_push_u64_u64(arena, &input->ifc_redirect_hm,
+                              Compose64Bit(obj_idx, r->K),
+                              Compose64Bit(r->blob_obj_idx, r->blob_leaf_idx));
+        redirect_count += 1;
+        // seed closure root: this blob leaf is referenced
+        ref_bits[r->blob_i][r->blob_leaf_idx >> 3] |= (U8)(1u << (r->blob_leaf_idx & 7));
+      }
+    }
+  }
+
+  // --- third pass: per blob, close the referenced set over blob-internal sub-TIs, then
+  // NOTYPE every leaf not in the closure. cv_leaf_idx_from_ti on a raw blob is source-agnostic
+  // (source_offsets are 0, all ti_ranges == [0x1000, 0x1000+count)) so a sub-TI maps directly to
+  // leaf_idx = ti - 0x1000 regardless of its CV_TypeIndexSource label. Walk is iterative (worklist).
+  U64 total_blob_leaves = 0, total_closure_leaves = 0;
+  for EachIndex(blob_i, ifc_file_count) {
+    total_blob_leaves += input->debug_t_arr[input->ifc_obj_range.min + blob_i].count;
+  }
+  if (ifc_file_count) {
+    // Build the set of unique_names that already have a COMPLETE definition in some non-blob obj.
+    // The blob prune keeps a blob complete-def only when its name is absent here (blob-only type),
+    // so forward-refs that no normal obj can complete still get their definition (full-merge fidelity)
+    // while redundant blob copies of normally-defined types stay pruned. Size to ~2x the non-blob
+    // complete-def count, rounded up to a power of two, for low load factor.
+    U64 nonblob_complete_estimate = 0;
+    for EachIndex(obj_idx, input->ifc_obj_range.min) {
+      nonblob_complete_estimate += input->debug_t_arr[obj_idx].source_counts[CV_TypeIndexSource_TPI];
+    }
+    LNK_U64Set nonblob_complete = {0};
+    nonblob_complete.cap = 1;
+    while (nonblob_complete.cap < (nonblob_complete_estimate * 2 + 16)) { nonblob_complete.cap <<= 1; }
+    nonblob_complete.slots = push_array(scratch.arena, U64, nonblob_complete.cap);
+    // parallel scan: each non-blob obj emits its complete-def hashes; serial merge adds to the set.
+    U64 nonblob_count = input->ifc_obj_range.min;
+    if (nonblob_count) {
+      LNK_IfcCompleteScanTask scan = {0};
+      scan.input      = input;
+      scan.out_hashes = push_array(scratch.arena, U64 *, nonblob_count);
+      scan.out_counts = push_array(scratch.arena, U64,   nonblob_count);
+      tp_for_parallel(tp, tp_arena, nonblob_count, lnk_ifc_complete_scan_task, &scan);
+      // parallel atomic-CAS merge (replaces the serial lnk_u64set_add loop): output-identical
+      // because set membership is order-independent + idempotent (see lnk_u64set_add_atomic).
+      LNK_IfcSetMergeTask merge = {0};
+      merge.ranges        = tp_divide_work(scratch.arena, nonblob_count, tp->worker_count);
+      merge.out_hashes    = scan.out_hashes;
+      merge.out_counts    = scan.out_counts;
+      merge.set           = &nonblob_complete;
+      merge.nonblob_count = nonblob_count;
+      tp_for_parallel(tp, 0, tp->worker_count, lnk_ifc_set_merge_task, &merge);
+    }
+
+    LNK_IfcCloseTask close_task = {0};
+    close_task.input            = input;
+    close_task.ref_bits         = ref_bits;
+    close_task.closure_leaves   = push_array(scratch.arena, U64, ifc_file_count);
+    close_task.nonblob_complete = &nonblob_complete;
+    tp_for_parallel(tp, tp_arena, ifc_file_count, lnk_ifc_close_blob_task, &close_task);
+    for EachIndex(blob_i, ifc_file_count) { total_closure_leaves += close_task.closure_leaves[blob_i]; }
+  }
+  (void)tp;
+
+  lnk_log(LNK_Log_Debug, "[IFC] injected %llu .ifc blob(s), %llu record redirect(s); on-demand closure %llu / %llu blob leaves (%.1f%%)",
+          ifc_file_count, redirect_count, total_closure_leaves, total_blob_leaves,
+          total_blob_leaves ? (100.0 * (F64)total_closure_leaves / (F64)total_blob_leaves) : 0.0);
+
+done:
+  scratch_end(scratch);
+  ProfEnd();
+}
+
+////////////////////////////////
+// parallel setup tasks for lnk_make_code_view_input
+
+// Loop 3 (PCH/ext/int classification). The expensive predicates -- the read-only rrt_hm lookup
+// and cv_debug_t_is_type_server_ref -- run in parallel per obj. Each obj's class tag and PCH-merge
+// mutation are fully independent, so this pass is data-parallel. The ordered 3-array compaction
+// and the MultipleDebugTAndDebugP warning are then replayed SERIALLY in obj_idx order, so output
+// (array contents/order + warning order + discarded set) is byte-identical to the serial loop.
+typedef struct LNK_CvClassifyTask // shared context for lnk_cv_classify_task; every array is indexed by obj_idx (== task_id)
+{
+  LNK_CodeViewInput *input;       // task reads and may overwrite/zero input->debug_t_arr[obj_idx]
+  CV_DebugT         *debug_p_arr; // per-obj debug_p types; the entry is zeroed when the obj has both debug_t and debug_p
+  HashMap           *rrt_hm; // read-only after build
+  LNK_Obj          **obj_arr;     // obj_arr[obj_idx]->path is the rrt_hm lookup key
+  U8                *class_tag; // 0=debug_p, 1=ext, 2=int
+  U8                *warn_multi;  // out: set to 1 when both debug_t and debug_p have leaves (warning is emitted by the caller)
+} LNK_CvClassifyTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_classify_task)
+{
+  LNK_CvClassifyTask *ctx    = raw_task;
+  U64                 idx    = task_id;
+  CV_DebugT          *types  = &ctx->input->debug_t_arr[idx];
+  CV_DebugT          *pch    = &ctx->debug_p_arr[idx];
+  B32                 only_p = (pch->count > 0 && types->count == 0); // obj carries debug_p leaves but no debug_t leaves
+
+  // pick the class tag; precedence: path found in rrt_hm -> 1, debug_p only -> 0, type-server ref -> 1, otherwise 2
+  U8 cls = 2;
+  if      (hash_map_search_path_u64(ctx->rrt_hm, ctx->obj_arr[idx]->path)) { cls = 1; }
+  else if (only_p)                                                         { cls = 0; }
+  else if (cv_debug_t_is_type_server_ref(types))                           { cls = 1; }
+  ctx->class_tag[idx] = cls;
+
+  // per-obj fix-up, independent of the tag: adopt debug_p as the obj's types, or drop both when both are present
+  if (only_p) {
+    *types = *pch;
+  } else if (types->count && pch->count) {
+    ctx->warn_multi[idx] = 1; // only flag it here; the serial obj-order pass emits the warning
+    MemoryZeroStruct(types);
+    MemoryZeroStruct(pch);
+  }
+}
+
+// Loop 4 (Make Symbol Inputs) count pass. cv_sub_section_from_debug_s is a pure read of the
+// already-parsed data_list, so caching each obj's Symbols sub-section list in parallel is safe.
+typedef struct LNK_CvSymTask // shared context for lnk_cv_sym_count_task and lnk_cv_sym_fill_task; arrays are indexed by obj_idx
+{
+  LNK_CodeViewInput *input;        // count pass reads input->debug_s_arr; fill pass writes input->symbol_inputs
+  String8List       *per_obj_syms; // per-obj Symbols sub-section list: written by the count pass, read by the fill pass
+  U64               *counts;   // per-obj node_count (count pass)
+  U64               *offsets;  // per-obj symbol_inputs offset (fill pass)
+} LNK_CvSymTask;
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_sym_count_task)
+{
+  LNK_CvSymTask *ctx  = raw_task;
+  String8List   *slot = &ctx->per_obj_syms[task_id]; // this obj's cache entry, reused by the fill pass
+  *slot = cv_sub_section_from_debug_s(ctx->input->debug_s_arr[task_id], CV_C13SubSectionKind_Symbols);
+  ctx->counts[task_id] = slot->node_count;
+}
+
+// Loop 4 fill pass. Each obj writes a disjoint, contiguous range of symbol_inputs starting at its
+// prefix-sum offset, in node order -- byte-identical to the serial append (which walked obj_idx
+// ascending, each obj's nodes in list order).
+internal
+THREAD_POOL_TASK_FUNC(lnk_cv_sym_fill_task)
+{
+  LNK_CvSymTask   *ctx  = raw_task;
+  LNK_SymbolInput *dst  = ctx->input->symbol_inputs + ctx->offsets[task_id]; // first output slot owned by this obj
+  String8Node     *head = ctx->per_obj_syms[task_id].first;
+  // emit one symbol input per cached node, in list order, advancing through this obj's contiguous range
+  for EachNode(node, String8Node, head) {
+    dst->obj_idx     = task_id;
+    dst->raw_symbols = node->string;
+    dst += 1;
+  }
+}
+
 internal LNK_CodeViewInput
 lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config, U64 obj_count, LNK_Obj **obj_arr, LNK_RRT_Array rrt_input)
 {
@@ -634,47 +1309,51 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config,
 
   ProfBegin("Apply RRT to Objs");
 
-  // hash map (obj path, obj idx)
+  // hash map (obj path, obj idx). Kept SERIAL: HashMap is a 4-ary trie whose insert mutates shared
+  // child pointers + arena-allocates nodes -> not safe for concurrent insert. Only built (and
+  // consulted) when there is at least one input RRT; the monolithic Engine.dll link has none.
   HashMap obj_path_hm = {0};
-  for EachIndex(obj_idx, obj_count) {
-    hash_map_push_path_u64(scratch.arena, &obj_path_hm, obj_arr[obj_idx]->path, obj_idx);
-  }
+  if (rrt_input.count) {
+    for EachIndex(obj_idx, obj_count) {
+      hash_map_push_path_u64(scratch.arena, &obj_path_hm, obj_arr[obj_idx]->path, obj_idx);
+    }
 
-  for EachIndex(obj_idx, obj_count) {
-    LNK_Obj *obj            = obj_arr[obj_idx];
-    U64     *packed_rrt_idx = hash_map_search_path_u64(&rrt_hm, obj->path);
-
-    // obj is not part of any input RRT
-    if (packed_rrt_idx == 0) { continue; }
-
-    // unpack index
-    U32      rrt_idx     = *packed_rrt_idx >> 32;
-    U32      rrt_obj_idx = *packed_rrt_idx & max_U32;
-    LNK_RRT *rrt         = &rrt_input.v[rrt_idx];
-
-    // obj was recompiled, do not apply RRT indirection
-    FileProperties obj_file_props = properties_from_file_path(obj->path);
-    if (rrt->obj_time_stamps[rrt_obj_idx] != obj_file_props.modified) { continue; }
-
-    // invalidate debug section pointers
-    obj->debug_t_sect_idx = ~0;
-    obj->debug_p_sect_idx = ~0;
-    obj->debug_h_sect_idx = ~0;
-
-    // apply type index map 
-    obj->ti_range = rrt->obj_ti_ranges[rrt_obj_idx];
-    obj->ti_map   = rrt->obj_ti_maps  [rrt_obj_idx];
-
-    // apply PCH info
-    U32 rrt_pch_obj_idx = rrt->obj_pch_indices[rrt_obj_idx];
-    if (rrt_pch_obj_idx < rrt->obj_count) {
-      String8  rrt_pch_obj_path = rrt->obj_paths.v[rrt_pch_obj_idx];
-      U64      pch_obj_idx      = *hash_map_search_path_u64(&obj_path_hm, rrt_pch_obj_path);
-      obj->pch_ti_range = rrt->obj_pch_ti_ranges[rrt_obj_idx];
-      obj->pch_obj_idx  = pch_obj_idx;
-    } else {
-      obj->pch_ti_range = r1u64(0,0);
-      obj->pch_obj_idx  = ~0;
+    for EachIndex(obj_idx, obj_count) {
+      LNK_Obj *obj            = obj_arr[obj_idx];
+      U64     *packed_rrt_idx = hash_map_search_path_u64(&rrt_hm, obj->path);
+
+      // obj is not part of any input RRT
+      if (packed_rrt_idx == 0) { continue; }
+
+      // unpack index
+      U32      rrt_idx     = *packed_rrt_idx >> 32;
+      U32      rrt_obj_idx = *packed_rrt_idx & max_U32;
+      LNK_RRT *rrt         = &rrt_input.v[rrt_idx];
+
+      // obj was recompiled, do not apply RRT indirection
+      FileProperties obj_file_props = properties_from_file_path(obj->path);
+      if (rrt->obj_time_stamps[rrt_obj_idx] != obj_file_props.modified) { continue; }
+
+      // invalidate debug section pointers
+      obj->debug_t_sect_idx = ~0;
+      obj->debug_p_sect_idx = ~0;
+      obj->debug_h_sect_idx = ~0;
+
+      // apply type index map
+      obj->ti_range = rrt->obj_ti_ranges[rrt_obj_idx];
+      obj->ti_map   = rrt->obj_ti_maps  [rrt_obj_idx];
+
+      // apply PCH info
+      U32 rrt_pch_obj_idx = rrt->obj_pch_indices[rrt_obj_idx];
+      if (rrt_pch_obj_idx < rrt->obj_count) {
+        String8  rrt_pch_obj_path = rrt->obj_paths.v[rrt_pch_obj_idx];
+        U64      pch_obj_idx      = *hash_map_search_path_u64(&obj_path_hm, rrt_pch_obj_path);
+        obj->pch_ti_range = rrt->obj_pch_ti_ranges[rrt_obj_idx];
+        obj->pch_obj_idx  = pch_obj_idx;
+      } else {
+        obj->pch_ti_range = r1u64(0,0);
+        obj->pch_obj_idx  = ~0;
+      }
     }
   }
   ProfEnd();
@@ -781,25 +1460,35 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config,
   ProfEnd();
 
   // sort objs based on type: PCH, /Zi (external), /Z7 (internal)
-  input.debug_p_indices.v = push_array(tp_arena->v[0], U32, obj_count); 
+  input.debug_p_indices.v = push_array(tp_arena->v[0], U32, obj_count);
   input.ext_obj_indices.v = push_array(tp_arena->v[0], U32, obj_count);
   input.int_obj_indices.v = push_array(tp_arena->v[0], U32, obj_count);
-  for EachIndex(obj_idx, obj_count) {
-    CV_DebugT *debug_t = &input.debug_t_arr[obj_idx];
-    CV_DebugT *debug_p = &debug_p_arr[obj_idx];
-    U32Array  *arr_ptr;
-    if      (hash_map_search_path_u64(&rrt_hm, obj_arr[obj_idx]->path)) { arr_ptr = &input.ext_obj_indices; }
-    else if (debug_p->count > 0 && debug_t->count == 0)                 { arr_ptr = &input.debug_p_indices; }
-    else if (cv_debug_t_is_type_server_ref(debug_t))                    { arr_ptr = &input.ext_obj_indices; }
-    else                                                                { arr_ptr = &input.int_obj_indices; }
-    arr_ptr->v[arr_ptr->count++] = obj_idx;
+  ProfScope("Classify Objs")
+  {
+    // parallel: classify each obj + apply per-obj debug_t mutation (see lnk_cv_classify_task)
+    LNK_CvClassifyTask classify = {0};
+    classify.input       = &input;
+    classify.debug_p_arr = debug_p_arr;
+    classify.rrt_hm      = &rrt_hm;
+    classify.obj_arr     = obj_arr;
+    classify.class_tag   = push_array(scratch.arena, U8, obj_count ? obj_count : 1);
+    classify.warn_multi  = push_array(scratch.arena, U8, obj_count ? obj_count : 1);
+    tp_for_parallel_prof(tp, 0, obj_count, lnk_cv_classify_task, &classify, "Classify Objs (parallel)");
+
+    // serial obj-order compaction into the 3 ordered arrays + deterministic warning emission.
+    // Cache-linear single pass; preserves the exact element order + warning order of the old loop.
+    for EachIndex(obj_idx, obj_count) {
+      U32Array *arr_ptr;
+      switch (classify.class_tag[obj_idx]) {
+        case 0:  arr_ptr = &input.debug_p_indices; break;
+        case 1:  arr_ptr = &input.ext_obj_indices; break;
+        default: arr_ptr = &input.int_obj_indices; break;
+      }
+      arr_ptr->v[arr_ptr->count++] = obj_idx;
 
-    if (debug_t->count == 0 && debug_p->count > 0) {
-      *debug_t = *debug_p;
-    } else if (debug_t->count && debug_p->count) {
-      lnk_error_obj(LNK_Warning_MultipleDebugTAndDebugP, obj_arr[obj_idx], "multiple sections with debug types detected, obj must have either .debug$T or .debug$P; discarding both sections");
-      MemoryZeroStruct(debug_t);
-      MemoryZeroStruct(debug_p);
+      if (classify.warn_multi[obj_idx]) {
+        lnk_error_obj(LNK_Warning_MultipleDebugTAndDebugP, obj_arr[obj_idx], "multiple sections with debug types detected, obj must have either .debug$T or .debug$P; discarding both sections");
+      }
     }
   }
 
@@ -1059,6 +1748,12 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config,
   }
   ProfEnd();
 
+  // resolve MSVC header-unit IFC debug records (LF_IFC_RECORD 0x1522) -> real CodeView types.
+  // injects .ifc debug-records blobs as extra objs and registers placeholder-TI redirects.
+  if (config->ifc_debug_records == LNK_SwitchState_Yes && config->ifc_map_list.node_count) {
+    lnk_apply_ifc_debug_records(tp, tp_arena, &input, config);
+  }
+
   // set default min type index
   for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { input.min_type_indices[ti_source] = CV_MinComplexTypeIndex; }
 
@@ -1075,25 +1770,24 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config,
 
   ProfBegin("Make Symbol Inputs");
   {
-    // count symbol blocks
-    for EachIndex(obj_idx, input.count) {
-      String8List s = cv_sub_section_from_debug_s(input.debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols);
-      input.symbol_input_count += s.node_count;
-    }
-  
+    // count symbol blocks (cache each obj's Symbols sub-section list so the fill pass below
+    // does not re-decode .debug$S a second time -- cv_sub_section_from_debug_s walks subsections).
+    String8List  *per_obj_syms = push_array(scratch.arena, String8List, input.count ? input.count : 1);
+    LNK_CvSymTask sym_task      = {0};
+    sym_task.input        = &input;
+    sym_task.per_obj_syms = per_obj_syms;
+    sym_task.counts       = push_array(scratch.arena, U64, input.count ? input.count : 1);
+
+    // parallel: cache each obj's Symbols sub-section list + count nodes
+    tp_for_parallel_prof(tp, 0, input.count, lnk_cv_sym_count_task, &sym_task, "Count Symbol Inputs");
+    input.symbol_input_count = sum_array_u64(input.count, sym_task.counts);
+    sym_task.offsets         = offsets_from_counts_array_u64(scratch.arena, sym_task.counts, input.count);
+
     // alloc block pointers
-    input.symbol_inputs = push_array_no_zero(tp_arena->v[0], LNK_SymbolInput, input.symbol_input_count);
+    input.symbol_inputs = push_array_no_zero(tp_arena->v[0], LNK_SymbolInput, input.symbol_input_count ? input.symbol_input_count : 1);
 
-    U64 symbol_input_count = 0;
-    for EachIndex(obj_idx, input.count) {
-      String8List s = cv_sub_section_from_debug_s(input.debug_s_arr[obj_idx], CV_C13SubSectionKind_Symbols);
-      for EachNode(n, String8Node, s.first) {
-        Assert(symbol_input_count < input.symbol_input_count);
-        LNK_SymbolInput *in = &input.symbol_inputs[symbol_input_count++];
-        in->obj_idx     = obj_idx;
-        in->raw_symbols = n->string;
-      }
-    }
+    // parallel fill into disjoint per-obj ranges at prefix-sum offsets (byte-identical order)
+    tp_for_parallel_prof(tp, 0, input.count, lnk_cv_sym_fill_task, &sym_task, "Fill Symbol Inputs");
 
     ProfBegin("Make Ranges");
     U64 total_input_size = 0;
@@ -1124,6 +1818,16 @@ lnk_make_code_view_input(TP_Context *tp, TP_Arena *tp_arena, LNK_Config *config,
 internal LNK_LeafRef
 lnk_leaf_ref_from_ti(LNK_CodeViewInput *input, U32 obj_idx, CV_TypeIndexSource source, CV_TypeIndex ti)
 {
+  // IFC redirect: a consuming obj's local LF_IFC_RECORD placeholder TI is mapped
+  // to a leaf inside an injected .ifc debug-records blob obj. The blob leaves then
+  // dedup/hash/fixup natively through the rest of this function.
+  if (input->has_ifc_redirects && source == CV_TypeIndexSource_TPI) {
+    U64 *packed = hash_map_search_u64_u64(&input->ifc_redirect_hm, Compose64Bit(obj_idx, ti));
+    if (packed) {
+      return (LNK_LeafRef){ (U32)(*packed >> 32), (U32)(*packed & max_U32) };
+    }
+  }
+
   // ti range: external type server
   U64 ts_idx = input->obj_to_ts[obj_idx];
   if (ts_idx != max_U64) {
@@ -1370,29 +2074,23 @@ lnk_hash_cv_leaf_deep(Arena               *arena,
   temp_end(temp);
 }
 
-internal LNK_LeafRef *
-lnk_leaf_hash_table_search(LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref)
+// Returns the type index stored for leaf_ref's hash, or 0 if absent (an empty slot is hit, or every
+// slot was probed without a hash match). Dedup is by hash (lnk_match_leaf_ref is a_hash==b_hash), so the
+// assigned-ti table is keyed purely by hash: the occupant hash sits on the slot, so a probe needs no leaf deref.
+internal CV_TypeIndex
+lnk_leaf_hash_table_search_ti(LNK_AssignedTiHash *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref)
 {
-  LNK_LeafRef *match = 0;
-
-  CV_DebugT *debug_t         = &input->debug_t_arr[leaf_ref.obj_idx];
-  CV_DebugH *debug_h         = &input->debug_h_arr[leaf_ref.obj_idx];
-  U64        hash            = debug_h->v[leaf_ref.leaf_idx];
-  U64        best_bucket_idx = hash % ht->cap;
-  U64        bucket_idx      = best_bucket_idx;
+  CV_DebugH *debug_h  = &input->debug_h_arr[leaf_ref.obj_idx]; // hash array of the obj that owns the leaf
+  U64        hash     = debug_h->v[leaf_ref.leaf_idx];         // lookup key
+  U64        best_idx = hash & (ht->cap - 1); // cap is pow2; NOTE(review): cap == 0 would make the mask all-ones -- confirm cap >= 1 where the table is allocated
+  U64        idx      = best_idx;             // linear probe cursor; wraps to 0 at cap and stops after one full lap
   do {
-    LNK_LeafRef *bucket = ht->bucket_arr[bucket_idx];
-    if (bucket == 0) { break; }
-
-    if (lnk_match_leaf_ref(input, *bucket, leaf_ref)) {
-      match = bucket;
-      break;
-    }
-
-    bucket_idx = (bucket_idx + 1) == ht->cap ? 0 : (bucket_idx + 1);
-  } while (bucket_idx != best_bucket_idx);
+    if (ht->ti_arr[idx] == 0) { break; } // empty slot -> not present
+    if (ht->hash_arr[idx] == hash) { return ht->ti_arr[idx]; } // occupied slot with the same hash -> found
+    idx = (idx + 1) == ht->cap ? 0 : (idx + 1);
+  } while (idx != best_idx);
 
-  return match;
+  return 0; // not found
 }
 
 internal
@@ -1419,8 +2117,12 @@ THREAD_POOL_TASK_FUNC(lnk_hash_debug_t_deep_task)
   LNK_MergeTypes *task    = raw_task;
   U64             obj_idx = task->indices.v[task_id];
   CV_DebugT      *debug_t = &task->input->debug_t_arr[obj_idx];
+  // on-demand IFC pruning: leaves the closure didn't reach were rewritten to NOTYPE.
+  // skip them as deep-hash roots so we don't hash/bucket ~all of the ~1.5M blob leaves.
+  B32 is_ifc_blob = task->input->has_ifc_redirects && contains_1u64(task->input->ifc_obj_range, obj_idx);
   for EachIndex(leaf_idx, debug_t->count) {
     if (task->input->debug_h_arr[obj_idx].v[leaf_idx] != 0) { continue; }
+    if (is_ifc_blob && cv_debug_t_get_leaf_header(debug_t, leaf_idx)->kind == CV_LeafKind_NOTYPE) { continue; }
     Temp                 temp    = temp_begin(task->fixed_arenas[worker_id]);
     CV_Leaf              leaf    = cv_debug_t_get_leaf(debug_t, leaf_idx);
     CV_TypeIndexInfoList ti_list = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);
@@ -1454,7 +2156,7 @@ THREAD_POOL_TASK_FUNC(lnk_populate_leaf_ht)
     CV_LeafKind         kind        = memory_read16(MemberFromPtr(CV_LeafHeader, header, kind)); // leaf header -> leaf kind
     CV_TypeIndexSource  leaf_source = cv_type_index_source_from_leaf_kind(kind);                 // leaf kind -> type stream
     LNK_LeafHashTable  *leaf_ht     = &task->leaf_ht_arr[leaf_source];                           // type stream -> hash table
-    U64                 best_idx    = debug_h->v[leaf_idx] % leaf_ht->cap;                       // leaf ref -> hash -> bucket index
+    U64                 best_idx    = debug_h->v[leaf_idx] & (leaf_ht->cap - 1);                  // leaf ref -> hash -> bucket index (cap pow2)
     U64                 idx         = best_idx;
     do {
       // load leaf ref
@@ -1495,8 +2197,14 @@ THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_task)
   CV_DebugH      *debug_h = &task->input->debug_h_arr[obj_idx];
   ProfBeginDynamic("dedup in obj 0x%llx (%.*s) leaf count %llu", obj_idx, str8_varg(task->input->obj_arr[obj_idx]->path), debug_t->count);
 
+  // on-demand IFC pruning: NOTYPE'd blob leaves were never hashed (hash==0). skip them so we
+  // don't insert ~1.5M zero-hash leaves into the NULL-source table (whose cap is sized from the
+  // pre-prune source_counts and would overflow). they are never emitted, so nothing references them.
+  B32 is_ifc_blob = task->input->has_ifc_redirects && contains_1u64(task->input->ifc_obj_range, obj_idx);
+
   LNK_LeafRef *bucket = 0;
   for EachIndex(leaf_idx, debug_t->count) {
+    if (is_ifc_blob && cv_debug_t_get_leaf_header(debug_t, leaf_idx)->kind == CV_LeafKind_NOTYPE) { continue; }
     // alloc new bucket and assign type ref
     if (bucket == 0) { bucket = push_array_no_zero(arena, LNK_LeafRef, 1); }
 
@@ -1509,7 +2217,7 @@ THREAD_POOL_TASK_FUNC(lnk_leaf_dedup_task)
     CV_LeafKind         kind        = memory_read16(MemberFromPtr(CV_LeafHeader, header, kind)); // leaf header -> leaf kind
     CV_TypeIndexSource  leaf_source = cv_type_index_source_from_leaf_kind(kind);                 // leaf kind -> type stream
     LNK_LeafHashTable  *leaf_ht     = &task->leaf_ht_arr[leaf_source];                           // type stream -> hash table
-    U64                 best_idx    = debug_h->v[leaf_idx] % leaf_ht->cap;                       // leaf ref -> hash -> bucket index
+    U64                 best_idx    = debug_h->v[leaf_idx] & (leaf_ht->cap - 1);                  // leaf ref -> hash -> bucket index (cap pow2)
     U64                 idx         = best_idx;
     do {
       // load leaf ref
@@ -1744,57 +2452,39 @@ THREAD_POOL_TASK_FUNC(lnk_assign_type_indices_task)
 {
   LNK_MergeTypes *task  = raw_task;
 
-  CV_TypeIndexSource  ti_source         = task->ti_source;
-  LNK_LeafRefArray    unique_leaf_refs  = task->unique_leaf_refs_arr[ti_source];
-  CV_TypeIndex        min_type_index    = task->min_type_indices[ti_source];
-  U64                 assigned_type_cap = task->assigned_type_caps[ti_source];
-  CV_TypeIndex       *assigned_type_ht  = task->assigned_type_hts[ti_source];
+  CV_TypeIndexSource  ti_source        = task->ti_source;
+  LNK_LeafRefArray    unique_leaf_refs = task->unique_leaf_refs_arr[ti_source];
+  CV_TypeIndex        min_type_index   = task->min_type_indices[ti_source];
+  LNK_AssignedTiHash *at               = &task->assigned_ti_arr[ti_source];
+  CV_DebugH          *debug_h_arr      = task->input->debug_h_arr;
 
+  // Insert each unique leaf's assigned type index into the (unique-sized) hash->ti table, keyed by leaf
+  // hash. Unique leaves have distinct hashes (dedup is by hash), so each claims its own empty slot.
+  // search_ti (in the later fixup phase, after this barrier) recovers ti in one deref-free probe.
   for EachInRange(i, task->ranges[task_id]) {
     LNK_LeafRef  *leaf_ref   = unique_leaf_refs.v[i];
     CV_TypeIndex  type_index = min_type_index + i;
 
-    U64 hash     = u64_hash_from_str8(str8_struct(leaf_ref));
-    U64 best_idx = hash % assigned_type_cap;
+    U64 hash     = debug_h_arr[leaf_ref->obj_idx].v[leaf_ref->leaf_idx];
+    U64 best_idx = hash & (at->cap - 1); // cap is pow2
     U64 idx      = best_idx;
 
-    B32 is_inserted = 0;
+    B32 is_assigned = 0;
     do {
-      CV_TypeIndex curr_type_index = assigned_type_ht[idx];
-      if (curr_type_index == 0) {
-        CV_TypeIndex cmp_type_index = ins_atomic_u32_eval_cond_assign(&assigned_type_ht[idx], type_index, curr_type_index);
-        if (cmp_type_index == curr_type_index) {
-          is_inserted = 1;
+      if (at->ti_arr[idx] == 0) {
+        CV_TypeIndex cmp = ins_atomic_u32_eval_cond_assign(&at->ti_arr[idx], type_index, 0);
+        if (cmp == 0) {
+          at->hash_arr[idx] = hash; // only this worker owns the slot now; read back in the fixup phase
+          is_assigned = 1;
           break;
         }
       }
-      // advance
-      idx = (idx + 1) == assigned_type_cap ? 0 : (idx + 1);
+      idx = (idx + 1) == at->cap ? 0 : (idx + 1);
     } while (idx != best_idx);
-    Assert(is_inserted);
+    Assert(is_assigned);
   }
 }
 
-internal CV_TypeIndex
-lnk_assigned_type_ht_search(U64 cap, CV_TypeIndex *ht, CV_TypeIndex min_type_index, LNK_LeafRefArray unique_leaf_refs, LNK_LeafRef *v, U64 hash)
-{
-  U64 best_idx = hash % cap;
-  U64 idx      = best_idx;
-  do {
-    CV_TypeIndex type_index = ht[idx];
-    if (type_index < min_type_index) { break; }
-
-    U64          leaf_idx = type_index - min_type_index;
-    LNK_LeafRef *compar   = unique_leaf_refs.v[leaf_idx];
-    if (MemoryMatchStruct(compar,v)) { return type_index; }
-      
-    idx = (idx + 1) == cap ? 0 : (idx + 1);
-  } while(idx != best_idx);
-
-  InvalidPath;
-  return 0;
-}
-
 internal void
 lnk_fixup_cv_type_indices(LNK_MergeTypes *ctx, U32 obj_idx, String8 data, CV_TypeIndexInfoList ti_info_list)
 {
@@ -1805,21 +2495,11 @@ lnk_fixup_cv_type_indices(LNK_MergeTypes *ctx, U32 obj_idx, String8 data, CV_Typ
     // skip basic types
     if (ti < ctx->input->min_type_indices[n->source]) { continue; }
 
-    CV_TypeIndex final_ti = 0;
-    LNK_LeafRef        leaf_ref   = lnk_leaf_ref_from_ti(ctx->input, obj_idx, n->source, ti);
-    LNK_LeafHashTable *leaf_ht    = &ctx->leaf_ht_arr[n->source];
-    LNK_LeafRef       *final_leaf = lnk_leaf_hash_table_search(leaf_ht, ctx->input, leaf_ref);
-    if (final_leaf) {
-      U64 final_hash = u64_hash_from_str8(str8_struct(final_leaf));
-      final_ti = lnk_assigned_type_ht_search(ctx->assigned_type_caps  [n->source],
-                                             ctx->assigned_type_hts   [n->source],
-                                             ctx->min_type_indices    [n->source],
-                                             ctx->unique_leaf_refs_arr[n->source],
-                                             final_leaf,
-                                             final_hash);
-    }
+    LNK_LeafRef         leaf_ref = lnk_leaf_ref_from_ti(ctx->input, obj_idx, n->source, ti);
+    LNK_AssignedTiHash *assigned = &ctx->assigned_ti_arr[n->source];
+    CV_TypeIndex        final_ti = lnk_leaf_hash_table_search_ti(assigned, ctx->input, leaf_ref);
 #if BUILD_DEBUG
-    else {
+    if (final_ti == 0) {
       lnk_error_obj(LNK_Error_InvalidTypeIndex, ctx->input->obj_arr[obj_idx], "no itype 0x%x", ti);
     }
 #endif
@@ -1988,23 +2668,9 @@ THREAD_POOL_TASK_FUNC(lnk_build_obj_ti_map)
   for EachIndex(leaf_idx, debug_t->count) {
     CV_Leaf            leaf       = cv_debug_t_get_leaf(debug_t, leaf_idx);
     CV_TypeIndexSource source     = cv_type_index_source_from_leaf_kind(leaf.kind);
-    LNK_LeafRef        leaf_ref   = { obj_idx, leaf_idx };
-    LNK_LeafHashTable *leaf_ht    = &task->leaf_ht_arr[source];
-    LNK_LeafRef       *final_leaf = lnk_leaf_hash_table_search(leaf_ht, input, leaf_ref);
-
-    if (final_leaf) {
-      U64          final_hash = u64_hash_from_str8(str8_struct(final_leaf));
-      CV_TypeIndex final_ti   = lnk_assigned_type_ht_search(task->assigned_type_caps  [source],
-                                                            task->assigned_type_hts   [source],
-                                                            task->min_type_indices    [source],
-                                                            task->unique_leaf_refs_arr[source],
-                                                            final_leaf,
-                                                            final_hash);
-
-      obj_ti_map[leaf_idx] = final_ti;
-    } else {
-      obj_ti_map[leaf_idx] = 0;
-    }
+    LNK_LeafRef         leaf_ref = { obj_idx, leaf_idx };
+    LNK_AssignedTiHash *assigned = &task->assigned_ti_arr[source];
+    obj_ti_map[leaf_idx] = lnk_leaf_hash_table_search_ti(assigned, input, leaf_ref);
   }
 
   task->result.obj_ti_maps[obj_idx] = obj_ti_map;
@@ -2028,6 +2694,7 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
       U32Array     indices;
       U32Array     hash_indices;
     } hash_targets[] = {
+      { lnk_hash_debug_t_deep_task, input->ifc_indices         }, // hash .ifc blobs first: int-obj leaves redirect into them
       { lnk_hash_debug_t_task,      input->debug_p_indices     }, // hash .debug$P first so we can mix in hashes for precompiled sub leaves when hashing leaves in .debug$T
       { lnk_hash_debug_t_task,      input->int_obj_indices     },
       { lnk_hash_debug_t_deep_task, input->type_server_indices },
@@ -2084,14 +2751,24 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
   }
   ProfEnd();
 
+  // bucket_arr (the ~1.3x-total-leaf-count probe tables) is only live through the dedup + extract
+  // phases: its last read is in lnk_get_present_buckets_task ("Copy present buckets") which copies
+  // bucket pointers into unique_leaf_refs. Allocate it in a dedicated arena so we can release that
+  // multi-GB working set immediately after the extract loop, before the merge-types/PDB-build peak.
+  // (A temp_begin on scratch.arena would not work: many surviving allocations -- unique_leaf_refs,
+  // assigned_ti, radix scratch -- land in scratch.arena after bucket_arr.)
+  Arena *bucket_arena = arena_alloc(.name = "LEAF_BUCKETS");
+
   ProfBegin("Leaf Hash Table Init");
   for EachIndex(ti_source, CV_TypeIndexSource_COUNT) {
     U64 total_count = 0;
     for EachIndex(obj_idx, input->count) { total_count += input->debug_t_arr[obj_idx].source_counts[ti_source]; }
 
     task.leaf_ht_arr[ti_source].cap = total_count;
-    task.leaf_ht_arr[ti_source].cap = 1 + ((task.leaf_ht_arr[ti_source].cap * 13) / 10); // * 1.3
-    task.leaf_ht_arr[ti_source].bucket_arr = push_array(scratch.arena, LNK_LeafRef *, task.leaf_ht_arr[ti_source].cap);
+    // pow2 cap so bucket index is hash & (cap-1) (mask) instead of hash % cap (a 64-bit DIV in the
+    // densest dedup probe loop). cap >= 1.3*count, so the load factor stays <= ~0.77 (1/1.3).
+    task.leaf_ht_arr[ti_source].cap = u64_up_to_pow2(1 + ((task.leaf_ht_arr[ti_source].cap * 13) / 10)); // * 1.3, pow2
+    task.leaf_ht_arr[ti_source].bucket_arr = push_array(bucket_arena, LNK_LeafRef *, task.leaf_ht_arr[ti_source].cap);
 
 #if PROFILE_TELEMETRY
     tmMessage(0, TMMF_ICON_NOTE, "%.*s Bucket Count: %.*s", str8_varg(cv_string_from_type_index_source(ti_source)), str8_varg(str8_from_count(scratch.arena, task.leaf_ht_arr[ti_source].cap)));
@@ -2136,6 +2813,9 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
 
   task.indices = dedup_type_server_indices;
   tp_for_parallel_prof(tp, tp_temp, task.indices.count, lnk_leaf_dedup_task, &task, "Type Servers");
+
+  task.indices = input->ifc_indices;
+  tp_for_parallel_prof(tp, tp_temp, task.indices.count, lnk_leaf_dedup_task, &task, "IFC Blobs");
   ProfEnd();
 
   ProfBegin("Extract present buckets from the leaf hash tables");
@@ -2148,6 +2828,12 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
 
     task.unique_leaf_refs_arr[ti_source].count = sum_array_u64(tp->worker_count, task.counts[ti_source]);
     task.unique_leaf_refs_arr[ti_source].v     = push_array_no_zero(scratch.arena, LNK_LeafRef *, task.unique_leaf_refs_arr[ti_source].count);
+
+    // assigned-ti table sized to the unique (deduped) count -- not the total leaf count, which would
+    // add the bucket-parallel ti/hash arrays' worth of peak working set (~3GB on large links)
+    task.assigned_ti_arr[ti_source].cap      = u64_up_to_pow2(1 + ((task.unique_leaf_refs_arr[ti_source].count * 13) / 10)); // * 1.3, pow2 -> mask index
+    task.assigned_ti_arr[ti_source].ti_arr   = push_array(scratch.arena, CV_TypeIndex, task.assigned_ti_arr[ti_source].cap);
+    task.assigned_ti_arr[ti_source].hash_arr = push_array(scratch.arena, U64, task.assigned_ti_arr[ti_source].cap);
     task.offsets[ti_source]                    = offsets_from_counts_array_u64(scratch.arena, task.counts[ti_source], tp->worker_count);
     tp_for_parallel_prof(tp, 0, tp->worker_count, lnk_get_present_buckets_task, &task, "Copy present buckets");
 
@@ -2234,6 +2920,11 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
     }
   }
 
+  // bucket_arr is fully consumed (copied into unique_leaf_refs / sorted) -- release the probe tables
+  // now so this multi-GB working set is gone before the merge-types/PDB-build peak.
+  arena_release(bucket_arena);
+  for EachIndex(ti_source, CV_TypeIndexSource_COUNT) { task.leaf_ht_arr[ti_source].bucket_arr = 0; }
+
   #if PROFILE_TELEMETRY
   tmMessage(0, TMMF_ICON_NOTE, "TPI Count: %.*s", str8_varg(str8_from_count(scratch.arena, task.unique_leaf_refs_arr[CV_TypeIndexSource_TPI].count)));
   tmMessage(0, TMMF_ICON_NOTE, "IPI Count: %.*s", str8_varg(str8_from_count(scratch.arena, task.unique_leaf_refs_arr[CV_TypeIndexSource_IPI].count)));
@@ -2246,8 +2937,6 @@ lnk_merge_types(TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK
     ProfBegin("Assign type indices");
     for EachIndex(ti_source, CV_TypeIndexSource_COUNT) {
       task.ti_source                     = ti_source;
-      task.assigned_type_caps[ti_source] = (task.unique_leaf_refs_arr[ti_source].count * 13) / 10;
-      task.assigned_type_hts [ti_source] = push_array(scratch.arena, CV_TypeIndex, task.assigned_type_caps[ti_source]);
       task.min_type_indices  [ti_source] = CV_MinComplexTypeIndex;
       task.ranges                        = tp_divide_work(scratch.arena, task.unique_leaf_refs_arr[ti_source].count, tp->worker_count);
       tp_for_parallel_prof(tp, 0, tp->worker_count, lnk_assign_type_indices_task, &task, "Assign Type Indices");
@@ -3132,6 +3821,321 @@ THREAD_POOL_TASK_FUNC(lnk_push_dbi_sec_contrib_task)
   }
 }
 
+////////////////////////////////
+// Type Garbage Collection
+//
+// After type merging, prune merged TPI/IPI leaves that are not reachable from any surviving
+// symbol record (the GC roots), compact them, and remap all type indices. link.exe keeps a
+// large unreferenced-type set; pruning it is a transparent PDB-size win (debug-info only -- the
+// image is untouched). Runs on the final post-fixup type indices in place.
+
+typedef struct LNK_GCTypes
+{
+  LNK_CodeViewInput *cv;                                 // CodeView input whose symbol / inlinee records are scanned and rewritten
+  U64                min   [CV_TypeIndexSource_COUNT]; // first type index per source
+  U64                orig_n[CV_TypeIndexSource_COUNT]; // pre-GC leaf count per source
+  U8                *mark    [CV_TypeIndexSource_COUNT]; // reachable flags, one U8 (0/1) per leaf, indexed by (ti - min)
+  CV_TypeIndex      *remap   [CV_TypeIndexSource_COUNT]; // old leaf idx -> new type index (0 for pruned leaves)
+  U8               **leaf_v  [CV_TypeIndexSource_COUNT]; // original leaf pointer arrays (compacted in place by lnk_gc_types)
+  U32               *udt_next;                           // TPI fwdref<->definition unique_name ring: next leaf idx in the ring (self when alone)
+  Rng1U64           *sym_ranges;                         // per-worker index ranges into cv->symbol_inputs
+  B32                do_rewrite;                         // 0 = mark roots, 1 = rewrite to compacted indices
+  // transitive-closure frontier: indices marked but not yet expanded. Each leaf is appended once
+  // (the atomic mark gates it), so frontier[s] is sized orig_n[s] and fcount[s] is its atomic tail.
+  U32               *frontier[CV_TypeIndexSource_COUNT];
+  U32               *fcount  [CV_TypeIndexSource_COUNT]; // atomic append cursor per source
+  // per-source scratch (set before dispatch)
+  CV_TypeIndexSource cur_source;                         // source whose frontier slice / leaf array is being processed
+  U8               **cur_leaf_v;                         // leaf array walked by lnk_gc_rewrite_leaves_task
+  Rng1U64           *cur_ranges;                         // per-worker index ranges for the current dispatch
+  U64                round_begin, round_end;             // frontier slice processed this round; lnk_gc_expand_task rebases cur_ranges by round_begin
+} LNK_GCTypes;
+
+typedef struct LNK_GCNamePair { U64 hash; U32 idx; } LNK_GCNamePair; // (unique_name hash, TPI leaf index) sort record used to build the udt_next rings
+
+internal int
+lnk_gc_name_pair_is_before(void *raw_a, void *raw_b) // sort predicate over LNK_GCNamePair: order by hash, ties broken by idx
+{
+  LNK_GCNamePair *a = raw_a, *b = raw_b;
+  return a->hash != b->hash ? (a->hash < b->hash) : (a->idx < b->idx); // non-zero when a sorts strictly before b
+}
+
+internal void
+lnk_gc_mark_ti(LNK_GCTypes *g, CV_TypeIndexSource s, CV_TypeIndex ti) // flag type index ti of source s as reachable (root marking)
+{
+  U64 lo = g->min[s]; // first type index of this source; anything below it has no mark slot
+  if (ti >= lo) { U64 idx = ti - lo; if (idx < g->orig_n[s]) { g->mark[s][idx] = 1; } } // plain (non-atomic) store of 1; out-of-range ti is ignored
+}
+
+// walk a record's type-index sites (offsets into data); mark roots (do_rewrite==0) or rewrite to compacted indices (==1)
+internal void
+lnk_gc_visit_offsets(LNK_GCTypes *g, String8 data, CV_TypeIndexInfoList ti_info_list)
+{
+  for EachNode(n, CV_TypeIndexInfo, ti_info_list.first) {
+    U8          *p  = data.str + n->offset; // address of this type-index site inside the record
+    CV_TypeIndex ti = memory_read32(p);
+    if (g->do_rewrite) {
+      U64 lo = g->min[n->source];
+      if (ti >= lo) { U64 idx = ti - lo; if (idx < g->orig_n[n->source]) { memory_write32(p, g->remap[n->source][idx]); } } // patched in place; indices outside [min, min + orig_n) are left untouched
+    } else {
+      lnk_gc_mark_ti(g, n->source, ti);
+    }
+  }
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_syms_task) // visits the type-index sites of every symbol in this worker's range of symbol inputs (mark or rewrite, per g->do_rewrite)
+{
+  LNK_GCTypes *g = raw_task;
+  Temp scratch = scratch_begin(0, 0);
+  for EachInRange(i, g->sym_ranges[task_id]) {
+    LNK_SymbolInput symbols = g->cv->symbol_inputs[i];
+    for (U64 cursor = 0; cursor + sizeof(CV_SymbolHeader) <= symbols.raw_symbols.size; ) { // no increment here: cursor is handed to TryReadBreak below
+      Temp temp = temp_begin(scratch.arena); // per-symbol temp for the offsets list
+      CV_Symbol symbol = {0};
+      TryReadBreak(cv_read_symbol(symbols.raw_symbols, cursor, CV_SymbolAlign, &symbol), cursor); // NOTE(review): presumably breaks on a failed read and otherwise advances cursor -- confirm against the macro
+      CV_TypeIndexInfoList l = cv_get_symbol_type_index_offsets(temp.arena, symbol.kind, symbol.data);
+      lnk_gc_visit_offsets(g, symbol.data, l);
+      temp_end(temp);
+    }
+  }
+  scratch_end(scratch);
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_inlines_task) // visits the type-index sites of one obj's inlinee-lines sub-sections (mark or rewrite, per g->do_rewrite)
+{
+  LNK_GCTypes *g = raw_task;
+  U64 obj_idx = task_id; // one task per obj: task_id indexes g->cv->debug_s_arr
+  Temp scratch = scratch_begin(0, 0);
+  String8List inlinee_lines = cv_sub_section_from_debug_s(g->cv->debug_s_arr[obj_idx], CV_C13SubSectionKind_InlineeLines);
+  for EachNode(dn, String8Node, inlinee_lines.first) {
+    Temp temp = temp_begin(scratch.arena); // per-sub-section temp for the offsets list
+    CV_TypeIndexInfoList l = cv_get_inlinee_type_index_offsets(temp.arena, dn->string);
+    lnk_gc_visit_offsets(g, dn->string, l);
+    temp_end(temp);
+  }
+  scratch_end(scratch);
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_rewrite_leaves_task) // visits the type-index sites of the leaves in g->cur_leaf_v over this worker's g->cur_ranges slice
+{
+  LNK_GCTypes *g = raw_task;
+  Temp scratch = scratch_begin(0, 0);
+  for EachInRange(i, g->cur_ranges[task_id]) {
+    Temp temp = temp_begin(scratch.arena); // per-leaf temp for the offsets list
+    CV_Leaf leaf = cv_leaf_from_ptr(g->cur_leaf_v[i]);
+    CV_TypeIndexInfoList l = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);
+    lnk_gc_visit_offsets(g, leaf.data, l); // rewrites only when g->do_rewrite is set; lnk_gc_types sets it before dispatching this task
+    temp_end(temp);
+  }
+  scratch_end(scratch);
+}
+
+typedef struct LNK_GCRingTask // shared state for the two-pass (count, then fill) unique_name pair build
+{
+  U8            **leaf_v;  // leaf pointer array being scanned (lnk_gc_types passes the TPI leaves)
+  Rng1U64        *ranges;  // per-worker leaf index ranges
+  U64            *counts;  // per-worker count of UDT leaves with a unique_name (written by lnk_gc_ring_count_task)
+  U64            *offsets; // per-worker first output slot in pairs (read by lnk_gc_ring_fill_task)
+  LNK_GCNamePair *pairs;   // output (hash, idx) pairs, one per counted leaf
+} LNK_GCRingTask;
+
+// pass 0 of the ring build (parallel): count, per worker range, the UDT leaves that carry a unique_name; lnk_gc_ring_fill_task is pass 1
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_ring_count_task)
+{
+  LNK_GCRingTask *t = raw_task;
+  U64 n = 0;
+  for EachInRange(i, t->ranges[task_id]) {
+    CV_Leaf leaf = cv_leaf_from_ptr(t->leaf_v[i]);
+    if (cv_is_udt(leaf.kind)) {
+      CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data);
+      if (ui.props & CV_TypeProp_HasUniqueName) { n += 1; } // same filter as the fill pass, so counts match what gets emitted
+    }
+  }
+  t->counts[task_id] = n; // each worker writes only its own slot
+}
+
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_ring_fill_task) // pass 1 of the ring build: emit a (unique_name hash, leaf idx) pair for each UDT leaf counted in pass 0
+{
+  LNK_GCRingTask *t = raw_task;
+  U64 cur = t->offsets[task_id]; // this worker's first output slot in t->pairs
+  for EachInRange(i, t->ranges[task_id]) {
+    CV_Leaf leaf = cv_leaf_from_ptr(t->leaf_v[i]);
+    if (cv_is_udt(leaf.kind)) {
+      CV_UDTInfo ui = cv_get_udt_info(leaf.kind, leaf.data);
+      if (ui.props & CV_TypeProp_HasUniqueName) {
+        U64 h = 14695981039346656037ull; // 64-bit FNV-1a offset basis
+        for EachIndex(c, ui.unique_name.size) { h = (h ^ ui.unique_name.str[c]) * 0x100000001b3ull; } // FNV-1a step: xor the byte in, then multiply by the FNV prime
+        t->pairs[cur].hash = h; t->pairs[cur].idx = (U32)i; cur += 1; // idx is the leaf index within leaf_v, truncated to U32
+      }
+    }
+  }
+}
+
+// mark a leaf reachable and, if this is its first mark, append it to its source's frontier so a
+// later round expands it. Atomic mark gates the append, so each leaf lands on the frontier once.
+internal void
+lnk_gc_mark_enqueue(LNK_GCTypes *g, CV_TypeIndexSource ns, U64 ci) // ci is a leaf index (ti - min[ns]), not a type index
+{
+  if (ci >= g->orig_n[ns]) { return; } // out of range for this source: nothing to mark
+  if (g->mark[ns][ci])     { return; } // fast non-atomic skip: already reachable (the common edge)
+  // only the worker that wins the 0->1 transition appends, so each leaf is appended once; the
+  // skip above keeps the atomic off edges whose target is already seen as marked.
+  if (!ins_atomic_u8_eval_assign(&g->mark[ns][ci], 1)) { // NOTE(review): relies on eval_assign returning the previous value -- confirm
+    U32 pos = ins_atomic_u32_inc_eval(g->fcount[ns]) - 1; // claim the next free frontier slot
+    g->frontier[ns][pos] = (U32)ci;
+  }
+}
+
+// one bulk-synchronous round of transitive closure: expand the frontier slice [round_begin,
+// round_end) of cur_source -- visit each leaf and enqueue the leaves it references (and its
+// unique_name UDT counterparts). Frontier-driven, so total work is O(reachable leaves), not
+// O(rounds * total leaves).
+internal
+THREAD_POOL_TASK_FUNC(lnk_gc_expand_task)
+{
+  LNK_GCTypes       *g = raw_task;
+  CV_TypeIndexSource s = g->cur_source;
+  Temp scratch = scratch_begin(0, 0);
+  for EachInRange(local, g->cur_ranges[task_id]) {
+    U32 i = g->frontier[s][g->round_begin + local]; // cur_ranges is relative to the slice, so rebase by round_begin
+
+    Temp temp = temp_begin(scratch.arena); // per-leaf temp for the offsets list
+    CV_Leaf leaf = cv_leaf_from_ptr(g->leaf_v[s][i]);
+    CV_TypeIndexInfoList l = cv_get_leaf_type_index_offsets(temp.arena, leaf.kind, leaf.data);
+    for EachNode(n, CV_TypeIndexInfo, l.first) {
+      CV_TypeIndex ti = memory_read32(leaf.data.str + n->offset);
+      U64 lo = g->min[n->source];
+      if (ti >= lo) { lnk_gc_mark_enqueue(g, n->source, ti - lo); } // may enqueue into the other source's frontier; upper bound is checked by the callee
+    }
+    temp_end(temp);
+
+    if (s == CV_TypeIndexSource_TPI) { // udt_next rings are built for TPI leaves only
+      for (U32 j = g->udt_next[i]; j != i; j = g->udt_next[j]) { // walk the ring once around, stopping when it returns to i
+        lnk_gc_mark_enqueue(g, CV_TypeIndexSource_TPI, j);
+      }
+    }
+  }
+  scratch_end(scratch);
+}
+
+internal void
+lnk_gc_types(TP_Context *tp, Arena *arena, LNK_CodeViewInput *cv, LNK_MergedTypes *types)
+{
+  ProfBeginFunction();
+  Temp scratch = scratch_begin(&arena, 1);
+
+  LNK_GCTypes g = {0};
+  g.cv = cv;
+  U64 total_leaves = 0;
+  for EachIndex(s, CV_TypeIndexSource_COUNT) {
+    g.min[s]    = types->min_type_indices[s];
+    g.orig_n[s] = types->count[s];
+    g.leaf_v[s] = types->v[s];
+    g.mark[s]   = push_array(scratch.arena, U8, g.orig_n[s] ? g.orig_n[s] : 1); // zeroed
+    total_leaves += g.orig_n[s];
+  }
+
+  // mark roots: every type index referenced by a surviving symbol / inlinee record
+  g.do_rewrite = 0;
+  g.sym_ranges = tp_divide_work(scratch.arena, cv->symbol_input_count, tp->worker_count);
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_syms_task, &g);
+  tp_for_parallel(tp, 0, cv->obj_count,    lnk_gc_inlines_task, &g);
+
+  // link UDT leaves that share a unique_name into rings, so that marking any one (e.g. a
+  // forward ref reached as a member-pointer target) also keeps its full definition -- needed
+  // for the debugger to complete types referenced only by name. TPI only (IPI has no UDTs).
+  U64  n_tpi   = g.orig_n[CV_TypeIndexSource_TPI];
+  g.udt_next   = push_array_no_zero(scratch.arena, U32, n_tpi ? n_tpi : 1);
+  for EachIndex(i, n_tpi) { g.udt_next[i] = (U32)i; }
+  {
+    LNK_GCRingTask rt = {0};
+    rt.leaf_v  = g.leaf_v[CV_TypeIndexSource_TPI];
+    rt.ranges  = tp_divide_work(scratch.arena, n_tpi, tp->worker_count);
+    rt.counts  = push_array(scratch.arena, U64, tp->worker_count);
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_ring_count_task, &rt);
+    rt.offsets = offsets_from_counts_array_u64(scratch.arena, rt.counts, tp->worker_count);
+    U64 np     = sum_array_u64(tp->worker_count, rt.counts);
+    rt.pairs   = push_array_no_zero(scratch.arena, LNK_GCNamePair, np ? np : 1);
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_ring_fill_task, &rt);
+
+    radsort(rt.pairs, np, lnk_gc_name_pair_is_before);
+    for (U64 a = 0; a < np; ) {
+      U64 b = a + 1;
+      while (b < np && rt.pairs[b].hash == rt.pairs[a].hash) { b += 1; }
+      for (U64 k = a; k < b; k += 1) { g.udt_next[rt.pairs[k].idx] = rt.pairs[(k + 1 < b) ? (k + 1) : a].idx; }
+      a = b;
+    }
+  }
+
+  // transitive closure (parallel, bulk-synchronous): seed each source's frontier with its
+  // root-marked leaves (one O(total leaves) scan), then repeat rounds that expand the
+  // not-yet-expanded frontier slice and enqueue what it references, until every source's
+  // frontier is drained. Each leaf is expanded exactly once.
+  U32 fcount[CV_TypeIndexSource_COUNT] = {0};
+  U64 start [CV_TypeIndexSource_COUNT] = {0};
+  for EachIndex(s, CV_TypeIndexSource_COUNT) {
+    g.frontier[s] = push_array_no_zero(scratch.arena, U32, g.orig_n[s] ? g.orig_n[s] : 1);
+    g.fcount[s]   = &fcount[s];
+    for EachIndex(i, g.orig_n[s]) { if (g.mark[s][i]) { g.frontier[s][fcount[s]++] = (U32)i; } }
+  }
+  for (;;) {
+    B32 any = 0;
+    for EachIndex(s, CV_TypeIndexSource_COUNT) {
+      U64 begin = start[s], end = fcount[s]; // fcount may grow during the round (cross-source enqueues)
+      if (begin < end) {
+        any = 1;
+        g.cur_source  = (CV_TypeIndexSource)s;
+        g.round_begin = begin;
+        g.round_end   = end;
+        g.cur_ranges  = tp_divide_work(scratch.arena, end - begin, tp->worker_count);
+        tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_expand_task, &g);
+        start[s] = end;
+      }
+    }
+    if (!any) { break; }
+  }
+
+  // compact each source: assign new contiguous type indices to the kept leaves. The leaf
+  // pointer array is compacted IN PLACE (kept count only shrinks, so the write cursor never
+  // passes the read cursor) and remap lives in scratch -- so the GC adds nothing to the arena
+  // that survives into the (peak) PDB build.
+  U64 kept_total = 0;
+  for EachIndex(s, CV_TypeIndexSource_COUNT) {
+    g.remap[s]      = push_array_no_zero(scratch.arena, CV_TypeIndex, g.orig_n[s] ? g.orig_n[s] : 1);
+    U8 **v          = g.leaf_v[s];
+    U64  new_n      = 0;
+    for EachIndex(idx, g.orig_n[s]) {
+      if (g.mark[s][idx]) { g.remap[s][idx] = (CV_TypeIndex)(g.min[s] + new_n); v[new_n++] = v[idx]; }
+      else                { g.remap[s][idx] = 0; /* T_NOTYPE; never referenced by a kept record */ }
+    }
+    types->count[s] = new_n;
+    kept_total += new_n;
+  }
+
+  // rewrite all type-index references to the compacted indices
+  g.do_rewrite = 1;
+  tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_syms_task, &g);
+  tp_for_parallel(tp, 0, cv->obj_count,    lnk_gc_inlines_task, &g);
+  for EachIndex(s, CV_TypeIndexSource_COUNT) {
+    g.cur_source = (CV_TypeIndexSource)s;
+    g.cur_leaf_v = types->v[s];
+    g.cur_ranges = tp_divide_work(scratch.arena, types->count[s], tp->worker_count);
+    tp_for_parallel(tp, 0, tp->worker_count, lnk_gc_rewrite_leaves_task, &g);
+  }
+
+  if (lnk_get_log_status(LNK_Log_Debug)) {
+    lnk_log(LNK_Log_Debug, "type GC: kept %llu of %llu leaves (pruned %llu)", kept_total, total_leaves, total_leaves - kept_total);
+  }
+
+  scratch_end(scratch);
+  ProfEnd();
+}
+
 internal String8List
 lnk_build_pdb(TP_Context *tp, TP_Arena *tp_arena, String8 image_data, LNK_Config *config, LNK_SymbolTable *symtab, LNK_CodeViewInput *cv, LNK_MergedTypes cv_types, LNK_PDB_BuilderFlags builder_flags)
 {
diff --git a/src/linker/lnk_debug_info.h b/src/linker/lnk_debug_info.h
index dc094e0f9..ee6cd37da 100644
--- a/src/linker/lnk_debug_info.h
+++ b/src/linker/lnk_debug_info.h
@@ -98,6 +98,16 @@ typedef struct
   U64              symbol_input_count;
   LNK_SymbolInput *symbol_inputs;       // [symbol_input_count]
   Rng1U64         *symbol_input_ranges; // [worker_count]
+
+  // IFC (header-unit debug-record) resolution:
+  // redirects a consuming obj's local LF_IFC_RECORD placeholder TI to a leaf in
+  // an injected .ifc debug-records blob "obj". Consulted first in lnk_leaf_ref_from_ti.
+  // key   = Compose64Bit(obj_idx, local_ti)
+  // value = Compose64Bit(blob_obj_idx, blob_leaf_idx)
+  B32      has_ifc_redirects;
+  HashMap  ifc_redirect_hm;
+  Rng1U64  ifc_obj_range; // [min,max) range of injected blob objs in the parallel arrays
+  U32Array ifc_indices;   // obj indices of injected .ifc blob objs (hashed/deduped first)
 } LNK_CodeViewInput;
 
 typedef struct
@@ -115,10 +125,20 @@ typedef struct { U64 count; LNK_LeafRef **v; } LNK_LeafRefArray;
 
 typedef struct
 {
-  U64           cap;
+  U64           cap;        // ~1.3x total (pre-dedup) leaf count
   LNK_LeafRef **bucket_arr;
 } LNK_LeafHashTable;
 
+// Maps leaf hash -> assigned type index, sized to the UNIQUE (post-dedup) leaf count rather than the
+// total leaf count -- folds the canonical-bucket + bucket->ti lookups into one probe (deref-free: the
+// hash is stored on the slot) without the total-sized per-bucket arrays that would add ~3GB to peak.
+typedef struct
+{
+  U64           cap;        // slot count: power of two >= ~1.3x unique leaf count (pow2 so a slot can be picked with a mask)
+  CV_TypeIndex *ti_arr;     // assigned type index per slot; 0 == empty (ti is always >= CV_MinComplexTypeIndex)
+  U64          *hash_arr;   // occupant leaf hash (parallel to ti_arr); disambiguates open-addressing collisions
+} LNK_AssignedTiHash;
+
 typedef struct LNK_LeafRange
 {
   struct LNK_LeafRange *next;
@@ -150,6 +170,7 @@ typedef struct
   LNK_CodeViewInput  *input;
   CV_DebugS          *debug_s_arr;
   LNK_LeafHashTable   leaf_ht_arr[CV_TypeIndexSource_COUNT];
+  LNK_AssignedTiHash  assigned_ti_arr[CV_TypeIndexSource_COUNT];
   Arena             **fixed_arenas;
   CV_TypeIndexSource  ti_source;
   U32Array            indices;
@@ -174,8 +195,6 @@ typedef struct
   U64           pass_idx;
 
   // assign type indices
-  U64                 assigned_type_caps  [CV_TypeIndexSource_COUNT];
-  CV_TypeIndex       *assigned_type_hts   [CV_TypeIndexSource_COUNT];
   CV_TypeIndex        min_type_indices    [CV_TypeIndexSource_COUNT];
   LNK_LeafRefArray    unique_leaf_refs_arr[CV_TypeIndexSource_COUNT];
 
@@ -251,11 +270,12 @@ internal B32             lnk_match_leaf_ref                  (LNK_CodeViewInput
 internal U64             lnk_hash_cv_leaf                    (LNK_CodeViewInput *input, LNK_LeafRef leaf_ref, CV_TypeIndexInfoList ti_info_list, B32 discard_cycles);
 internal void            lnk_hash_cv_leaf_deep               (Arena *arena, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref, CV_TypeIndexInfoList ti_info_list);
 internal LNK_LeafRef *   lnk_leaf_hash_table_insert_or_update(LNK_LeafHashTable *leaf_ht, LNK_CodeViewInput *input, CV_DebugH *hashes, U64 hash, LNK_LeafRef *new_bucket);
-internal LNK_LeafRef *   lnk_leaf_hash_table_search          (LNK_LeafHashTable *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref);
+internal CV_TypeIndex    lnk_leaf_hash_table_search_ti       (LNK_AssignedTiHash *ht, LNK_CodeViewInput *input, LNK_LeafRef leaf_ref); // returns assigned ti for leaf_ref's hash class, 0 if absent
 internal LNK_MergedTypes lnk_merge_types                     (TP_Context *tp, TP_Arena *tp_temp, LNK_CodeViewInput *input, LNK_MergeTypeFlags merge_flags);
 internal void            lnk_replace_type_names_with_hashes  (TP_Context *tp, TP_Arena *arena, U64 leaf_count, U8 **leaf_arr, LNK_TypeNameHashMode mode, U64 hash_length, String8 map_name);
 
 ////////////////////////////////
 // PDB
 
+internal void        lnk_gc_types  (TP_Context *tp, Arena *arena, LNK_CodeViewInput *cv, LNK_MergedTypes *types);
 internal String8List lnk_build_pdb(TP_Context *tp, TP_Arena *tp_arena, String8 image_data, LNK_Config *config, LNK_SymbolTable *symtab, LNK_CodeViewInput *cv, LNK_MergedTypes cv_types, LNK_PDB_BuilderFlags builder_flags);
diff --git a/src/linker/lnk_lib.h b/src/linker/lnk_lib.h
index eed5e2dea..8e92e47a9 100644
--- a/src/linker/lnk_lib.h
+++ b/src/linker/lnk_lib.h
@@ -15,6 +15,24 @@ typedef struct LNK_Lib
   String8Array         symbol_names;
   String8              long_names;
   U64                  input_idx;
+
+  // lib-search barrier elision: a re-search of this lib can only queue new members if the set of
+  // undefined/weak symbols grew, or if anti-dep searching was just enabled, since the last search.
+  // these record the state at the last search so identical re-searches can skip the tp dispatch.
+  B32                  was_searched;
+  B32                  searched_anti_deps;
+  U64                  searched_symbol_count;
+
+  // lib-search FRONTIER cursor: search_chunks is append-only across the entire lib-search loop
+  // (it is only drained -> chunks AFTER the loop, in lnk_replace_weak_with_default_symbols), so an
+  // already-searched (symbol,lib) pair can never resolve a new member -- searching is a pure
+  // function of (lib, symbol->name) and the member queue dedup is idempotent. each worker records
+  // the position in its search_chunks[worker] list where this lib's last search ended; the next
+  // search rescans only the slots appended since. search_cursor_chunk[worker] == 0 means "start at
+  // list first" (nothing searched yet). reset to 0 when the anti-dep mode flips (anti-dep weak
+  // symbols must be re-tried under the new mode).
+  struct LNK_SymbolHashTrieChunk **search_cursor_chunk; // [worker_count], 0 = unsearched
+  U64                             *search_cursor_idx;    // [worker_count], intra-chunk resume index
 } LNK_Lib;
  
 typedef struct LNK_LibNode
diff --git a/src/linker/lnk_obj.c b/src/linker/lnk_obj.c
index c2f9649e9..25e5921a3 100644
--- a/src/linker/lnk_obj.c
+++ b/src/linker/lnk_obj.c
@@ -125,13 +125,16 @@ THREAD_POOL_TASK_FUNC(lnk_obj_initer)
   }
 
   //
-  // error check symbol table
+  // error check symbol table (+ memoize parsed symbols)
   //
+  LNK_ParsedSymbolLite *parsed_symbols = push_array(arena, LNK_ParsedSymbolLite, header.symbol_count);
   {
     COFF_SectionHeader *section_table = (COFF_SectionHeader *)str8_substr(input->data, header.section_table_range).str;
     COFF_ParsedSymbol symbol;
     for (U64 symbol_idx = 0; symbol_idx < header.symbol_count; symbol_idx += (1 + symbol.aux_symbol_count)) {
       symbol = coff_parse_symbol(header, raw_coff_string_table, raw_coff_symbol_table, symbol_idx);
+      U32 raw_off = symbol.raw_symbol ? safe_cast_u32((U8 *)symbol.raw_symbol - input->data.str) : 0;
+      parsed_symbols[symbol_idx] = (LNK_ParsedSymbolLite){ raw_off, safe_cast_u32(symbol.value), symbol.section_number, symbol.type, symbol.storage_class, symbol.aux_symbol_count };
       COFF_SymbolValueInterpType interp = coff_interp_symbol(symbol.section_number, symbol.value, symbol.storage_class);
       if (interp == COFF_SymbolValueInterp_Regular) {
         if (symbol.section_number == 0 || symbol.section_number > header.section_count_no_null) {
@@ -335,6 +338,7 @@ THREAD_POOL_TASK_FUNC(lnk_obj_initer)
   obj->path                    = push_str8_copy(arena, input->path);
   obj->header                  = header;
   obj->section_flags           = section_flags;
+  obj->parsed_symbols          = parsed_symbols;
   obj->comdats                 = comdats;
   obj->exclude_from_debug_info = input->exclude_from_debug_info;
   obj->hotpatch                = hotpatch;
@@ -584,8 +588,11 @@ lnk_obj_section_name_from_section_number(LNK_Obj *obj, U64 section_number)
   return lnk_obj_section_name_from_sect_idx(obj, section_number-1);
 }
 
+// NOTE: does not resolve the section name (coff_name_from_section_header does a
+// string-table lookup); use when only the header/flags/ranges are needed (e.g. checking
+// section flags). Callers that need the name should use lnk_obj_section_name_from_sect_idx.
 internal LNK_ObjSection
-lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx)
+lnk_obj_section_from_sect_idx_no_name(LNK_Obj *obj, U64 sect_idx)
 {
   Assert(sect_idx < obj->header.section_count_no_null);
   LNK_ObjSection section = {0};
@@ -600,6 +607,15 @@ lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx)
   return section;
 }
 
+internal LNK_ObjSection
+lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx)
+{
+  // Under upstream's lazy section-name model LNK_ObjSection has no `name` field
+  // (callers fetch it on demand via lnk_obj_section_name_from_sect_idx), so this
+  // is identical to the _no_name accessor.
+  return lnk_obj_section_from_sect_idx_no_name(obj, sect_idx);
+}
+
 internal LNK_ObjSection
 lnk_obj_section_from_section_number(LNK_Obj *obj, U64 section_number)
 {
@@ -660,19 +676,35 @@ lnk_coff_section_header_from_section_number(LNK_Obj *obj, U64 section_number)
   return &section_table[sect_idx];
 }
 
+// NOTE: returns the memoized parse built in lnk_obj_initer. The struct is mutable: symbol-value
+// patching (lnk_patch_*_task) writes value/section_number/storage_class here, NOT into the mmapped
+// obj->data symbol table -- so obj->data symbol values are never written after load. raw_symbol still
+// points into obj->data (read-only) for aux-record reads (coff_parse_weak_tag / coff_parse_secdef).
 internal COFF_ParsedSymbol
-lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx)
+lnk_parsed_symbol_from_coff_symbol_idx_no_name(LNK_Obj *obj, U64 symbol_idx)
 {
-  String8 string_table = str8_substr(obj->data, obj->header.string_table_range);
-  String8 symbol_table = str8_substr(obj->data, obj->header.symbol_table_range);
-
+  LNK_ParsedSymbolLite *lite = &obj->parsed_symbols[symbol_idx];
   COFF_ParsedSymbol result = {0};
-  if (obj->header.is_big_obj) {
-    result = coff_parse_symbol32(string_table, (COFF_Symbol32 *)symbol_table.str + symbol_idx);
-  } else {
-    result = coff_parse_symbol16(string_table, (COFF_Symbol16 *)symbol_table.str + symbol_idx);
+  result.value            = lite->value;
+  result.section_number   = lite->section_number;
+  result.type             = lite->type;
+  result.storage_class    = lite->storage_class;
+  result.aux_symbol_count = lite->aux_symbol_count;
+  result.raw_symbol       = lite->raw_symbol_off ? (obj->data.str + lite->raw_symbol_off) : 0; // offset -> ptr
+  return result;
+}
+
+internal COFF_ParsedSymbol
+lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx)
+{
+  COFF_ParsedSymbol result = lnk_parsed_symbol_from_coff_symbol_idx_no_name(obj, symbol_idx);
+  // name is excluded from the memo -- decode it from the (read-only) symbol record on demand. Patching
+  // never touches the name, so re-deriving from raw_symbol stays correct after value/section patches.
+  if (result.raw_symbol) {
+    String8 string_table = str8_substr(obj->data, obj->header.string_table_range);
+    if (obj->header.is_big_obj) { result.name = coff_parse_symbol32(string_table, (COFF_Symbol32 *)result.raw_symbol).name; }
+    else                        { result.name = coff_parse_symbol16(string_table, (COFF_Symbol16 *)result.raw_symbol).name; }
   }
-  
   return result;
 }
 
@@ -768,7 +800,8 @@ lnk_raw_directives_from_obj(Arena *arena, LNK_Obj *obj)
 {
   String8List drectve_data = {0};
   for (U64 sect_idx = 0; sect_idx < obj->header.section_count_no_null; sect_idx += 1) {
-    LNK_ObjSection section = lnk_obj_section_from_sect_idx(obj, sect_idx);
+    // only LnkInfo sections (rare) need the name; skip the string-table lookup for the rest
+    LNK_ObjSection section = lnk_obj_section_from_sect_idx_no_name(obj, sect_idx);
     if (*section.flags & COFF_SectionFlag_LnkInfo) {
       String8 section_name = lnk_obj_section_name_from_sect_idx(obj, sect_idx);
       if (str8_match(section_name, str8_lit(".drectve"), 0)) {
diff --git a/src/linker/lnk_obj.h b/src/linker/lnk_obj.h
index 9a21de590..0d3d32c95 100644
--- a/src/linker/lnk_obj.h
+++ b/src/linker/lnk_obj.h
@@ -5,6 +5,33 @@
 
 // --- Input -------------------------------------------------------------------
 
+// Slim memoized symbol parse: every COFF_ParsedSymbol field EXCEPT name (the 16B String8). Sized by
+// total symbol count, so dropping name saves ~16B/sym of peak. The name is the cold path -- it is
+// re-decoded from raw_symbol on demand in lnk_parsed_symbol_from_coff_symbol_idx (the named accessor);
+// the hot _no_name accessor and all symbol-value patching never touch it.
+typedef struct LNK_ParsedSymbolLite
+{
+  U32                  raw_symbol_off;   // byte offset of the COFF symbol record within obj->data (0 = none).
+                                         // stored as an offset, not a pointer, to keep this struct 16B.
+  U32                  value;            // COFF symbol value is U32 (section-relative offset / size / etc.)
+  U32                  section_number;   // copied from COFF_ParsedSymbol.section_number
+  COFF_SymbolType      type;             // U16
+  COFF_SymStorageClass storage_class;    // U8
+  U8                   aux_symbol_count; // aux records following this symbol; lnk_obj_initer skips their memo slots
+} LNK_ParsedSymbolLite;
+
+// /OPT:ICF static-COMDAT fold record (one per section, indexed by section_number-1). A static
+// (internal-linkage) COMDAT has no external symbol to redirect, so instead of routing it through
+// the symbol table we record its leader here: /OPT:REF marks the LEADER live (so the follower is
+// dead-stripped with its associated .pdata/.xdata), and the final section map redirects any
+// residual reloc to the leader's contrib. set==0 means the section is not folded.
+typedef struct LNK_ICFFold
+{
+  U32 leader_obj_idx; // input_idx of the leader's obj
+  U32 leader_sn;      // leader section number
+  B8  set;
+} LNK_ICFFold;
+
 typedef struct LNK_Obj
 {
   String8 path;
@@ -12,6 +39,8 @@ typedef struct LNK_Obj
 
   COFF_FileHeaderInfo header;
   COFF_SectionFlags  *section_flags;
+  LNK_ParsedSymbolLite *parsed_symbols; // memoized parse per symbol_idx (aux slots zeroed), name excluded.
+                                         // Mutable: symbol-value patching writes here, NOT obj->data.
 
   // flags
   B8 hotpatch;
@@ -23,6 +52,7 @@ typedef struct LNK_Obj
   U32                 *comdats;
   U32Node            **associated_sections;
   LNK_SymbolHashTrie **symlinks;
+  LNK_ICFFold         *icf_fold; // /OPT:ICF static-COMDAT fold map (per section, sn-1 indexed); 0 if ICF off
 
   // link
   struct LNK_LibMemberRef *link_member;
@@ -145,11 +175,13 @@ internal LNK_Symbol *     lnk_obj_get_comdat_symlink(LNK_Obj *obj, U64 section_n
 
 internal COFF_SectionHeader * lnk_coff_section_header_from_section_number(LNK_Obj *obj, U64 section_number);
 internal COFF_ParsedSymbol    lnk_parsed_symbol_from_coff_symbol_idx(LNK_Obj *obj, U64 symbol_idx);
+internal COFF_ParsedSymbol    lnk_parsed_symbol_from_coff_symbol_idx_no_name(LNK_Obj *obj, U64 symbol_idx);
 internal U64                  lnk_obj_sect_idx_from_section_number(LNK_Obj *obj, U64 section_number);
 internal U64                  lnk_obj_section_number_from_sect_idx(LNK_Obj *obj, U64 sect_idx);
 internal String8              lnk_obj_section_name_from_section_number(LNK_Obj *obj, U64 section_number);
 internal String8              lnk_obj_section_name_from_sect_idx(LNK_Obj *obj, U64 sect_idx);
 internal LNK_ObjSection       lnk_obj_section_from_sect_idx(LNK_Obj *obj, U64 sect_idx);
+internal LNK_ObjSection       lnk_obj_section_from_sect_idx_no_name(LNK_Obj *obj, U64 sect_idx);
 internal LNK_ObjSection       lnk_obj_section_from_section_number(LNK_Obj *obj, U64 section_number);
 internal COFF_RelocArray      lnk_coff_relocs_from_section_header(LNK_Obj *obj, COFF_SectionHeader *section_header);
 internal String8              lnk_coff_string_table_from_obj(LNK_Obj *obj);
diff --git a/src/linker/lnk_symbol_table.c b/src/linker/lnk_symbol_table.c
index 8cb6c4e5f..42b267a03 100644
--- a/src/linker/lnk_symbol_table.c
+++ b/src/linker/lnk_symbol_table.c
@@ -105,12 +105,14 @@ lnk_can_replace_symbol(LNK_Symbol *dst, LNK_Symbol *src)
 {
   B32 can_replace = 0;
 
-  COFF_ParsedSymbol           dst_parsed = lnk_parsed_from_symbol(dst);
-  COFF_ParsedSymbol           src_parsed = lnk_parsed_from_symbol(src);
-  COFF_SymbolValueInterpType  dst_interp = lnk_interp_from_symbol(dst);
-  COFF_SymbolValueInterpType  src_interp = lnk_interp_from_symbol(src);
+  // only scalar fields + raw_symbol are needed here (names come from dst->name/src->name);
+  // skip the symbol-name string-table scan and avoid re-parsing for interp
   LNK_ObjSymbolRef            dst_ref    = lnk_ref_from_symbol(dst);
   LNK_ObjSymbolRef            src_ref    = lnk_ref_from_symbol(src);
+  COFF_ParsedSymbol           dst_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dst_ref.obj, dst_ref.symbol_idx);
+  COFF_ParsedSymbol           src_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(src_ref.obj, src_ref.symbol_idx);
+  COFF_SymbolValueInterpType  dst_interp = coff_interp_from_parsed_symbol(dst_parsed);
+  COFF_SymbolValueInterpType  src_interp = coff_interp_from_parsed_symbol(src_parsed);
   LNK_Obj                    *dst_obj    = dst_ref.obj;
   LNK_Obj                    *src_obj    = src_ref.obj;
 
@@ -332,9 +334,10 @@ lnk_can_replace_symbol(LNK_Symbol *dst, LNK_Symbol *src)
 internal void
 lnk_on_symbol_replace(LNK_Symbol *dst, LNK_Symbol *src)
 {
-  COFF_ParsedSymbol          dst_parsed = lnk_parsed_from_symbol(dst);
-  COFF_SymbolValueInterpType dst_interp = lnk_interp_from_symbol(dst);
+  // only scalar fields are needed below, so skip the symbol-name string-table scan
   LNK_ObjSymbolRef           dst_ref    = lnk_ref_from_symbol(dst);
+  COFF_ParsedSymbol          dst_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(dst_ref.obj, dst_ref.symbol_idx);
+  COFF_SymbolValueInterpType dst_interp = coff_interp_from_parsed_symbol(dst_parsed);
 
   if (dst_interp == COFF_SymbolValueInterp_Regular) {
     // remove replaced section from the output
@@ -531,7 +534,9 @@ lnk_parsed_from_symbol(LNK_Symbol *symbol)
 internal COFF_SymbolValueInterpType
 lnk_interp_from_symbol(LNK_Symbol *symbol)
 {
-  COFF_ParsedSymbol symbol_parsed = lnk_parsed_from_symbol(symbol); 
+  // interp only needs scalar fields; skip the symbol-name string-table scan
+  LNK_ObjSymbolRef  ref           = lnk_ref_from_symbol(symbol);
+  COFF_ParsedSymbol symbol_parsed = lnk_parsed_symbol_from_coff_symbol_idx_no_name(ref.obj, ref.symbol_idx);
   return coff_interp_from_parsed_symbol(symbol_parsed);
 }
 
@@ -556,6 +561,7 @@ lnk_symbol_table_push_(LNK_SymbolTable *symtab, Arena *arena, U64 worker_id, LNK
 {
   U64                        hash   = lnk_symbol_table_hasher(symbol->name);
   COFF_SymbolValueInterpType interp = lnk_interp_from_symbol(symbol);
+  symbol->interp = interp; // cache for the lib search hot loop
   LNK_SymbolHashTrieChunkList *chunks;
   if (interp == COFF_SymbolValueInterp_Weak || interp == COFF_SymbolValueInterp_Undefined) {
     chunks = &symtab->search_chunks[worker_id];
@@ -585,6 +591,21 @@ lnk_symbol_table_search(LNK_SymbolTable *symtab, String8 name)
   return trie ? trie->symbol : 0;
 }
 
+internal U64
+lnk_symbol_table_search_symbol_count(LNK_SymbolTable *symtab)
+{
+  // total number of weak/undefined symbols inserted into search_chunks. this only ever grows
+  // during the lib-search loop (symbols are never removed mid-loop), so it serves as a monotonic
+  // version stamp for the set of symbols the lib search would scan.
+  U64 count = 0;
+  for EachIndex(worker_id, symtab->arena->count) {
+    for EachNode(c, LNK_SymbolHashTrieChunk, symtab->search_chunks[worker_id].first) {
+      count += c->count;
+    }
+  }
+  return count;
+}
+
 internal LNK_Symbol *
 lnk_symbol_table_searchf(LNK_SymbolTable *symtab, char *fmt, ...)
 {
@@ -772,17 +793,9 @@ THREAD_POOL_TASK_FUNC(lnk_replace_weak_with_default_symbol_task)
           COFF_SymbolValueInterpType resolve_interp = coff_interp_from_parsed_symbol(resolve_parsed);
           if (resolve_interp == COFF_SymbolValueInterp_Weak) {
             COFF_SymbolWeakExt *weak_ext = coff_parse_weak_tag(resolve_parsed, symbol_ref.obj->header.is_big_obj);
-            if (symbol_ref.obj->header.is_big_obj) {
-              COFF_Symbol32 *symbol32  = symbol_parsed.raw_symbol;
-              symbol32->section_number = COFF_Symbol_UndefinedSection;
-              symbol32->value          = 0;
-              symbol32->storage_class  = COFF_SymStorageClass_External;
-            } else {
-              COFF_Symbol16 *symbol16  = symbol_parsed.raw_symbol;
-              symbol16->section_number = COFF_Symbol_UndefinedSection;
-              symbol16->value          = 0;
-              symbol16->storage_class  = COFF_SymStorageClass_External;
-            }
+            symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].section_number = COFF_Symbol_UndefinedSection;
+            symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].value          = 0;
+            symbol_ref.obj->parsed_symbols[symbol_ref.symbol_idx].storage_class  = COFF_SymStorageClass_External;
           } else {
             symbol->first_ref->v = resolve;
           }
diff --git a/src/linker/lnk_symbol_table.h b/src/linker/lnk_symbol_table.h
index da579bb1e..89caf3b8b 100644
--- a/src/linker/lnk_symbol_table.h
+++ b/src/linker/lnk_symbol_table.h
@@ -19,9 +19,10 @@ typedef struct LNK_ObjSymbolRefNode
 
 typedef struct LNK_Symbol
 {
-  String8               name;
-  LNK_ObjSymbolRefNode *first_ref;
-  LNK_ObjSymbolRefNode *last_ref;
+  String8                    name;
+  LNK_ObjSymbolRefNode      *first_ref;
+  LNK_ObjSymbolRefNode      *last_ref;
+  COFF_SymbolValueInterpType interp;    // cached at push so the lib search can skip re-parsing (and page-faulting) resolved symbols
 } LNK_Symbol;
 
 // --- Symbol Containers -------------------------------------------------------
@@ -124,6 +125,7 @@ internal U64 lnk_symbol_table_hasher(String8 string);
 internal LNK_SymbolTable * lnk_symbol_table_init(TP_Arena *arena);
 internal void              lnk_symbol_table_push(LNK_SymbolTable *symtab, LNK_Symbol *symbol);
 internal LNK_Symbol *      lnk_symbol_table_search(LNK_SymbolTable *symtab, String8 name);
+internal U64               lnk_symbol_table_search_symbol_count(LNK_SymbolTable *symtab);
 internal LNK_Symbol *      lnk_symbol_table_searchf(LNK_SymbolTable *symtab, char *fmt, ...);
 
 // --- Symbol Contrib Helpers --------------------------------------------------
diff --git a/src/linker/pdb_ext/pdb_builder.c b/src/linker/pdb_ext/pdb_builder.c
index be6960122..3865bbf24 100644
--- a/src/linker/pdb_ext/pdb_builder.c
+++ b/src/linker/pdb_ext/pdb_builder.c
@@ -1783,7 +1783,11 @@ psi_addr_map_compar_is_before(void *raw_a, void *raw_b)
   } else if (a->isect_off.off != b->isect_off.off) {
     is_before = a->isect_off.off < b->isect_off.off;
   } else {
-    is_before = str8_compar_case_sensitive(&a->name, &b->name) < 0;
+    int cmp = str8_compar_case_sensitive(&a->name, &b->name);
+    // unique, element-stable tiebreaker (offset is unique per record and travels with the
+    // element across radsort swaps) so equal-key runs (many ICF-folded symbols at one
+    // address) do not degrade radsort's quicksort to O(n^2)
+    is_before = cmp != 0 ? (cmp < 0) : (a->offset < b->offset);
   }
 
   return is_before;
@@ -1797,12 +1801,22 @@ gsi_record_sort_by_name(PDB_GsiSortRecord *arr, U64 count)
   ProfEnd();
 }
 
-internal void
-gsi_record_sort_by_sc(PDB_GsiSortRecord *arr, U64 count)
+// returns a record-index permutation (allocated from arena) sorted ascending by (isect, off) -- the
+// PSI address map order; arr itself is NOT reordered. A stable parallel radix sort on the U64
+// (isect<<32|off) key; equal addresses (e.g. ICF-folded symbols) keep input order, which is fine for an address->symbol map.
+internal U32 *
+gsi_record_sort_by_sc(TP_Context *tp, Arena *arena, PDB_GsiSortRecord *arr, U64 count)
 {
   ProfBeginFunction();
-  radsort(arr, count, psi_addr_map_compar_is_before);
+  U64 *keys = push_array_no_zero(arena, U64, count ? count : 1);
+  U32 *idx  = push_array_no_zero(arena, U32, count ? count : 1);
+  for EachIndex(i, count) {
+    keys[i] = ((U64)arr[i].isect_off.isect << 32) | (U64)arr[i].isect_off.off;
+    idx[i]  = (U32)i;
+  }
+  lnk_radix_sort_u64_pairs(tp, arena, count, keys, idx);
   ProfEnd();
+  return idx;
 }
 
 internal
@@ -1839,7 +1853,9 @@ gsi_symbol_is_before(void *raw_a, void *raw_b)
         }
       }
     }
-    is_before = cmp < 0;
+    // unique, element-stable tiebreaker (symbol pointer travels with the element across
+    // radsort swaps) prevents O(n^2) radsort on equal-key runs (ICF-folded symbols)
+    is_before = cmp != 0 ? (cmp < 0) : ((U64)a < (U64)b);
   }
 
   return is_before;
@@ -1870,7 +1886,9 @@ gsi_pub_symbol_is_before(void *raw_a, void *raw_b)
         }
       }
     }
-    is_before = cmp < 0;
+    // unique, element-stable tiebreaker (symbol pointer travels with the element across
+    // radsort swaps) prevents O(n^2) radsort on equal-key runs (ICF-folded publics)
+    is_before = cmp != 0 ? (cmp < 0) : ((U64)a < (U64)b);
   }
 
   return is_before;
@@ -2209,15 +2227,15 @@ psi_build(TP_Context *tp, PDB_PsiContext *psi, MSF_Context *msf, MSF_StreamNumbe
   ProfBegin("Address Map");
   
   ProfBegin("Sort");
-  gsi_record_sort_by_sc(gsi_build.sort_record_arr, gsi_build.hash_record_count);
+  U32 *sorted = gsi_record_sort_by_sc(tp, scratch.arena, gsi_build.sort_record_arr, gsi_build.hash_record_count);
   ProfEnd();
-  
+
   ProfBegin("Offset Fill");
   U64 addr_map_count = gsi_build.hash_record_count;
   U64 addr_map_size = addr_map_count * sizeof(U32);
   U32 *addr_map     = push_array_no_zero(scratch.arena, U32, addr_map_count);
   for (U64 i = 0; i < addr_map_count; i += 1) {
-    addr_map[i] = gsi_build.sort_record_arr[i].offset;
+    addr_map[i] = gsi_build.sort_record_arr[sorted[i]].offset;
   }
   ProfEnd();
 
@@ -2772,12 +2790,39 @@ dbi_build_sec_con(Arena *arena, PDB_DbiContext *dbi)
 
   // sort section contribs so they are binary searchable
   lnk_radix_sort_dbi_sc_array(sc_array, dbi->sec_contrib_list.count, dbi->section_list.count + 1);
-  
+
+  // coalesce runs of contributions that are adjacent in sorted order and share the
+  // same module, section, and flags, absorbing any gap between them (e.g. alignment
+  // padding). The section contribution substream is only used to map an image address
+  // range back to its module, so merging same-module runs is lossless (data/reloc CRCs
+  // are unused here, always 0). UE objects emit one COMDAT section per function, so after
+  // layout these collapse heavily -- matching link.exe, which stores one entry per contiguous run.
+  ProfBegin("Coalesce sect contribs");
+  U64 sc_count = 0;
+  for (U64 r = 0; r < dbi->sec_contrib_list.count; r += 1) {
+    PDB_DbiSectionContrib *c = &sc_array[r];
+    if (sc_count > 0) {
+      PDB_DbiSectionContrib *p = &sc_array[sc_count - 1];
+      if (p->base.sec   == c->base.sec   &&
+          p->base.mod   == c->base.mod   &&
+          p->base.flags == c->base.flags &&
+          (U64)c->base.sec_off >= (U64)p->base.sec_off) {
+        // absorb c into the run, extending over any alignment padding gap
+        U64 c_end = (U64)c->base.sec_off + (U64)c->base.size;
+        U64 p_end = (U64)p->base.sec_off + (U64)p->base.size;
+        if (c_end > p_end) { p->base.size = (U32)(c_end - (U64)p->base.sec_off); }
+        continue;
+      }
+    }
+    sc_array[sc_count++] = *c;
+  }
+  ProfEnd();
+
   // push section contrib info
   ProfBegin("List Push");
   String8List sec_con_list = {0};
   str8_list_push(arena, &sec_con_list, str8((U8*)version, sizeof(*version)));
-  str8_list_push(arena, &sec_con_list, str8((U8*)sc_array, sizeof(sc_array[0])*dbi->sec_contrib_list.count));
+  str8_list_push(arena, &sec_con_list, str8((U8*)sc_array, sizeof(sc_array[0])*sc_count));
   ProfEnd();
   
   ProfEnd();