diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc4240b0414..a32d7e68040 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_arm64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_amd64 -no-entry-point ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_arm64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -65,6 +66,7 @@ jobs: gmake -C vendor/miniaudio/src ./odin check examples/all -vet -strict-style -disallow-do -target:freebsd_amd64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:freebsd_amd64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -123,6 +125,8 @@ jobs: run: ./odin check examples/all -strict-style -vet -disallow-do - name: Odin check vendor/sdl3 run: ./odin check vendor/sdl3 -strict-style -vet -disallow-do -no-entry-point + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -sanitize:thread -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true - name: Optimized Core library tests @@ -211,6 +215,11 @@ jobs: run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat odin check vendor/sdl3 -vet -strict-style -disallow-do -no-entry-point + - name: Odin heap allocator tests + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat + odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Core library tests shell: cmd run: | @@ -305,6 +314,9 @@ jobs: - name: Odin run -debug run: ./odin run examples/demo -debug -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -target:linux_riscv64 
-extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests + - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath diff --git a/base/runtime/default_allocators_general.odin b/base/runtime/default_allocators_general.odin index 64af6c90455..cbaf4d22a31 100644 --- a/base/runtime/default_allocators_general.odin +++ b/base/runtime/default_allocators_general.odin @@ -6,9 +6,6 @@ when ODIN_DEFAULT_TO_NIL_ALLOCATOR { } else when ODIN_DEFAULT_TO_PANIC_ALLOCATOR { default_allocator_proc :: panic_allocator_proc default_allocator :: panic_allocator -} else when ODIN_OS != .Orca && (ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32) { - default_allocator :: default_wasm_allocator - default_allocator_proc :: wasm_allocator_proc } else { default_allocator :: heap_allocator default_allocator_proc :: heap_allocator_proc diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 4b04dffef15..7fc1c45c635 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -1,3 +1,6 @@ +#+build !js +#+build !orca +#+build !wasi package runtime import "base:intrinsics" @@ -9,111 +12,65 @@ heap_allocator :: proc() -> Allocator { } } -heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, - size, alignment: int, - old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { - // - // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. - // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert - // padding. We also store the original pointer returned by heap_alloc right before - // the pointer we return to the user. - // - - aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { - // Not(flysand): We need to reserve enough space for alignment, which - // includes the user data itself, the space to store the pointer to - // allocation start, as well as the padding required to align both - // the user data and the pointer. 
- a := max(alignment, align_of(rawptr)) - space := a-1 + size_of(rawptr) + size - allocated_mem: rawptr - - force_copy := old_ptr != nil && alignment > align_of(rawptr) - - if old_ptr != nil && !force_copy { - original_old_ptr := ([^]rawptr)(old_ptr)[-1] - allocated_mem = heap_resize(original_old_ptr, space) - } else { - allocated_mem = heap_alloc(space, zero_memory) - } - aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) - - ptr := uintptr(aligned_mem) - aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) - if allocated_mem == nil { - aligned_free(old_ptr) - aligned_free(allocated_mem) +heap_allocator_proc :: proc( + allocator_data: rawptr, + mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, + old_size: int, + loc := #caller_location, +) -> ([]byte, Allocator_Error) { + assert(alignment <= HEAP_MAX_ALIGNMENT, "Heap allocation alignment beyond HEAP_MAX_ALIGNMENT bytes is not supported.", loc = loc) + assert(alignment >= 0, "Alignment must be greater than or equal to zero.", loc = loc) + switch mode { + case .Alloc: + // All allocations are aligned to at least their size up to + // `HEAP_MAX_ALIGNMENT`, and by virtue of binary arithmetic, any + // address aligned to N will also be aligned to N>>1. + // + // Therefore, we have no book-keeping costs for alignment. + ptr := heap_alloc(max(size, alignment)) + if ptr == nil { return nil, .Out_Of_Memory } - - aligned_mem = rawptr(aligned_ptr) - ([^]rawptr)(aligned_mem)[-1] = allocated_mem - - if force_copy { - mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) - aligned_free(old_ptr) - } - - return byte_slice(aligned_mem, size), nil - } - - aligned_free :: proc(p: rawptr) { - if p != nil { - heap_free(([^]rawptr)(p)[-1]) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Alloc_Non_Zeroed: + ptr := heap_alloc(max(size, alignment), zero_memory = false) + if ptr == nil { + return nil, .Out_Of_Memory } - } - - aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { - if p == nil { - return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize: + ptr := heap_resize(old_memory, old_size, max(size, alignment)) + if ptr == nil { + return nil, .Out_Of_Memory } - - new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return - - // NOTE: heap_resize does not zero the new memory, so we do it - if zero_memory && new_size > old_size { - new_region := raw_data(new_memory[old_size:]) - intrinsics.mem_zero(new_region, new_size - old_size) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize_Non_Zeroed: + ptr := heap_resize(old_memory, old_size, max(size, alignment), zero_memory = false) + if ptr == nil { + return nil, .Out_Of_Memory } - return - } - - switch mode { - case .Alloc, .Alloc_Non_Zeroed: - return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) - + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil case .Free: - aligned_free(old_memory) - + heap_free(old_memory) case .Free_All: return nil, .Mode_Not_Implemented - - case .Resize, .Resize_Non_Zeroed: - return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) - case .Query_Features: set := (^Allocator_Mode_Set)(old_memory) if set != nil { - set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + set^ = { + .Alloc, 
+ .Alloc_Non_Zeroed, + .Resize, + .Resize_Non_Zeroed, + .Free, + .Query_Features, + } } return nil, nil - case .Query_Info: return nil, .Mode_Not_Implemented } - return nil, nil } - - -heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { - return _heap_alloc(size, zero_memory) -} - -heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { - return _heap_resize(ptr, new_size) -} - -heap_free :: proc "contextless" (ptr: rawptr) { - _heap_free(ptr) -} \ No newline at end of file diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin new file mode 100644 index 00000000000..4f5ae68b468 --- /dev/null +++ b/base/runtime/heap_allocator_control.odin @@ -0,0 +1,92 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +Merge all remote frees then free as many slabs as possible. + +This bypasses any heuristics that keep slabs setup. + +Returns true if the superpage was emptied and freed. +*/ +@(private) +compact_superpage :: proc "contextless" (superpage: ^Heap_Superpage) -> (freed: bool) { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + slab_is_cached := slab.free_bins > 0 + heap_merge_remote_frees(slab) + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab_is_cached { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + heap_free_slab(superpage, slab) + } + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + freed = true + } + return +} + +/* +Merge all remote frees then free as many slabs and superpages as possible. + +This bypasses any heuristics that keep slabs setup. +*/ +compact_heap :: proc "contextless" () { + superpage := local_heap + for { + if superpage == nil { + return + } + next_superpage := superpage.next + compact_superpage(superpage) + superpage = next_superpage + } +} + +/* +Free any empty superpages in the orphanage. + +This procedure assumes there won't ever be more than 128 superpages in the +orphanage. This limitation is due to the avoidance of heap allocation. +*/ +compact_heap_orphanage :: proc "contextless" () { + // First, try to empty the orphanage so that we can evaluate each superpage. + buffer: [128]^Heap_Superpage + for &b in buffer { + b = heap_pop_orphan() + if b == nil { + break + } + } + + // Next, compact each superpage and push it back to the orphanage if it was + // not freed. 
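+ // Superpages that came back completely empty were already handed off by
+ // compact_superpage (heap_free_superpage either returns them to the
+ // operating system or re-orphans them), so only superpages that still
+ // hold live allocations are pushed back here.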
+ for superpage in buffer { + if !compact_superpage(superpage) { + heap_push_orphan(superpage) + } + } +} diff --git a/base/runtime/heap_allocator_debugging.odin b/base/runtime/heap_allocator_debugging.odin new file mode 100644 index 00000000000..d055f2ce902 --- /dev/null +++ b/base/runtime/heap_allocator_debugging.odin @@ -0,0 +1,96 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +ODIN_DEBUG_HEAP :: #config(ODIN_DEBUG_HEAP, false) + +Heap_Code_Coverage_Type :: enum { + Alloc_Bin, + Alloc_Collected_Remote_Frees, + Alloc_Heap_Initialized, + Alloc_Huge, + Alloc_Slab_Wide, + Alloc_Slab_Wide_Needed_New_Superpage, + Alloc_Slab_Wide_Used_Available_Superpage, + Alloc_Zeroed_Memory, + Freed_Bin_Freed_Slab_Which_Was_Fully_Used, + Freed_Bin_Freed_Superpage, + Freed_Bin_Reopened_Full_Slab, + Freed_Bin_Updated_Slab_Next_Free_Sector, + Freed_Huge_Allocation, + Freed_Wide_Slab, + Heap_Expanded_Cache_Data, + Huge_Alloc_Size_Adjusted, + Huge_Alloc_Size_Set_To_Superpage, + Merged_Remote_Frees, + Orphaned_Superpage_Freed_Slab, + Orphaned_Superpage_Merged_Remote_Frees, + Remotely_Freed_Bin, + Remotely_Freed_Bin_Caused_Remote_Superpage_Caching, + Remotely_Freed_Wide_Slab, + Resize_Caused_Memory_Zeroing, + Resize_Crossed_Size_Categories, + Resize_Huge, + Resize_Huge_Caused_Memory_Zeroing, + Resize_Huge_Size_Adjusted, + Resize_Huge_Size_Set_To_Superpage, + Resize_Kept_Old_Pointer, + Resize_Wide_Slab_Caused_Memory_Zeroing, + Resize_Wide_Slab_Expanded_In_Place, + Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion, + Resize_Wide_Slab_From_Remote_Thread, + Resize_Wide_Slab_Kept_Old_Pointer, + Resize_Wide_Slab_Shrunk_In_Place, + Slab_Adjusted_For_Partial_Sector, + Superpage_Add_Remote_Free_Guarded_With_Masterless, + Superpage_Add_Remote_Free_Guarded_With_Set, + Superpage_Added_Remote_Free, + Superpage_Added_To_Open_Cache_By_Freeing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Slab, + Superpage_Adopted_From_Orphanage, + Superpage_Created_By_Empty_Orphanage, + Superpage_Freed_By_Exiting_Thread, + Superpage_Freed_By_Wide_Slab, + Superpage_Freed_On_Full_Orphanage, + Superpage_Linked, + Superpage_Cache_Block_Cleared, + Superpage_Orphaned_By_Exiting_Thread, + Superpage_Pushed_To_Orphanage, + Superpage_Registered_With_Free_Slabs, + Superpage_Registered_With_Slab_In_Use, + Superpage_Removed_From_Open_Cache_By_Slab, + Superpage_Unlinked_Non_Tail, + Superpage_Unlinked_Tail, + Superpage_Unregistered, + Superpage_Updated_Next_Free_Slab_Index, + Superpage_Updated_Next_Free_Slab_Index_As_Empty, +} + +when ODIN_DEBUG_HEAP { + heap_global_code_coverage: [Heap_Code_Coverage_Type]int // atomic +} + +@(private, disabled=!ODIN_DEBUG_HEAP) +heap_debug_cover :: #force_inline proc "contextless" (type: Heap_Code_Coverage_Type) { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_add_explicit(&heap_global_code_coverage[type], 1, .Release) + } +} + +_check_heap_code_coverage :: proc "contextless" () -> bool { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_thread_fence(.Seq_Cst) + for t in heap_global_code_coverage { + if t == 0 { + return false + } + } + return true + } else { + panic_contextless("ODIN_DEBUG_HEAP is not enabled, therefore the results of this procedure are meaningless.") + } +} diff --git a/base/runtime/heap_allocator_fallback.odin b/base/runtime/heap_allocator_fallback.odin new file mode 100644 index 00000000000..25e906d7e3d --- /dev/null +++ b/base/runtime/heap_allocator_fallback.odin @@ -0,0 +1,135 @@ +#+build orca +package 
runtime + +// This is the libc malloc-based Odin allocator, used as a fallback for +// platforms that do not support virtual memory, superpages, or aligned +// allocation, or for platforms where we would prefer to use the system's +// built-in allocator through the malloc interface. + +import "base:intrinsics" + +foreign { + @(link_name="malloc") _libc_malloc :: proc "c" (size: uint) -> rawptr --- + @(link_name="calloc") _libc_calloc :: proc "c" (num, size: uint) -> rawptr --- + @(link_name="free") _libc_free :: proc "c" (ptr: rawptr) --- + @(link_name="realloc") _libc_realloc :: proc "c" (ptr: rawptr, size: uint) -> rawptr --- +} + +@(require_results) +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + if size <= 0 { + return nil + } + if zero_memory { + return _libc_calloc(1, uint(size)) + } else { + return _libc_malloc(uint(size)) + } +} + +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + new_ptr = _libc_realloc(old_ptr, uint(new_size)) + + // Section 7.22.3.5.2 of the C17 standard: "The contents of the new object + // shall be the same as that of the old object prior to deallocation, up to + // the lesser of the new and old sizes. Any bytes in the new object beyond + // the size of the old object have indeterminate values." + // + // Therefore, we zero the memory ourselves. + if zero_memory && new_size > old_size { + intrinsics.mem_zero(rawptr(uintptr(new_ptr) + uintptr(old_size)), new_size - old_size) + } + + return +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _libc_free(ptr) +} + +heap_allocator :: proc() -> Allocator { + return { + data = nil, + procedure = heap_allocator_proc, + } +} + +heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + + // Because malloc does not support alignment requests, and aligned_alloc + // has specific requirements for what sizes it supports, this allocator + // over-allocates by the alignment requested and stores the original + // pointer behind the address returned to the user. + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + padding := max(alignment, size_of(rawptr)) + ptr := heap_alloc(size + padding, mode == .Alloc) + if ptr == nil { + return nil, .Out_Of_Memory + } + shift := uintptr(padding) - uintptr(ptr) & uintptr(padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr + return byte_slice(aligned_ptr, size), nil + + case .Free: + if old_memory != nil { + heap_free(([^]rawptr)(old_memory)[-1]) + } + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + new_padding := max(alignment, size_of(rawptr)) + original_ptr := ([^]rawptr)(old_memory)[-1] + ptr: rawptr + + if alignment > align_of(rawptr) { + // The alignment is in excess of what malloc/realloc will return + // for address alignment per the C standard, so we must reallocate + // manually in order to guarantee alignment for the user. + // + // Resizing through realloc simply won't work because it's possible + // that our target address originally only needed a padding of 8 + // bytes, but if we expand the memory used and the address is moved, + // we may then need 16 bytes for proper alignment, for example. + // + // We'll copy the old data later. 
+ ptr = heap_alloc(size + new_padding, mode == .Resize) + + } else { + real_old_size := size_of(rawptr) + old_size + real_new_size := new_padding + size + + ptr = heap_resize(original_ptr, real_old_size, real_new_size, mode == .Resize) + } + + shift := uintptr(new_padding) - uintptr(ptr) & uintptr(new_padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr + + if alignment > align_of(rawptr) { + intrinsics.mem_copy_non_overlapping(aligned_ptr, old_memory, min(size, old_size)) + heap_free(original_ptr) + } + + return byte_slice(aligned_ptr, size), nil + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin new file mode 100644 index 00000000000..cf3f2d795c3 --- /dev/null +++ b/base/runtime/heap_allocator_implementation.odin @@ -0,0 +1,1893 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +This is the dynamic heap allocator for the Odin runtime. + +**Features** + +- Lock-free guarantee: A thread cannot deadlock or obstruct the allocator; + some thread makes progress no matter the parallel conditions. + +- Thread-local heaps: There is no allocator-induced false sharing. + +- Global storage for unused memory: When a thread finishes cleanly, its memory + is sent to a global storage where other threads can use it. + +- Headerless: All bin-sized allocations (any power of two, less than or equal + to 32KiB, by default) consume no extra space, and as a result of being tightly + packed, enhance performance with cache locality for programs. + + +**Terminology** + +- Bin: an allocation of a fixed size, shared with others of the same size category. + +- Slab: a fixed-size block within a Superpage that is divided into a constant + number of Bins at runtime based on the needs of the program. + +- Sector: a variable-sized bitmap, used to track whether a Bin is free or not. + + +**Allocation Categories** + +- Huge: anything in excess of 3/4ths of a Superpage. +- Slab-wide: anything in excess of the largest Bin size. +- Bin: anything less than or equal to the largest bin size. +*/ + +// +// Tunables +// + +// NOTE: Adjusting this constant by itself is rarely enough; `HEAP_SUPERPAGE_CACHE_RATIO` +// will have to be changed as well. 
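+// As a rough orientation for the defaults below (a sketch, assuming the usual
+// 2 MiB SUPERPAGE_SIZE and a 64-bit target): HEAP_SLAB_SIZE = 64 KiB gives
+// HEAP_MAX_BIN_SIZE = 32 KiB and 31 slabs per superpage, so a request is
+// served from a bin when it is at most 32 KiB, from a run of contiguous slabs
+// when it is larger than that but below roughly 3/4 of a superpage (1.5 MiB),
+// and from its own virtual memory mapping (a Huge allocation) beyond that.
+// For example, through the default context allocator (hypothetical sizes,
+// illustrative only):
+//
+//	small  := make([]byte,   4 * Kilobyte) // bin allocation
+//	medium := make([]byte, 256 * Kilobyte) // slab-wide allocation
+//	large  := make([]byte,   4 * Megabyte) // huge allocation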
+HEAP_SLAB_SIZE :: #config(ODIN_HEAP_SLAB_SIZE, 64 * Kilobyte) +HEAP_MAX_BIN_SIZE :: #config(ODIN_HEAP_MAX_BIN_SIZE, HEAP_SLAB_SIZE / 2) +HEAP_MIN_BIN_SIZE :: #config(ODIN_HEAP_MIN_BIN_SIZE, 8 * Byte) +HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES :: #config(ODIN_HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES, 3) +HEAP_SUPERPAGE_CACHE_RATIO :: #config(ODIN_HEAP_SUPERPAGE_CACHE_RATIO, 20) +HEAP_PANIC_ON_DOUBLE_FREE :: #config(ODIN_HEAP_PANIC_ON_DOUBLE_FREE, true) + +// +// Constants +// + +HEAP_MAX_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MAX_BIN_SIZE) +HEAP_MIN_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MIN_BIN_SIZE) +HEAP_BIN_RANKS :: 1 + HEAP_MAX_BIN_SHIFT - HEAP_MIN_BIN_SHIFT +HEAP_HUGE_ALLOCATION_BOOK_KEEPING :: size_of(int) + HEAP_MAX_ALIGNMENT +HEAP_HUGE_ALLOCATION_THRESHOLD :: SUPERPAGE_SIZE / 4 * 3 +HEAP_MAX_ALIGNMENT :: 64 * Byte +HEAP_CACHE_SLAB_MAP_STRIDE :: HEAP_SUPERPAGE_CACHE_RATIO * HEAP_SLAB_COUNT +HEAP_SECTOR_TYPES :: 2 // { local_free, remote_free } +HEAP_SLAB_ALLOCATION_BOOK_KEEPING :: size_of(Heap_Slab) + HEAP_SECTOR_TYPES * size_of(uint) + HEAP_MAX_ALIGNMENT +HEAP_SLAB_COUNT :: SUPERPAGE_SIZE / HEAP_SLAB_SIZE - 1 + +@(private="file") INTEGER_BITS :: 8 * size_of(int) +@(private="file") SECTOR_BITS :: 8 * size_of(uint) + +// +// Sanity checking +// + +#assert(HEAP_MAX_BIN_SIZE & (HEAP_MAX_BIN_SIZE-1) == 0, "HEAP_MAX_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_BIN_SIZE < HEAP_SLAB_SIZE - size_of(Heap_Slab) - HEAP_SECTOR_TYPES * size_of(uint), "HEAP_MAX_BIN_SIZE must be able to fit into one Slab, including its free maps.") +#assert(HEAP_MAX_BIN_SIZE >= HEAP_MIN_BIN_SIZE, "HEAP_MAX_BIN_SIZE must be greater than or equal to HEAP_MIN_BIN_SIZE.") +#assert(HEAP_HUGE_ALLOCATION_THRESHOLD <= SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING, "HEAP_HUGE_ALLOCATION_THRESHOLD must be smaller than a Superpage, with enough space for HEAP_HUGE_ALLOCATION_BOOK_KEEPING.") +#assert(HEAP_MIN_BIN_SIZE & (HEAP_MIN_BIN_SIZE-1) == 0, "HEAP_MIN_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES >= 0, "HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES must be positive.") +#assert(HEAP_MAX_ALIGNMENT & (HEAP_MAX_ALIGNMENT-1) == 0, "HEAP_MAX_ALIGNMENT must be a power of two.") +#assert(HEAP_SLAB_COUNT > 0, "HEAP_SLAB_COUNT must be greater than zero.") +#assert(HEAP_SLAB_SIZE & (HEAP_SLAB_SIZE-1) == 0, "HEAP_SLAB_SIZE must be a power of two.") +#assert(SUPERPAGE_SIZE >= 2 * HEAP_SLAB_SIZE, "SUPERPAGE_SIZE must be at least twice HEAP_SLAB_SIZE.") +#assert(size_of(Heap_Superpage) < HEAP_SLAB_SIZE, "The Superpage struct must not exceed the size of a Slab.") + +// +// Utility Procedures +// + +@(require_results, private) +round_to_nearest_power_of_two :: #force_inline proc "contextless" (n: uint) -> int { + assert_contextless(n > 1, "This procedure does not handle the edge case of n < 2.") + return 1 << (INTEGER_BITS - intrinsics.count_leading_zeros(n-1)) +} + +@(require_results) +heap_slabs_needed_for_size :: #force_inline proc "contextless" (size: int) -> (result: int) { + assert_contextless(size > 0) + assert_contextless(size > HEAP_MAX_BIN_SIZE) + size := size + size += HEAP_SLAB_ALLOCATION_BOOK_KEEPING + result = size / HEAP_SLAB_SIZE + (0 if size % HEAP_SLAB_SIZE == 0 else 1) + assert_contextless(result > 0) + assert_contextless(result < HEAP_SLAB_COUNT, "Calculated an overly-large Slab-wide allocation.") + return +} + +@(require_results) +heap_round_to_bin_size :: #force_inline proc "contextless" (n: int) -> int { + if n <= HEAP_MIN_BIN_SIZE { + return HEAP_MIN_BIN_SIZE + } + m := 
round_to_nearest_power_of_two(uint(n)) + assert_contextless(m & (m-1) == 0, "Internal rounding error.") + return m +} + +@(require_results) +heap_bin_size_to_rank :: #force_inline proc "contextless" (size: int) -> (rank: int) { + // By this point, a size of zero should've been rounded up to HEAP_MIN_BIN_SIZE. + assert_contextless(size > 0, "Size must be greater-than zero.") + assert_contextless(size & (size-1) == 0, "Size must be a power of two.") + rank = intrinsics.count_trailing_zeros(size >> HEAP_MIN_BIN_SHIFT) + assert_contextless(rank <= HEAP_BIN_RANKS, "Bin rank calculated incorrectly; it must be less-than-or-equal-to HEAP_BIN_RANKS.") + return +} + +@(require_results) +find_superpage_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Superpage { + return cast(^Heap_Superpage)(uintptr(ptr) & ~uintptr(SUPERPAGE_SIZE-1)) +} + +@(require_results) +find_slab_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Slab { + return cast(^Heap_Slab)(uintptr(ptr) & ~uintptr(HEAP_SLAB_SIZE-1)) +} + +/* +Get a specific slab by index from a superpage. + +The position deltas are all constant with respect to the origin, hence this +procedure should prove faster than accessing an array of pointers. +*/ +@(require_results) +heap_superpage_index_slab :: #force_inline proc "contextless" (superpage: ^Heap_Superpage, index: int) -> (slab: ^Heap_Slab) { + assert_contextless(index >= 0, "The heap allocator tried to index a negative slab index.") + assert_contextless(index < HEAP_SLAB_COUNT, "The heap allocator tried to index a slab beyond the configured maximum.") + return cast(^Heap_Slab)(uintptr(superpage) + HEAP_SLAB_SIZE * uintptr(1 + index)) +} + +// +// Data Structures +// + +/* +The **Slab** is a fixed-size division of a Superpage, configured at runtime to +contain fixed-size allocations. It uses two bitmaps to keep track of the +state of its bins: whether a bin is locally free or remotely free. + +It is a Slab allocator in its own right, hence the name. + +Each allocation is self-aligned, up to an alignment size of `HEAP_MAX_ALIGNMENT` +(64 bytes by default). The slab itself is aligned to its size, allowing +allocations within to find their owning slab in constant time. + +Remote threads, in an atomic wait-free manner, flip bits across `remote_free` +to signal that they have freed memory which does not belong to them. The owning +thread will then collect those bits when the slab is approaching fullness. + + +**Fields**: + +`index` is used to make self-referencing easier. It must be the first field in +the Slab, as `heap_slab_clear_data` depends on this. + +`bin_size` tracks the precise byte size of the allocations. + +`is_dirty` indicates that the slab has been used in the past and its memory may +not be entirely zeroed. + +`is_full` is true at some point after `free_bins` == 0 and false otherwise. +It is atomic and used for inter-thread communication. + +`has_remote_frees` is true at some point after a bit has been flipped on in +`remote_free`. It is atomic and used for inter-thread communication. + +`max_bins` is set at initialization with the number of bins allotted. + +`free_bins` counts the exact number of free bins known to the allocating +thread. This value does not yet include any remote free bins. + +`dirty_bins` tracks which bins have been used and must be zeroed before being +handed out again. Because we always allocate linearly, this is simply an +integer indicating the greatest index the slab has ever distributed. 
+ +`next_free_sector` points to which `uint` in `local_free` contains a free bit. +It is always the lowest index possible. + +`sectors` is the length of each bitmap. + +`local_free` tracks which bins are free. + +`remote_free` tracks which bins have been freed by other threads. +It is atomic. + +`data` points to the first bin and is used for calculating bin positions. +*/ +Heap_Slab :: struct { + index: int, // This field must be the first. + bin_size: int, + is_dirty: bool, + + max_bins: int, + free_bins: int, + dirty_bins: int, + next_free_sector: int, + + is_full: bool, // atomic + remote_free_bins_scheduled: int, // atomic + + sectors: int, + local_free: [^]uint, + remote_free: [^]uint, // data referenced is atomic + + // NOTE: `remote_free` and its contents should be fine with regards to + // alignment for the purposes of atomic access, as every field in a Slab is + // at least register-sized. + // + // Atomically accessing memory that is not aligned to the register size + // may cause an issue on some systems. + + data: uintptr, + + /* ... local_free's data ... */ + /* ... remote_free's data ... */ + /* ... allocation data ... */ +} + +/* +The **Superpage** is a single allocation from the operating system's virtual +memory subsystem, always with a fixed size at compile time, that is used to +store almost every allocation requested by the program in sub-allocators known +as Slabs. + +On almost every platform, this structure will be 2MiB by default. + +Depending on the operating system, addresses within the space occupied by the +superpage (and hence its allocations) may also have faster access times due to +leveraging properties of the Translation Lookaside Buffer. + +It is always aligned to its size, allowing any address allocated from its space +to look up the owning superpage in constant-time with bit masking. + +**Fields:** + +`huge_size` is precisely how many bytes have been allocated to the address at +which this structure resides, if it is a Huge allocation. It is zero for normal +superpages. + +`prev` points to the previous superpage in the doubly-linked list of all +superpages for a single thread's heap. + +`next` points to the next superpage in the same doubly-linked list. It is +accessed with atomics only when engaging with the orphanage, as that is the +only time it should change in a parallel situation. For all other cases, the +thread which owns the superpage is the only one to read this field. + +`remote_free_set` is true if there might be a slab with remote frees in this superpage. + +`owner` is the value of `current_thread_id` for the thread which owns this superpage.. + +`master_cache_block` points to the `local_heap_cache` for the thread which owns this +superpage. This field is used to help synchronize remote freeing. + +`free_slabs` is the count of slabs which are ready to use for new bin ranks. + +`next_free_slab_index` is the lowest index of a free slab. +A value of `HEAP_SLAB_COUNT` means there are no free slabs. + +`cache_block` is the space where data for the heap's cache is stored, if this +superpage has been claimed by the heap. It should otherwise be all zero. +*/ +Heap_Superpage :: struct { + huge_size: int, // This field must be the first. 
+ prev: ^Heap_Superpage, + next: ^Heap_Superpage, // atomic in orphanage, otherwise non-atomic + + remote_free_set: bool, // atomic + owner: int, // atomic + master_cache_block: ^Heap_Cache_Block, // atomic + + free_slabs: int, + next_free_slab_index: int, + + cache_block: Heap_Cache_Block, +} + +/* +`Heap_Cache_Block` is a structure that lives within each `Heap_Superpage` that has been +claimed by a thread's heap to store heap-relevant metadata for improving +allocator performance. This structure makes use of space that would have +otherwise been unused due to the particular alignment needs the allocator has. + +The number of superpages that each `Heap_Cache_Block` oversees is dictated by the +`HEAP_SUPERPAGE_CACHE_RATIO` constant. As a thread's heap accumulates more +superpages, the heap will need to claim additional superpages - at the rate of +that constant - to keep track of all the possible data. + +The arrays are configured in such a manner that they can never overflow, and +the heap will always have more than enough space to record the data needed. + + +The `HEAP_CACHE_SLAB_MAP_STRIDE` is of note, as it must contain the number of slabs +per superpage (31, by default) times the number of superpages overseen per +`Heap_Cache_Block`. This is largely where all the space is allocated, but it pays off +handsomely in allowing the allocator to find if a slab is available for any +particular bin size in constant time. + +In practice, this is an excessive amount of space allocated for a map of arrays +for this purpose, but it is entirely possible that the allocator could get into +a spot where this hash map could overflow if it used any less space. For +instance, if the allocator had as many superpages as possible for one +`Heap_Cache_Block`, filled all the slabs, then the program freed one bin from each +slab, the allocator would need every slot in the map. The cache has as much +space to prevent just that problem from arising. + +**Fields:** + +`in_use` will be true, if the parent `Heap_Superpage` is using the space occupied by +this struct. This indicates that the superpage should not be freed. + +`next_cache_block` is an atomic pointer to the next `Heap_Cache_Block`. Each +`Heap_Cache_Block` is linked together in this way to allow each heap to expand +its available space for tracking information. + + +_The next three fields are only used in the `Heap_Superpage` pointed to by `local_heap`._ + +`length` is how many `Heap_Cache_Block` structs are in use across the local +thread's heap. + +`owned_superpages` is how many superpages are in use by the local thread's +heap. + +`remote_free_count` is an estimate of how many superpages have slabs with +remote frees available to merge. + + +_The next three fields constitute the main data used in this struct, and each +of them are like partitions, spread across a linked list._ + +`slab_map` is a hash map of arrays, keyed by bin rank, used to quickly find a +slab with free bins for a particular bin size. It uses linear probing. + +`superpages_with_free_slabs` is an array of superpages with `free_slabs` +greater than zero, used for quickly allocating new slabs when none can be found +in `slab_map`. + +`superpages_with_remote_frees` is an atomic set containing references to +superpages that may have slabs with remotely free bins. Superpages are only +added if the slab was full at the time, and an invasive flag keeps the +algorithm from having to search the entire span for an existence check. 
+ +Keep in mind that even though a superpage is added to this set when a slab is +full and freed remotely, that state need not persist; the owning thread can +free a bin before the remote frees are merged, invalidating that part of the +heuristic. +*/ +Heap_Cache_Block :: struct { + in_use: bool, + next_cache_block: ^Heap_Cache_Block, // atomic + + // { only used in `local_heap` + length: int, + owned_superpages: int, + remote_free_count: int, // atomic + // } + + slab_map: [HEAP_CACHE_SLAB_MAP_STRIDE*HEAP_BIN_RANKS]^Heap_Slab, + superpages_with_free_slabs: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, + superpages_with_remote_frees: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, // atomic +} + +// +// Superstructure Allocation +// + +/* +Allocate memory for a Superpage from the operating system and do any initialization work. +*/ +@(require_results) +heap_make_superpage :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = cast(^Heap_Superpage)allocate_virtual_memory_superpage() + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.owner = get_current_thread_id() + superpage.free_slabs = HEAP_SLAB_COUNT + + // Each Slab is aligned to its size, so that finding which slab an + // allocated pointer is assigned to is a constant operation by bit masking. + // + // However, this means we must waste one Slab per Superpage so that the + // Superpage data can live nearby inside the chunk of virtual memory. + base := uintptr(superpage) + HEAP_SLAB_SIZE + for i in 0..= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_On_Full_Orphanage) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Pushed_To_Orphanage) + } +} + +/* +Remove a superpage from a heap's cache of superpages. +*/ +heap_cache_unregister_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage.cache_block.in_use { + return + } + local_heap_cache.owned_superpages -= 1 + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Release) + heap_debug_cover(.Superpage_Unregistered) +} + +/* +Add a superpage to a heap's cache of superpages. + +This will take note if the superpage has free slabs, what are the contents of +its slabs, and it will merge any waiting remote free bins. +*/ +heap_cache_register_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + // Expand the heap cache's available space if needed. + local_heap_cache.owned_superpages += 1 + if local_heap_cache.owned_superpages / HEAP_SUPERPAGE_CACHE_RATIO > local_heap_cache.length { + tail := local_heap_cache + for /**/; tail.next_cache_block != nil; tail = tail.next_cache_block { } + heap_cache_expand(tail) + } + + // This must come after expansion to prevent a remote thread from running out of space. + // The cache is first expanded, _then_ other threads may know about it. + intrinsics.atomic_store_explicit(&superpage.master_cache_block, local_heap_cache, .Release) + + // Register the superpage for allocation. + if superpage.free_slabs > 0 { + // Register the superpage as having free slabs available. + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Registered_With_Free_Slabs) + } + + // Register slabs. 
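+ // Walk every slab header: continuation slabs of wide allocations are
+ // skipped, remote frees that accumulated while the superpage sat in the
+ // orphanage are merged, slabs that turn out to be completely empty are
+ // released, and partially used bin slabs are re-added to the slab map.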
+ if superpage.free_slabs < HEAP_SLAB_COUNT { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + // When adopting a new Superpage, we take the opportunity to + // merge any remote frees. This is important because it's + // possible for another thread to remotely free memory while + // the thread which owned it is in limbo. + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + } + + // Free any empty Slabs and register the ones with free bins. + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + } else if slab.bin_size <= HEAP_MAX_BIN_SIZE && slab.free_bins > 0 { + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + + i += 1 + heap_debug_cover(.Superpage_Registered_With_Slab_In_Use) + } + } +} + +/* +Get a superpage, first by adopting any orphan, or allocating memory from the +operating system if one is not available. +*/ +@(require_results) +heap_get_superpage:: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = heap_pop_orphan() + if superpage == nil { + superpage = heap_make_superpage() + heap_debug_cover(.Superpage_Created_By_Empty_Orphanage) + } else { + heap_debug_cover(.Superpage_Adopted_From_Orphanage) + } + assert_contextless(superpage != nil, "The heap allocator failed to get a superpage.") + return +} + +/* +Make an allocation in the Huge category. +*/ +@(require_results) +heap_make_huge_allocation :: proc "contextless" (size: int) -> (ptr: rawptr) { + // NOTE: ThreadSanitizer may wrongly say that this is the source of a data + // race. This is because a virtual memory address has been allocated once, + // returned to the operating system, then given back to the process in a + // different thread. + // + // It is otherwise impossible for us to race on newly allocated memory. + size := size + if size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + size = SUPERPAGE_SIZE + heap_debug_cover(.Huge_Alloc_Size_Set_To_Superpage) + } else { + size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Huge_Alloc_Size_Adjusted) + } + assert_contextless(size >= SUPERPAGE_SIZE, "Calculated incorrect Huge allocation size.") + + // All free operations assume every pointer has a Superpage at the + // Superpage boundary of the pointer, and a Huge allocation is no + // different. + // + // The size of the allocation is written as an integer at the beginning, + // but all other fields are left zero-initialized. We then align forward to + // `HEAP_MAX_ALIGNMENT` (64 bytes, by default) and give that to the user. 
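+ // With the defaults on a 64-bit target this works out to superpage + 64:
+ // HEAP_HUGE_ALLOCATION_BOOK_KEEPING is 8 + 64 = 72 bytes, and rounding
+ // superpage + 72 down to the nearest 64-byte boundary yields superpage + 64,
+ // so `huge_size` sits in the first 8 bytes and the rest is padding.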
+ superpage := cast(^Heap_Superpage)allocate_virtual_memory_aligned(size, SUPERPAGE_SIZE) + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.huge_size = size + + u := uintptr(superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Huge allocation is not aligned to HEAP_MAX_ALIGNMENT.") + assert_contextless(find_superpage_from_pointer(ptr) == superpage, "Huge allocation reverse lookup failed.") + + return +} + +/* +Make an allocation that is at least one entire Slab wide from the provided superpage. + +This will return false if the Superpage lacks enough contiguous Slabs to fit the size. +*/ +@(require_results) +heap_make_slab_sized_allocation :: proc "contextless" (superpage: ^Heap_Superpage, size: int) -> (ptr: rawptr, ok: bool) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "Invalid next_free_slab_index.") + contiguous := heap_slabs_needed_for_size(size) + + find_run: for start := superpage.next_free_slab_index; start < HEAP_SLAB_COUNT-contiguous+1; /**/ { + n := contiguous + + for i := start; i < HEAP_SLAB_COUNT; /**/ { + bin_size := heap_superpage_index_slab(superpage, i).bin_size + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + start = i + heap_slabs_needed_for_size(bin_size) + continue find_run + } else if bin_size != 0 { + start = i + 1 + continue find_run + } + n -= 1 + if n > 0 { + i += 1 + continue + } + // Setup the Slab header. + // This will be a single-sector Slab that may span several Slabs. + slab := heap_superpage_index_slab(superpage, start) + + // Setup slab. + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = size + slab.is_full = true + slab.max_bins = 1 + slab.dirty_bins = 1 + slab.sectors = 1 + slab.local_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab)) + slab.remote_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab) + 1 * size_of(uint)) + data := uintptr(slab) + HEAP_SLAB_ALLOCATION_BOOK_KEEPING + ptr = rawptr(data - data & (HEAP_MAX_ALIGNMENT-1)) + slab.data = uintptr(ptr) + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Slab-wide allocation's data pointer is not correctly aligned.") + assert_contextless(int(uintptr(ptr) - uintptr(superpage)) + size < SUPERPAGE_SIZE, "Incorrectly calculated Slab-wide allocation exceeds Superpage end boundary.") + + // Wipe any non-zero data from slabs ahead of the header. + for x in start+1..=i { + next_slab := heap_superpage_index_slab(superpage, x) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + + // Update statistics. + superpage.free_slabs -= contiguous + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a superpage's free_slabs to go negative.") + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } + // NOTE: Start from zero again, because we may have skipped a non-contiguous block. + heap_update_next_free_slab_index(superpage, 0) + + return ptr, true + } + } + + return +} + +// +// Slabs +// + +/* +Do everything that is needed to ready a Slab for a specific bin size. + +This involves a handful of space calculations and writing the header. 
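+
+For example, with the default 64 KiB slab and a 4 KiB bin size on a typical
+64-bit target: the slab starts out with 16 candidate bins; the header, the two
+bitmaps, and the alignment padding all fit inside a single bin, so one bin is
+given up for book-keeping, leaving 15 usable bins tracked by a single sector.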
+*/ +@(require_results) +heap_slab_setup :: proc "contextless" (superpage: ^Heap_Superpage, rounded_size: int) -> (slab: ^Heap_Slab) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "The heap allocator found a Superpage with an invalid next_free_slab_index.") + assert_contextless(superpage.free_slabs > 0, "The heap allocator tried to setup a Slab in an exhausted Superpage.") + + superpage.free_slabs -= 1 + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a Superpage's free_slabs to go negative.") + + slab = heap_superpage_index_slab(superpage, superpage.next_free_slab_index) + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = rounded_size + + // The book-keeping structures compete for the same space as the data, + // so we have to go back and forth with the math a bit. + bins := HEAP_SLAB_SIZE / rounded_size + sectors := bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + // We'll waste `2 * HEAP_MAX_ALIGNMENT` bytes per slab to simplify the math + // behind getting the pointer to the first bin to align properly. + // Otherwise we'd have to go back and forth even more. + bookkeeping_bin_cost := max(1, int(size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + 2 * HEAP_MAX_ALIGNMENT) / rounded_size) + bins -= bookkeeping_bin_cost + sectors = bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + slab.sectors = sectors + slab.free_bins = bins + slab.max_bins = bins + + base_alignment := uintptr(min(HEAP_MAX_ALIGNMENT, rounded_size)) + + slab_bitmap_base := uintptr(slab) + size_of(Heap_Slab) + total_byte_size_of_all_bitmaps := HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + pointer_padding := (uintptr(base_alignment) - (slab_bitmap_base + total_byte_size_of_all_bitmaps)) & uintptr(base_alignment - 1) + + // These bitmaps are placed at the end of the struct, one after the other. + slab.local_free = cast([^]uint)(slab_bitmap_base) + slab.remote_free = cast([^]uint)(slab_bitmap_base + uintptr(sectors) * size_of(uint)) + // This pointer is specifically aligned. + slab.data = (slab_bitmap_base + total_byte_size_of_all_bitmaps + pointer_padding) + + assert_contextless(slab.data & (base_alignment-1) == 0, "Incorrect calculation for aligning Slab data pointer.") + assert_contextless(size_of(Heap_Slab) + int(total_byte_size_of_all_bitmaps) + bins * rounded_size < HEAP_SLAB_SIZE, "Slab internal allocation overlimit.") + assert_contextless(find_slab_from_pointer(rawptr(slab.data)) == slab, "Slab data pointer cannot be traced back to its Slab.") + + // Set all of the local free bits. + { + full_sectors := bins / INTEGER_BITS + partial_sector := bins % INTEGER_BITS + for i in 0.. 0 { + slab.local_free[sectors-1] = (1 << uint(bins % INTEGER_BITS)) - 1 + heap_debug_cover(.Slab_Adjusted_For_Partial_Sector) + } + } + + // Update the next free slab. + heap_update_next_free_slab_index(superpage, superpage.next_free_slab_index + 1) + + // Make the slab known to the heap. + heap_cache_add_slab(slab, heap_bin_size_to_rank(rounded_size)) + + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Removed_From_Open_Cache_By_Slab) + } + + return +} + +/* +Set everything after the index field to zero in a Slab. 
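+
+The `index` field is preserved because it is the first field of the Slab and
+records the slab's fixed position within its superpage, which never changes
+between reuses.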
+*/ +heap_slab_clear_data :: proc "contextless" (slab: ^Heap_Slab) { + intrinsics.mem_zero_volatile(rawptr(uintptr(slab) + size_of(int)), HEAP_SLAB_SIZE - size_of(int)) +} + +/* +Mark a Slab-wide allocation as no longer in use by the Superpage. +*/ +heap_free_wide_slab :: proc "contextless" (superpage: ^Heap_Superpage, slab: ^Heap_Slab) { + assert_contextless(slab.bin_size > HEAP_MAX_BIN_SIZE, "The heap allocator tried to wide-free a non-wide slab.") + // There is only one bit for a Slab-wide allocation, so this will be easy. + contiguous := heap_slabs_needed_for_size(slab.bin_size) + previously_full_superpage := superpage.free_slabs == 0 + superpage.free_slabs += contiguous + + for i in slab.index.. (bins_left: int) { + assert_contextless(slab.bin_size > 0, "The heap allocator tried to merge remote frees on an unused slab.") + + merged_bins := 0 + next_free_sector := slab.next_free_sector + + // Atomically merge in all of the bits set by other threads. + for i in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(bin_size) + } else { + i += 1 + } + } + + panic_contextless("The heap allocator was unable to find a free slab in a superpage with free_slabs > 0.") +} + +// +// The Heap Cache +// + +/* +Link a new superpage into the heap. +*/ +heap_link_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(superpage.prev == nil, "The heap allocator tried to link an already-linked superpage.") + superpage.prev = local_heap_tail + local_heap_tail.next = superpage + local_heap_tail = superpage + heap_debug_cover(.Superpage_Linked) +} + +/* +Unlink a superpage from the heap. + +There must always be at least one superpage in the heap after the first +non-Huge allocation, in order to track heap metadata. +*/ +heap_unlink_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage == local_heap_tail { + assert_contextless(superpage.next == nil, "The heap allocator's tail superpage has a next link.") + assert_contextless(superpage.prev != nil, "The heap allocator's tail superpage has no previous link.") + local_heap_tail = superpage.prev + // We never unlink all superpages, so no need to check validity here. + superpage.prev.next = nil + heap_debug_cover(.Superpage_Unlinked_Tail) + return + } + if superpage.prev != nil { + superpage.prev.next = superpage.next + } + if superpage.next != nil { + superpage.next.prev = superpage.prev + } + heap_debug_cover(.Superpage_Unlinked_Non_Tail) +} + +/* +Mark a superpage from another thread as having had a bin remotely freed. +*/ +heap_remote_cache_add_remote_free_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + // Already set. + heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Set) + return + } + master := intrinsics.atomic_load_explicit(&superpage.master_cache_block, .Acquire) + if master == nil { + // This superpage is not owned by anyone. + // Its remote frees will be acknowledged in whole when it's adopted. 
+ heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Masterless) + return + } + defer heap_debug_cover(.Superpage_Added_Remote_Free) + + cache := master + for { + for i := 0; i < len(cache.superpages_with_remote_frees); i += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache.superpages_with_remote_frees[i], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "A remote thread found a duplicate of a superpage in a heap's superpages_with_remote_frees.") + if swapped { + intrinsics.atomic_add_explicit(&master.remote_free_count, 1, .Release) + return + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Acquire) + assert_contextless(next_cache_block != nil, "A remote thread failed to find free space for a new entry in another heap's superpages_with_remote_frees.") + cache = next_cache_block + } +} + +/* +Claim an additional superpage for the heap cache. +*/ +heap_cache_expand :: proc "contextless" (cache: ^Heap_Cache_Block) { + superpage := find_superpage_from_pointer(cache) + assert_contextless(superpage.next != nil, "The heap allocator tried to expand its cache but has run out of linked superpages.") + new_next_cache_block := &(superpage.next).cache_block + assert_contextless(new_next_cache_block.in_use == false, "The heap allocator tried to expand its cache, but the candidate superpage is already using its cache block.") + new_next_cache_block.in_use = true + intrinsics.atomic_store_explicit(&cache.next_cache_block, new_next_cache_block, .Release) + local_heap_cache.length += 1 + heap_debug_cover(.Heap_Expanded_Cache_Data) +} + +/* +Add a slab to the heap's cache, keyed to the bin size rank of `rank`. +*/ +heap_cache_add_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + assert_contextless(slab != nil) + cache := local_heap_cache + for { + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + assert_contextless(cache.slab_map[i] != slab, "The heap allocator found a duplicate entry in its slab map.") + if cache.slab_map[i] == nil { + cache.slab_map[i] = slab + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Get a slab from the heap's cache for an allocation of `rounded_size` bytes. + +If no slabs are already available for the bin rank, one will be created. +*/ +@(require_results) +heap_cache_get_slab :: proc "contextless" (rounded_size: int) -> (slab: ^Heap_Slab) { + rank := heap_bin_size_to_rank(rounded_size) + slab = local_heap_cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE] + if slab == nil { + superpage := local_heap_cache.superpages_with_free_slabs[0] + if superpage == nil { + superpage = heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + } + assert_contextless(superpage.free_slabs > 0) + slab = heap_slab_setup(superpage, rounded_size) + } + return +} + +/* +Remove a slab with the corresponding bin rank from the heap's cache. +*/ +heap_cache_remove_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + cache := local_heap_cache + assert_contextless(cache.in_use) + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank))) + for { + assert_contextless(cache != nil) + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + if cache.slab_map[i] == slab { + // Swap with the tail. 
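+ // The stride for each bin rank is kept densely packed: the entry being
+ // removed is overwritten with the last occupied entry of that stride
+ // (which may live in a later cache block), and that tail slot is cleared.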
+ source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < start+HEAP_CACHE_SLAB_MAP_STRIDE; j += 1 { + if target_cache.slab_map[j] == nil { + cache.slab_map[source_i] = target_cache.slab_map[j-1] + target_cache.slab_map[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the stride and we have + // run out of space to search. + cache.slab_map[source_i] = nil + assert_contextless(source_i % (HEAP_CACHE_SLAB_MAP_STRIDE-1) == 0, "The heap allocator tried to remove a non-terminal slab map entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.slab_map[start] == nil { + // The starting bucket in the next cache block is empty, + // so we terminate on the current stride's final bucket. + cache.slab_map[source_i] = target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] + target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = start + } + } + } + // Entry must be in the expanded cache blocks. + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Make an allocation using contiguous slabs as the backing. + +This procedure will check through the heap's cache for viable superpages to +support the allocation. +*/ +@(require_results) +heap_cache_get_contiguous_slabs :: proc "contextless" (size: int) -> (ptr: rawptr) { + contiguous := heap_slabs_needed_for_size(size) + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + // No superpages with free slabs left. + heap_debug_cover(.Alloc_Slab_Wide_Needed_New_Superpage) + for { + superpage := heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } else { + heap_debug_cover(.Alloc_Slab_Wide_Used_Available_Superpage) + superpage := cache.superpages_with_free_slabs[i] + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Add a superpage with free slabs to the heap's cache. +*/ +heap_cache_add_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id(), "The heap allocator tried to cache a superpage that does not belong to it.") + cache := local_heap_cache + + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + cache.superpages_with_free_slabs[i] = superpage + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Remove a superpage from the heap's cache for superpages with free slabs. +*/ +heap_cache_remove_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == superpage { + // Swap with the tail. 
+ source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < len(cache.superpages_with_free_slabs); j += 1 { + if target_cache.superpages_with_free_slabs[j] == nil { + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[j-1] + target_cache.superpages_with_free_slabs[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the list and we have run + // out of space to search. + cache.superpages_with_free_slabs[source_i] = nil + assert_contextless(source_i == len(cache.superpages_with_free_slabs), "The heap allocator tried to remove a non-terminal superpage with free slabs entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.superpages_with_free_slabs[0] == nil { + // The next list section is empty, so we have to end here. + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] + target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = 0 + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +// +// Superpage Orphanage +// + +// This construction is used to avoid the ABA problem. +// +// Virtually all systems we support should have a 64-bit CAS to make this work. +Tagged_Pointer :: bit_field u64 { + // Intel 5-level paging uses up to 56 bits of a pointer on x86-64. + // This should be enough to cover ARM64, too. + // + // We use an `i64` here to maintain sign extension on the upper bits for + // systems where this is relevant, hence the extra bit over 56. + pointer: i64 | 57, + // We only need so many bits that enough transactions don't happen so fast + // as to roll over the value to cause a situation where ABA can manifest. + version: u8 | 7, +} + +/* +Put a Superpage into the global orphanage. + +The caller is responsible for fetch-adding the count. +*/ +heap_push_orphan :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != 0, "The heap allocator tried to push an unowned superpage to the orphanage.") + + // The algorithm below is one of the well-known methods of resolving the + // ABA problem, known as a tagged pointer. The gist is that if another + // thread has changed the value, we'll be able to detect that by checking + // against the version bits. + old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Seq_Cst) + // NOTE: This next instruction must not float above the previous one, as + // this superpage could host the `master_cache_block`. The order is important + // to keep other threads from trying to access it while we're clearing it. + if superpage.cache_block.in_use { + intrinsics.mem_zero_volatile(&superpage.cache_block, size_of(Heap_Cache_Block)) + heap_debug_cover(.Superpage_Cache_Block_Cleared) + } + superpage.prev = nil + intrinsics.atomic_store_explicit(&superpage.owner, 0, .Release) + for { + // NOTE: `next` is accessed atomically when pushing or popping from the + // orphanage, because this field must synchronize with other threads at + // this point. + // + // This has to do mainly with swinging the head's linking pointer. 
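
A self-contained sketch, separate from the patch, of why the 7-bit version defeats ABA: the orphanage head is compared and swapped as a single 64-bit word, so a stale snapshot whose pointer bits happen to match the current head again still fails the CAS, because the version bits have moved on. The type below only mirrors the Tagged_Pointer layout above.

package main

import "core:fmt"

// Mirrors the Tagged_Pointer layout above: 57 sign-extended pointer bits
// plus a small version counter, packed into one CAS-able 64-bit word.
Demo_Tagged_Pointer :: bit_field u64 {
	pointer: i64 | 57,
	version: u8  | 7,
}

main :: proc() {
	x: int
	p := uintptr(&x)

	head: Demo_Tagged_Pointer
	head.pointer = i64(p)
	head.version = 3

	// Snapshot the head as a plain word, the way the CAS loop does.
	snapshot := transmute(u64)head

	// Another thread pops and later pushes the very same superpage: the
	// pointer bits return to their old value, but the version has moved on.
	head.version += 2
	current := transmute(u64)head

	fmt.println(uintptr(head.pointer) == p) // true: same pointer bits
	fmt.println(current == snapshot)        // false: a CAS on the stale snapshot fails
}

As the comment above notes, this scheme only works where a 64-bit compare-and-swap is available, which covers the targets this allocator builds for.
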
+ // + // Beyond this point, the thread which owns the superpage will be the + // only one to read `next`, hence why it is not read atomically + // anywhere else. + intrinsics.atomic_store_explicit(&superpage.next, cast(^Heap_Superpage)uintptr(old_head.pointer), .Release) + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(superpage)) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } +} + +/* +Remove and return the first entry from the Superpage orphanage, which may be nil. +*/ +@(require_results) +heap_pop_orphan :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + for { + superpage = cast(^Heap_Superpage)uintptr(old_head.pointer) + if superpage == nil { + return + } + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(intrinsics.atomic_load_explicit(&superpage.next, .Acquire))) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + intrinsics.atomic_store_explicit(&superpage.next, nil, .Release) + intrinsics.atomic_store_explicit(&superpage.owner, get_current_thread_id(), .Release) + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Release) + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } + return +} + +// +// Globals +// + +@(init, private) +setup_superpage_orphanage :: proc "contextless" () { + when !VIRTUAL_MEMORY_SUPPORTED { + return + } + + // Upon a thread's clean exit, this procedure will compact its heap and + // distribute the superpages into the orphanage, if it has space. + add_thread_local_cleaner(proc "odin" () { + for superpage := local_heap; superpage != nil; /**/ { + next_superpage := superpage.next + // The following logic is a specialized case of the same found in + // `compact_heap` that ignores the cache since there's no need to + // update it when the thread's heap is being broken down. + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. 
+ intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + heap_debug_cover(.Orphaned_Superpage_Merged_Remote_Frees) + } + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + heap_debug_cover(.Orphaned_Superpage_Freed_Slab) + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT { + if intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Acq_Rel) >= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_By_Exiting_Thread) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + } else { + intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Release) + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + + superpage = next_superpage + } + }) +} + +// This is a lock-free, intrusively singly-linked list of Superpages that are +// not in any thread's heap. They are free for adoption by other threads +// needing memory. +heap_orphanage: Tagged_Pointer + +// This is an _estimate_ of the number of Superpages in the orphanage. It can +// never be entirely accurate due to the nature of the design. +heap_orphanage_count: int + +@(thread_local) local_heap: ^Heap_Superpage +@(thread_local) local_heap_tail: ^Heap_Superpage +@(thread_local) local_heap_cache: ^Heap_Cache_Block + +// +// API +// + +/* +Allocate an arbitrary amount of memory from the heap and optionally zero it. +*/ +@(require_results) +heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: rawptr) { + assert_contextless(size >= 0, "The heap allocator was given a negative size.") + + // Handle Huge allocations. + if size >= HEAP_HUGE_ALLOCATION_THRESHOLD { + heap_debug_cover(.Alloc_Huge) + return heap_make_huge_allocation(size) + } + + // Initialize the heap if needed. + if local_heap == nil { + local_heap = heap_get_superpage() + local_heap_tail = local_heap + local_heap_cache = &local_heap.cache_block + local_heap_cache.in_use = true + heap_cache_register_superpage(local_heap) + heap_debug_cover(.Alloc_Heap_Initialized) + } + + // Take care of any remote frees. + remote_free_count := intrinsics.atomic_load_explicit(&local_heap_cache.remote_free_count, .Acquire) + if remote_free_count > 0 { + cache := local_heap_cache + removed := 0 + counter := remote_free_count + // Go through all superpages in the cache for superpages with remote + // frees to see if any still have frees needing merged. + merge_loop: for { + consume_loop: for i in 0.. 0 && slab.free_bins == 0 && intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + if slab.free_bins == 0 { + // No bins were freed at all, which is possible due + // to the parallel nature of this code. The freeing + // thread could have signalled its intent, but we + // merged before it had a chance to flip the + // necessary bit. + should_reschedule = true + break merge_block + } + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + + if bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab.free_bins == slab.max_bins { + heap_free_slab(superpage, slab) + } else { + heap_cache_add_slab(slab, heap_bin_size_to_rank(bin_size)) + } + } + } + + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. 
+ j += heap_slabs_needed_for_size(bin_size) + } else { + j += 1 + } + } + + if should_reschedule { + // This is the logic found in `heap_remote_cache_add_remote_free_superpage`, simplified. + if !intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + cache_reschedule := local_heap_cache + reschedule_loop: for { + for j := 0; j < len(cache_reschedule.superpages_with_remote_frees); j += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache_reschedule.superpages_with_remote_frees[j], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "The heap allocator found a duplicate of a superpage in its superpages_with_remote_frees while rescheduling.") + if swapped { + intrinsics.atomic_add_explicit(&local_heap_cache.remote_free_count, 1, .Release) + break reschedule_loop + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache_reschedule.next_cache_block, .Acquire) + assert_contextless(next_cache_block != nil, "The heap allocator failed to find free space for a new entry in its superpages_with_remote_frees cache.") + cache_reschedule = next_cache_block + } + } + } else { + removed += 1 + } + + counter -= 1 + if counter == 0 { + break merge_loop + } + } + if cache.next_cache_block == nil { + break merge_loop + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } + intrinsics.atomic_sub_explicit(&local_heap_cache.remote_free_count, removed, .Release) + heap_debug_cover(.Alloc_Collected_Remote_Frees) + } + + // Handle slab-wide allocations. + if size > HEAP_MAX_BIN_SIZE { + heap_debug_cover(.Alloc_Slab_Wide) + return heap_cache_get_contiguous_slabs(size) + } + + // Get a suitable slab from the heap. + rounded_size := heap_round_to_bin_size(size) + slab := heap_cache_get_slab(rounded_size) + assert_contextless(slab.bin_size == rounded_size, "The heap allocator found a slab with the wrong bin size during allocation.") + + // Allocate a bin inside the slab. + sector := slab.next_free_sector + sector_bits := slab.local_free[sector] + assert_contextless(sector_bits != 0, "The heap allocator found a slab with a full next_free_sector.") + + // Select the lowest free bit. + index := uintptr(intrinsics.count_trailing_zeros(sector_bits)) + + // Convert the index to a pointer. + ptr = rawptr(slab.data + (uintptr(sector * SECTOR_BITS) + index) * uintptr(rounded_size)) + when !ODIN_DISABLE_ASSERT { + base_alignment := min(HEAP_MAX_ALIGNMENT, uintptr(rounded_size)) + assert_contextless(uintptr(ptr) & uintptr(base_alignment-1) == 0, "A pointer allocated by the heap is not well-aligned.") + } + + // Clear the free bit. + slab.local_free[sector] &~= (1 << index) + + // Zero the memory, if needed. + if zero_memory && index < uintptr(slab.dirty_bins) { + // Ensure that the memory zeroing is not optimized out by the compiler. + intrinsics.mem_zero_volatile(ptr, rounded_size) + // NOTE: A full memory fence should not be needed for any newly-zeroed + // allocation, as each thread controls its own heap, and for one thread + // to pass a memory address to another implies some secondary + // synchronization method, such as a mutex, which would be the way by + // which the threads come to agree on the state of main memory. + heap_debug_cover(.Alloc_Zeroed_Memory) + } + + // Update statistics. + slab.dirty_bins = int(max(slab.dirty_bins, 1 + sector * SECTOR_BITS + int(index))) + slab.free_bins -= 1 + + // Remove the slab from the cache if the slab's full. 
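
The bin accounting in the allocation path above is a plain free-bitmap: a set bit in a local_free sector means that bin is free, count_trailing_zeros picks the lowest free bin, and the flat bin number times the bin size recovers the address. Below is a toy version of both directions, separate from the patch, with made-up sector counts and helper names.

package main

import "base:intrinsics"
import "core:fmt"

// Made-up sizes for the sketch; a real slab derives these from its bin size.
SECTORS         :: 4
BITS_PER_SECTOR :: 64

// A set bit means the bin is free, as in a slab's local_free sectors.
Bitmap :: [SECTORS]u64

// Take the lowest free bin: find the first non-empty sector, pick its lowest
// set bit, clear it, and return the flat bin number.
take_lowest_free_bin :: proc(bm: ^Bitmap) -> (bin: int, ok: bool) {
	for sector in 0..<SECTORS {
		bits := bm[sector]
		if bits == 0 {
			continue
		}
		index := uint(intrinsics.count_trailing_zeros(bits))
		bm[sector] &~= 1 << index
		return sector * BITS_PER_SECTOR + int(index), true
	}
	return 0, false
}

// Free a bin: split the flat bin number back into sector and bit index,
// mirroring the arithmetic on the free path.
release_bin :: proc(bm: ^Bitmap, bin: int) {
	sector := bin / BITS_PER_SECTOR
	index := uint(bin) % BITS_PER_SECTOR
	bm[sector] |= 1 << index
}

main :: proc() {
	bm: Bitmap
	for i in 0..<SECTORS {
		bm[i] = ~u64(0) // everything starts free
	}
	a, _ := take_lowest_free_bin(&bm)
	b, _ := take_lowest_free_bin(&bm)
	release_bin(&bm, a)
	c, _ := take_lowest_free_bin(&bm)
	fmt.println(a, b, c) // 0 1 0
}
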
+ // Otherwise, update the next free sector if the sector's full. + if slab.free_bins == 0 { + slab.next_free_sector = slab.sectors + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Seq_Cst) > 0 { + heap_merge_remote_frees(slab) + if slab.free_bins == 0 { + // We have come before the other thread, and the bit we needed to find was not set. + // Treat it as if it is full anyway. + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } else { + // No cache adjustment happens here; we know it's already in the cache. + } + } else { + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } + } else { + if slab.local_free[sector] == 0 { + sector += 1 + for /**/; sector < slab.sectors; sector += 1 { + if slab.local_free[sector] != 0 { + break + } + } + slab.next_free_sector = sector + } + } + heap_debug_cover(.Alloc_Bin) + return +} + +/* +Free memory returned by `heap_alloc`. +*/ +heap_free :: proc "contextless" (ptr: rawptr) { + // Check for nil. + if ptr == nil { + return + } + + superpage := find_superpage_from_pointer(ptr) + + // Check if this is a huge allocation. + if superpage.huge_size > 0 { + // NOTE: If the allocator is passed a pointer that it does not own, + // which is a scenario that it cannot possibly detect (in any + // reasonably performant fashion), then the condition above may result + // in a segmentation violation. + // + // Regardless, the result of passing a pointer to this heap allocator + // which it did not return is undefined behavior. + free_virtual_memory(superpage, superpage.huge_size) + heap_debug_cover(.Freed_Huge_Allocation) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(ptr) + assert_contextless(slab.bin_size > 0, "The heap allocator tried to free a pointer belonging to an empty slab.") + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + heap_free_wide_slab(superpage, slab) + heap_debug_cover(.Freed_Wide_Slab) + } else { + // Atomically let the owner know there's a free slab. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + old := intrinsics.atomic_or_explicit(&slab.remote_free[0], 1, .Seq_Cst) + heap_remote_cache_add_remote_free_superpage(superpage) + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old == 1 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Wide_Slab) + } + return + } + + // Find which sector and bin this pointer refers to. + bin_number := int(uintptr(ptr) - slab.data) / slab.bin_size + sector := bin_number / INTEGER_BITS + index := uint(bin_number) % INTEGER_BITS + + assert_contextless(bin_number < slab.max_bins, "Calculated an incorrect bin number for slab.") + assert_contextless(sector < slab.sectors, "Calculated an incorrect sector for slab.") + + // See if we own the slab or not, then free the pointer. + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + when HEAP_PANIC_ON_DOUBLE_FREE { + if slab.local_free[sector] & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + // Mark the bin as free. 
+ slab.local_free[sector] |= 1 << index + if slab.free_bins == 0 { + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + // Put this slab back in the available list. + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_debug_cover(.Freed_Bin_Reopened_Full_Slab) + } + slab.free_bins += 1 + assert_contextless(slab.free_bins <= slab.max_bins, "A slab of the heap allocator overflowed its free bins.") + // Free the slab if it's empty and was at one point in time completely + // full. This is a heuristic to prevent a single repeated new/free + // operation in an otherwise empty slab from bogging down the + // allocator. + if slab.free_bins == slab.max_bins && slab.dirty_bins == slab.max_bins { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_free_slab(superpage, slab) + heap_debug_cover(.Freed_Bin_Freed_Slab_Which_Was_Fully_Used) + } else { + slab.next_free_sector = min(slab.next_free_sector, sector) + heap_debug_cover(.Freed_Bin_Updated_Slab_Next_Free_Sector) + } + // Free the entire superpage if it's empty. + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + heap_debug_cover(.Freed_Bin_Freed_Superpage) + } + } else { + // Atomically let the owner know there's a free bin. + + // NOTE: The order of operations here is important. + // + // 1. We must first check if the slab is full. If we wait until later, + // we risk causing a race. + is_full := intrinsics.atomic_load_explicit(&slab.is_full, .Acquire) + + // 2. We have to let the owner know that we intend to schedule a remote + // free. This way, it can keep the cached entry around if it isn't + // able to merge all of them due to timing differences on our part. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + + // (Technically, the compiler or processor is allowed to re-order the + // above two operations, and this is okay. However, the next one acts + // as a full barrier due to its Sequential Consistency ordering.) + + // 3. Finally, we flip the bit of the bin we want freed. This must be + // the final operation across all cores, because the owner could + // already be in the process of merging remote frees, and if our bin + // was the last one, it has the authorization to wipe the slab and + // possibly reallocate in the same block of memory. + // + // By deferring this to the end, we avoid any possibility of a race, + // even if multiple threads are in this section. + old := intrinsics.atomic_or_explicit(&slab.remote_free[sector], 1 << index, .Seq_Cst) + + // 4. Now we can safely check if the slab is full and add it to the + // owner's cache if needed. + if is_full { + heap_remote_cache_add_remote_free_superpage(superpage) + heap_debug_cover(.Remotely_Freed_Bin_Caused_Remote_Superpage_Caching) + } + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Bin) + + // NOTE: It is possible for two threads to be here at the same time, + // thus violating any sense of order among the actual number of bins + // free and the reported number. + // + // T1 & T2 could flip bits, + // T2 could increment the free count, then + // T3 could merge the free bins. + // + // This scenario would result in an inconsistent count no matter which + // way the above procedure is carried out, hence its unreliability. + } +} + +/* +Resize memory returned by `heap_alloc`. 
+*/ +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + Size_Category :: enum { + Unknown, + Bin, + Slab, + Huge, + } + + // We need to first determine if we're crossing size categories. + old_category: Size_Category + switch { + case old_size <= HEAP_MAX_BIN_SIZE: old_category = .Bin + case old_size < HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Slab + case old_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Huge + case: unreachable() + } + new_category: Size_Category + switch { + case new_size <= HEAP_MAX_BIN_SIZE: new_category = .Bin + case new_size < HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Slab + case new_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Huge + case: unreachable() + } + assert_contextless(old_category != .Unknown) + assert_contextless(new_category != .Unknown) + + if new_category != old_category { + // A change in size category cannot be optimized. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Crossed_Size_Categories) + return + } + + // NOTE: Superpage owners are the only ones that can change the amount of + // memory that backs each pointer. Other threads have to allocate and copy. + + // Check if this is a huge allocation. + superpage := find_superpage_from_pointer(old_ptr) + if superpage.huge_size > 0 { + // This block follows the preamble in `heap_make_huge_allocation`. + new_real_size := new_size + if new_size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + new_real_size = SUPERPAGE_SIZE + heap_debug_cover(.Resize_Huge_Size_Set_To_Superpage) + } else { + new_real_size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Resize_Huge_Size_Adjusted) + } + resized_superpage := cast(^Heap_Superpage)resize_virtual_memory(superpage, superpage.huge_size, new_real_size, SUPERPAGE_SIZE) + assert_contextless(uintptr(resized_superpage) & (SUPERPAGE_SIZE-1) == 0, "After resizing a huge allocation, the pointer was no longer aligned to a superpage boundary.") + resized_superpage.huge_size = new_real_size + u := uintptr(resized_superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + new_ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(new_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Huge_Caused_Memory_Zeroing) + } + + heap_debug_cover(.Resize_Huge) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(old_ptr) + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + contiguous_old := heap_slabs_needed_for_size(slab.bin_size) + contiguous_new := heap_slabs_needed_for_size(new_size) + if contiguous_new == contiguous_old { + // We already have enough slabs to serve the request. + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Wide_Slab_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Wide_Slab_Kept_Old_Pointer) + return old_ptr + } + + if slab.index + contiguous_new >= HEAP_SLAB_COUNT { + // Expanding this slab would go beyond the Superpage. + // We need more memory. 
+ new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return + } + + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != get_current_thread_id() { + // We are not the owner of this data, therefore none of the special + // optimized paths for wide slabs are available to us, as they all + // involve touching the Superpage. + // + // We must re-allocate. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_From_Remote_Thread) + return + } + + // Can we shrink the wide slab, or can we expand it in-place? + if contiguous_new < contiguous_old { + previously_full_superpage := superpage.free_slabs == 0 + for i := slab.index + contiguous_new; i < slab.index + contiguous_old; i += 1 { + // Mark the latter slabs as unused. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = i + next_slab.is_dirty = true + next_slab.bin_size = 0 + } + superpage.next_free_slab_index = min(superpage.next_free_slab_index, slab.index + contiguous_new) + superpage.free_slabs += contiguous_old - contiguous_new + if previously_full_superpage { + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab) + } + heap_debug_cover(.Resize_Wide_Slab_Shrunk_In_Place) + } else { + // NOTE: We've already guarded against going beyond `HEAP_SLAB_COUNT` in the section above. + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + if heap_superpage_index_slab(superpage, i).bin_size != 0 { + // Contiguous space is unavailable. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion) + return + } + } + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + // Wipe the index bits, and if needed, the rest of the data. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + superpage.free_slabs += contiguous_old - contiguous_new + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } else { + heap_update_next_free_slab_index(superpage, 0) + } + heap_debug_cover(.Resize_Wide_Slab_Expanded_In_Place) + } + + // The slab-wide allocation has been resized in-place. + slab.bin_size = new_size + + return old_ptr + } + + // See if a bin rank change is needed. + new_rounded_size := heap_round_to_bin_size(new_size) + if slab.bin_size == new_rounded_size { + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + // It could be argued that a full memory fence is necessary here, + // because one thread may resize an address known to other threads, + // but as is the case with zeroing during allocation, we treat this + // as if the state change is not independent of the allocator. + // + // That is to say, if one thread resizes an address in-place, it's + // expected that other threads will need to be notified of this by + // the program, as with any other synchronization. 
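
A usage sketch of the resize paths above, separate from the patch: it calls the runtime procedures directly, which ordinary code would instead reach through the default allocator, and the exact bin boundaries depend on constants not shown in this diff, but any two sizes that round to the same power-of-two bin should take the in-place path.

package main

import "base:runtime"
import "core:fmt"

main :: proc() {
	// 24 and 30 bytes round to the same power-of-two bin, so the resize
	// should take the in-place path above and hand back the same pointer.
	p := runtime.heap_alloc(24)
	q := runtime.heap_resize(p, 24, 30)
	fmt.println(p == q) // expected: true

	// Growing into a different bin rank (or past HEAP_MAX_BIN_SIZE or the
	// huge threshold) falls back to allocate, copy, and free.
	r := runtime.heap_resize(q, 30, 4096)
	runtime.heap_free(r)
}
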
+ heap_debug_cover(.Resize_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Kept_Old_Pointer) + return old_ptr + } + + // Allocate and copy, as a last resort. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return +} diff --git a/base/runtime/heap_allocator_info.odin b/base/runtime/heap_allocator_info.odin new file mode 100644 index 00000000000..9249845b486 --- /dev/null +++ b/base/runtime/heap_allocator_info.odin @@ -0,0 +1,159 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +Heap_Info provides metrics on a single thread's heap memory usage. +*/ +Heap_Info :: struct { + total_memory_allocated_from_system: int `fmt:"M"`, + total_memory_used_for_book_keeping: int `fmt:"M"`, + total_memory_in_use: int `fmt:"M"`, + total_memory_free: int `fmt:"M"`, + total_memory_dirty: int `fmt:"M"`, + total_memory_remotely_free: int `fmt:"M"`, + + total_superpages: int, + total_superpages_dedicated_to_heap_cache: int, + total_huge_allocations: int, + total_slabs: int, + total_slabs_in_use: int, + + total_dirty_bins: int, + total_free_bins: int, + total_bins_in_use: int, + total_remote_free_bins: int, + + heap_slab_map_entries: int, + heap_superpages_with_free_slabs: int, + heap_slabs_with_remote_frees: int, +} + +/* +Get information about the current thread's heap. + +This will do additional sanity checking on the heap if assertions are enabled. +*/ +@(require_results) +get_local_heap_info :: proc "contextless" () -> (info: Heap_Info) { + if local_heap_cache != nil { + cache := local_heap_cache + slab_map_terminated: [HEAP_BIN_RANKS]bool + + for { + for rank := 0; rank < HEAP_BIN_RANKS; rank += 1 { + for i := 0; i < HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + slab := cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE + i] + if slab_map_terminated[rank] { + assert_contextless(slab == nil, "The heap allocator has a gap in its slab map.") + } else if slab == nil { + slab_map_terminated[rank] = true + } else { + info.heap_slab_map_entries += 1 + assert_contextless(slab.bin_size != 0, "The heap allocator has an empty slab in its slab map.") + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank)), "The heap allocator has a slab in the wrong sub-array of the slab map.") + } + } + } + for superpage in cache.superpages_with_free_slabs { + if superpage != nil { + info.heap_superpages_with_free_slabs += 1 + } + } + for i in 0.. 
0 { + info.total_huge_allocations += 1 + info.total_memory_allocated_from_system += superpage.huge_size + info.total_memory_in_use += superpage.huge_size - HEAP_HUGE_ALLOCATION_BOOK_KEEPING + info.total_memory_used_for_book_keeping += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + } else { + if superpage.cache_block.in_use { + info.total_superpages_dedicated_to_heap_cache += 1 + } + info.total_memory_allocated_from_system += SUPERPAGE_SIZE + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size != 0 { + info.total_slabs_in_use += 1 + info.total_memory_in_use += slab.bin_size * (slab.max_bins - slab.free_bins) + info.total_memory_free += slab.bin_size * slab.free_bins + info.total_memory_dirty += slab.bin_size * slab.dirty_bins + info.total_bins_in_use += slab.max_bins - slab.free_bins + info.total_free_bins += slab.free_bins + info.total_dirty_bins += slab.dirty_bins + assert_contextless(slab.dirty_bins >= slab.max_bins - slab.free_bins, "A slab of the heap allocator has a number of dirty bins which is not equivalent to the number of its total bins minus the number of free bins.") + // Account for the bitmaps used by the Slab. + info.total_memory_used_for_book_keeping += int(slab.data - uintptr(slab)) + // Account for the space not used by the bins or the bitmaps. + n := int(slab.data - uintptr(slab) + uintptr(slab.max_bins * slab.bin_size)) + if slab.bin_size > HEAP_MAX_BIN_SIZE { + info.total_memory_used_for_book_keeping += heap_slabs_needed_for_size(slab.bin_size) * HEAP_SLAB_SIZE - n + } else { + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE - n + } + remote_free_bins := 0 + for j in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + // Every superpage has to sacrifice one Slab's worth of space so + // that they're all aligned. + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE + info.total_slabs += HEAP_SLAB_COUNT + } + superpage = superpage.next + } + + assert_contextless(info.total_memory_allocated_from_system == info.total_memory_used_for_book_keeping + info.total_memory_in_use + info.total_memory_free, "The heap allocator's metrics for total memory in use, free, and used for book-keeping do not add up to the total memory allocated from the operating system.") + + if local_heap_cache != nil { + assert_contextless(info.total_superpages == local_heap_cache.owned_superpages) + } + return +} diff --git a/base/runtime/thread_management.odin b/base/runtime/threading.odin similarity index 68% rename from base/runtime/thread_management.odin rename to base/runtime/threading.odin index cabd4691cee..778d7e90061 100644 --- a/base/runtime/thread_management.odin +++ b/base/runtime/threading.odin @@ -5,6 +5,20 @@ Thread_Local_Cleaner :: #type proc "odin" () @(private="file") thread_local_cleaners: [8]Thread_Local_Cleaner +// The thread ID is cached here to save time from making syscalls with every +// call. +@(thread_local) local_thread_id: int + +// Return the current thread ID. +@(require_results) +get_current_thread_id :: proc "contextless" () -> int { + if local_thread_id == 0 { + local_thread_id = _get_current_thread_id() + assert_contextless(local_thread_id != 0, "_get_current_thread_id returned 0.") + } + return local_thread_id +} + // Add a procedure that will be run at the end of a thread for the purpose of // deallocating state marked as `thread_local`. 
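
A short usage sketch for the heap_allocator_info.odin additions above, separate from the patch and assuming these procedures stay exported from base:runtime as written: get_local_heap_info only describes the calling thread's heap, so superpages owned by other threads do not show up in the totals.

package main

import "base:runtime"
import "core:fmt"

main :: proc() {
	// Touch the heap so there is something to report.
	p := runtime.heap_alloc(1024)
	defer runtime.heap_free(p)

	// Only the calling thread's heap is inspected.
	info := runtime.get_local_heap_info()
	fmt.println("superpages:        ", info.total_superpages)
	fmt.println("bins in use:       ", info.total_bins_in_use)
	fmt.println("memory in use (B): ", info.total_memory_in_use)
}
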
// diff --git a/base/runtime/threading_darwin.odin b/base/runtime/threading_darwin.odin new file mode 100644 index 00000000000..8dd76f15528 --- /dev/null +++ b/base/runtime/threading_darwin.odin @@ -0,0 +1,17 @@ +#+private +package runtime + +foreign import lib "system:System.framework" + +foreign lib { + pthread_threadid_np :: proc "c" (rawptr, ^u64) -> i32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + tid: u64 + result := pthread_threadid_np(nil, &tid) + if result != 0 { + panic_contextless("Failed to get current thread ID.") + } + return int(tid) +} diff --git a/base/runtime/threading_freebsd.odin b/base/runtime/threading_freebsd.odin new file mode 100644 index 00000000000..053e636401d --- /dev/null +++ b/base/runtime/threading_freebsd.odin @@ -0,0 +1,19 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_thr_self :: uintptr(432) + +_get_current_thread_id :: proc "contextless" () -> int { + when size_of(rawptr) == 4 { + id: i32 + } else { + id: i64 + } + _, ok := intrinsics.syscall_bsd(SYS_thr_self, uintptr(&id)) + if !ok { + panic_contextless("Failed to get current thread ID.") + } + return int(id) +} diff --git a/base/runtime/threading_linux.odin b/base/runtime/threading_linux.odin new file mode 100644 index 00000000000..3fc7aa993ae --- /dev/null +++ b/base/runtime/threading_linux.odin @@ -0,0 +1,22 @@ +#+private +package runtime + +import "base:intrinsics" + +when ODIN_ARCH == .amd64 { + SYS_gettid :: uintptr(186) +} else when ODIN_ARCH == .arm32 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .arm64 { + SYS_gettid :: uintptr(178) +} else when ODIN_ARCH == .i386 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .riscv64 { + SYS_gettid :: uintptr(178) +} else { + #panic("Syscall numbers related to threading are missing for this Linux architecture.") +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(intrinsics.syscall(SYS_gettid)) +} diff --git a/base/runtime/threading_netbsd.odin b/base/runtime/threading_netbsd.odin new file mode 100644 index 00000000000..0ba6fd0fd3e --- /dev/null +++ b/base/runtime/threading_netbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS__lwp_self :: uintptr(311) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS__lwp_self) + return int(result) +} diff --git a/base/runtime/threading_openbsd.odin b/base/runtime/threading_openbsd.odin new file mode 100644 index 00000000000..8bfb3de9b24 --- /dev/null +++ b/base/runtime/threading_openbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_getthrid :: uintptr(299) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS_getthrid) + return int(result) +} diff --git a/base/runtime/threading_other.odin b/base/runtime/threading_other.odin new file mode 100644 index 00000000000..4bad1a24240 --- /dev/null +++ b/base/runtime/threading_other.odin @@ -0,0 +1,12 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +_get_current_thread_id :: proc "contextless" () -> int { + unimplemented_contextless("This platform does not support multithreading.") +} diff --git a/base/runtime/threading_windows.odin b/base/runtime/threading_windows.odin new file mode 100644 index 00000000000..9671350614d --- /dev/null +++ b/base/runtime/threading_windows.odin @@ -0,0 +1,13 @@ +#+private +package runtime + +foreign 
import Kernel32 "system:Kernel32.lib" + +@(default_calling_convention="system") +foreign Kernel32 { + GetCurrentThreadId :: proc() -> u32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(GetCurrentThreadId()) +} diff --git a/base/runtime/virtual_memory.odin b/base/runtime/virtual_memory.odin new file mode 100644 index 00000000000..e8f6763506a --- /dev/null +++ b/base/runtime/virtual_memory.odin @@ -0,0 +1,95 @@ +package runtime + +ODIN_VIRTUAL_MEMORY_SUPPORTED :: VIRTUAL_MEMORY_SUPPORTED + +// Virtually all MMUs supported by Odin should have a 4KiB page size. +PAGE_SIZE :: 4 * Kilobyte + +when ODIN_ARCH == .arm32 { + SUPERPAGE_SIZE :: 1 * Megabyte +} else { + // All other architectures should have support for 2MiB pages. + // i386 supports it in PAE mode. + // amd64, arm64, and riscv64 support it by default. + SUPERPAGE_SIZE :: 2 * Megabyte +} + +#assert(SUPERPAGE_SIZE & (SUPERPAGE_SIZE-1) == 0, "SUPERPAGE_SIZE must be a power of two.") + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to point to data that is at least `size` +bytes large but may be larger, due to rounding `size` to the page size of the +system. +*/ +@(require_results) +allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + return _allocate_virtual_memory(size) +} + +/* +Allocate a superpage of virtual memory from the operating system. + +This is a contiguous block of memory larger than what is normally distributed +by the operating system, sometimes with special performance properties related +to the Translation Lookaside Buffer. + +The address will be a multiple of the `SUPERPAGE_SIZE` constant, and the memory +pointed to will be at least as long as that very same constant. + +The name derives from the superpage concept on the *BSD operating systems, +where it is known as huge pages on Linux and large pages on Windows. +*/ +@(require_results) +allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + return _allocate_virtual_memory_superpage() +} + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to be a multiple of `alignment` and point to +data that is at least `size` bytes large but may be larger, due to rounding +`size` to the page size of the system. + +`alignment` must be a power of two. +*/ +@(require_results) +allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + assert_contextless(is_power_of_two(alignment)) + return _allocate_virtual_memory_aligned(size, alignment) +} + +/* +Free virtual memory allocated by any of the `allocate_*` procs. +*/ +free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + _free_virtual_memory(ptr, size) +} + +/* +Resize virtual memory allocated by `allocate_virtual_memory`. + +**Caveats:** + +- `new_size` must not be zero. +- If `old_size` and `new_size` are the same, nothing happens. +- The resulting behavior is undefined if `old_size` is incorrect. +- If the address is changed, `alignment` will be ensured. +- `alignment` should be the same value used when the memory was allocated. +- Resizing memory returned by `allocate_virtual_memory_superpage` is not + well-defined. The memory may be resized, but it may no longer be backed by a + superpage. +*/ +@(require_results) +resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int = 0) -> rawptr { + // * This is due to a restriction of mremap on Linux. 
+ assert_contextless(new_size != 0, "Cannot resize virtual memory address to zero.") + // * The statement about undefined behavior of incorrect `old_size` is due to + // how VirtualFree works on Windows. + if old_size == new_size { + return ptr + } + return _resize_virtual_memory(ptr, old_size, new_size, alignment) +} diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin new file mode 100644 index 00000000000..77b46dc826a --- /dev/null +++ b/base/runtime/virtual_memory_darwin.odin @@ -0,0 +1,112 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +foreign import lib "system:System.framework" + +foreign lib { + mach_task_self_: u32 + mach_vm_allocate :: proc(target: u32, address: ^u64, size: u64, flags: i32) -> i32 --- + mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- + mach_vm_protect :: proc(target_task: u32, address: u64, size: u64, set_maximum: b32, new_protection: i32) -> i32 --- + mach_vm_map :: proc( + target_task: u32, + address: ^u64, + size: u64, + mask: u64, + flags: i32, + object: i32, + offset: uintptr, + copy: b32, + cur_protection, + max_protection: i32, + inheritance: u32, + ) -> i32 --- + mach_vm_remap :: proc( + target_task: u32, + target_address: ^u64, + size: u64, + mask: u64, + flags: i32, + src_task: u32, + src_address: u64, + copy: b32, + cur_protection, + max_protection: ^i32, + inheritance: u32, + ) -> i32 --- +} + +// The following features are specific to Darwin only. +VM_FLAGS_ANYWHERE :: 0x0001 + +SUPERPAGE_SIZE_ANY :: 1 +SUPERPAGE_SIZE_2MB :: 2 + +VM_FLAGS_SUPERPAGE_SHIFT :: 16 +VM_FLAGS_SUPERPAGE_SIZE_ANY :: SUPERPAGE_SIZE_ANY << VM_FLAGS_SUPERPAGE_SHIFT +VM_FLAGS_SUPERPAGE_SIZE_2MB :: SUPERPAGE_SIZE_2MB << VM_FLAGS_SUPERPAGE_SHIFT + +MEMORY_OBJECT_NULL :: 0 +VM_PROT_READ :: 0x01 +VM_PROT_WRITE :: 0x02 +VM_INHERIT_COPY :: 1 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + address: u64 + result := mach_vm_map(mach_task_self_, &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + address: u64 + flags: i32 = VM_FLAGS_ANYWHERE + when ODIN_ARCH == .amd64 { + // NOTE(Feoramund): As far as we are aware, Darwin only supports + // explicit superpage allocation on AMD64 with a 2MiB parameter. + when SUPERPAGE_SIZE == 2 * Megabyte { + flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB + } else { + #panic("An unsupported superpage size has been configured for AMD64 Darwin; only 2MB is supported.") + } + } + alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. 
+ result := mach_vm_map(mach_task_self_, &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + assert_contextless(address % SUPERPAGE_SIZE == 0) + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + address: u64 + alignment_mask: u64 = u64(alignment) - 1 + result := mach_vm_map(mach_task_self_, &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_freebsd.odin b/base/runtime/virtual_memory_freebsd.odin new file mode 100644 index 00000000000..76acfeebdce --- /dev/null +++ b/base/runtime/virtual_memory_freebsd.odin @@ -0,0 +1,135 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(477) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to FreeBSD only. +/* + * Request specific alignment (n == log2 of the desired alignment). + * + * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does + * not enforce a specific alignment. + */ +// #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) +MAP_ALIGNMENT_SHIFT :: 24 +MAP_ALIGNED_SUPER :: 1 << MAP_ALIGNMENT_SHIFT + +SUPERPAGE_MAP_FLAGS :: (intrinsics.constant_log2(SUPERPAGE_SIZE) << MAP_ALIGNMENT_SHIFT) | MAP_ALIGNED_SUPER + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|SUPERPAGE_MAP_FLAGS, ~uintptr(0), 0) + if !ok { + // It may be the case that FreeBSD couldn't fulfill our alignment + // request, but it could still give us some memory. + return _allocate_virtual_memory_manually_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // This procedure uses the `MAP_ALIGNED` API provided by FreeBSD and falls + // back to manually aligned addresses, if that fails. 
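
The MAP_ALIGNED(n) encoding referenced above packs log2 of the requested alignment into the high flag bits. A tiny arithmetic check, separate from the patch and using the same count_trailing_zeros construction as the code below:

package main

import "base:intrinsics"
import "core:fmt"

MAP_ALIGNMENT_SHIFT :: 24

// MAP_ALIGNED(n) == n << MAP_ALIGNMENT_SHIFT, where n is log2 of the
// requested alignment.
map_aligned :: proc(alignment: uintptr) -> uintptr {
	return intrinsics.count_trailing_zeros(alignment) << MAP_ALIGNMENT_SHIFT
}

main :: proc() {
	two_mib := uintptr(2 * 1024 * 1024)
	// log2(2 MiB) == 21, so the flag value is 21 << 24.
	fmt.println(map_aligned(two_mib) == 21 << MAP_ALIGNMENT_SHIFT) // true
}
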
+ map_aligned_n: uintptr + if alignment >= PAGE_SIZE { + map_aligned_n = intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + } + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + _allocate_virtual_memory_manually_aligned(size, alignment) + } + return rawptr(result) +} + +_allocate_virtual_memory_manually_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // FreeBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
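
The manual-alignment fallback above is pure address arithmetic: over-allocate by `alignment`, round the returned base up to the next aligned address, then hand back the whole pages in front (and, when the base was already aligned, the unused tail). Below is a worked sketch of just the front-trimming arithmetic with a made-up base address and our own helper name, separate from the patch; no mapping is involved.

package main

import "core:fmt"

PAGE_SIZE :: 4096

// Given a page-aligned base from an over-sized mapping, compute the aligned
// address handed to the caller and how many leading bytes (whole pages only)
// can be returned to the system.
trim_for_alignment :: proc(base, alignment: uintptr) -> (aligned, unmap_front: uintptr) {
	modulo := base & (alignment - 1)
	if modulo == 0 {
		return base, 0
	}
	delta := alignment - modulo
	aligned = base + delta
	unmap_front = delta / PAGE_SIZE * PAGE_SIZE
	return
}

main :: proc() {
	// A 2 MiB alignment request serviced by a mapping that landed on an
	// arbitrary page boundary (the address is made up).
	base := uintptr(0x1234_7000)
	aligned, unmap_front := trim_for_alignment(base, 2 * 1024 * 1024)
	fmt.printf("aligned=%x unmap_front=%x\n", aligned, unmap_front)
	// aligned=12400000 unmap_front=b9000
}
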
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_linux.odin b/base/runtime/virtual_memory_linux.odin new file mode 100644 index 00000000000..94eea7a1373 --- /dev/null +++ b/base/runtime/virtual_memory_linux.odin @@ -0,0 +1,156 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +when ODIN_ARCH == .amd64 { + SYS_mmap :: uintptr(9) + SYS_munmap :: uintptr(11) + SYS_mremap :: uintptr(25) +} else when ODIN_ARCH == .arm32 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .arm64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else when ODIN_ARCH == .i386 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .riscv64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else { + #panic("Syscall numbers related to virtual memory are missing for this Linux architecture.") +} + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x02 +MAP_ANONYMOUS :: 0x20 + +MREMAP_MAYMOVE :: 0x01 + +ENOMEM :: ~uintptr(11) + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // This depends on Transparent HugePage Support being enabled. + result := intrinsics.syscall(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + if uintptr(result) % SUPERPAGE_SIZE != 0 { + // If THP support is not enabled, we may receive an address aligned to a + // page boundary instead, in which case, we must manually align a new + // address. + _free_virtual_memory(rawptr(result), SUPERPAGE_SIZE) + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) + } + // We must over-allocate then adjust the address. + mmap_result := intrinsics.syscall(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(mmap_result) < 0 { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. 
+ delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for mremap and munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), MREMAP_MAYMOVE) + if int(result) < 0 { + return nil + } + return rawptr(result) + } else { + // First, let's try to mremap without MREMAP_MAYMOVE. We might get + // lucky and the operating system could expand (or shrink, as the case + // may be) the pages in place, which means we don't have to allocate a + // whole new chunk of memory. + mremap_result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if mremap_result != ENOMEM { + // We got lucky. + return rawptr(mremap_result) + } + + // mremap failed to resize the memory in place, which means we must + // allocate an entirely new aligned chunk of memory, copy the old data, + // and free the old pointer before returning the new one. + // + // This is costly but unavoidable with the API available to us. + result := _allocate_virtual_memory_aligned(new_size, alignment) + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + _free_virtual_memory(ptr, old_size) + return result + } +} diff --git a/base/runtime/virtual_memory_netbsd.odin b/base/runtime/virtual_memory_netbsd.odin new file mode 100644 index 00000000000..46c475d9a6f --- /dev/null +++ b/base/runtime/virtual_memory_netbsd.odin @@ -0,0 +1,72 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(197) +SYS_mremap :: uintptr(411) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to NetBSD only. +/* + * Alignment (expressed in log2). Must be >= log2(PAGE_SIZE) and + * < # bits in a pointer (32 or 64). 
+ */ +// #define MAP_ALIGNED(n) ((int)((unsigned int)(n) << MAP_ALIGNMENT_SHIFT)) +MAP_ALIGNMENT_SHIFT :: 24 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if NetBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // NOTE: Unlike FreeBSD, the NetBSD man pages do not indicate that + // `MAP_ALIGNED` can cause this to fail, so we don't try a second time with + // manual alignment. + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if !ok { + return nil + } + return rawptr(result) + } else { + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), map_aligned_n) + if !ok { + return nil + } + return rawptr(result) + } +} diff --git a/base/runtime/virtual_memory_openbsd.odin b/base/runtime/virtual_memory_openbsd.odin new file mode 100644 index 00000000000..6a37d4ae670 --- /dev/null +++ b/base/runtime/virtual_memory_openbsd.odin @@ -0,0 +1,104 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(49) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if OpenBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. 
+ result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // OpenBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
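+	//
+	// This is costly but unavoidable with the API available to us.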
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_other.odin b/base/runtime/virtual_memory_other.odin new file mode 100644 index 00000000000..2c686051050 --- /dev/null +++ b/base/runtime/virtual_memory_other.odin @@ -0,0 +1,30 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +VIRTUAL_MEMORY_SUPPORTED :: false + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} diff --git a/base/runtime/virtual_memory_windows.odin b/base/runtime/virtual_memory_windows.odin new file mode 100644 index 00000000000..cd39a3170b0 --- /dev/null +++ b/base/runtime/virtual_memory_windows.odin @@ -0,0 +1,149 @@ +#+private +package runtime + +import "base:intrinsics" + +foreign import Kernel32 "system:Kernel32.lib" +foreign import OneCore "system:OneCore.lib" + +VIRTUAL_MEMORY_SUPPORTED :: true + +// NOTE: Windows makes a distinction between page size and allocation +// granularity. Addresses returned from the allocation procedures are rounded +// to allocation granularity boundaries, at a minimum, not page sizes. 
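To make the note above concrete: the two values can be read back through the `GetSystemInfo` binding declared just below, and on typical x86-64 Windows systems `dwPageSize` is 4096 while `dwAllocationGranularity` is 65536. A minimal sketch, not part of this patch; the helper name is hypothetical and it assumes the `SYSTEM_INFO`/`GetSystemInfo` declarations in this file:

```odin
// Illustrative helper only; relies on the SYSTEM_INFO/GetSystemInfo bindings below.
query_alloc_granularity :: proc "contextless" () -> (page_size, granularity: u32) {
	sys_info: SYSTEM_INFO
	GetSystemInfo(&sys_info)
	// Typically 4096 and 65536: a plain VirtualAlloc result is already aligned
	// to the 64 KiB granularity, so only larger alignments need VirtualAlloc2.
	return sys_info.dwPageSize, sys_info.dwAllocationGranularity
}
```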
+ +@(default_calling_convention="system") +foreign Kernel32 { + GetLastError :: proc() -> u32 --- + GetSystemInfo :: proc(lpSystemInfo: ^SYSTEM_INFO) --- +} + +@(default_calling_convention="system") +foreign OneCore { + VirtualAlloc :: proc(lpAddress: rawptr, dwSize: uint, flAllocationType: u32, flProtect: u32) -> rawptr --- + VirtualAlloc2 :: proc( + Process: rawptr, + BaseAddress: rawptr, + Size: uint, + AllocationType: u32, + PageProtection: u32, + ExtendedParameters: [^]MEM_EXTENDED_PARAMETER, + ParameterCount: u32) -> rawptr --- + VirtualFree :: proc(lpAddress: rawptr, dwSize: uint, dwFreeType: u32) -> b32 --- +} + +SYSTEM_INFO :: struct { + using DUMMYUNIONNAME: struct #raw_union { + dwOemId: u32, + using DUMMYSTRUCTNAME:struct { + wProcessorArchitecture: u16, + wReserved: u16, + }, + }, + dwPageSize: u32, + lpMinimumApplicationAddress: rawptr, + lpMaximumApplicationAddress: rawptr, + dwActiveProcessorMask: uint, + dwNumberOfProcessors: u32, + dwProcessorType: u32, + dwAllocationGranularity: u32, + wProcessorLevel: u16, + wProcessorRevision: u16, +} + +MemExtendedParameterAddressRequirements :: 0x01 +MEM_EXTENDED_PARAMETER_TYPE_BITS :: 8 + +MEM_ADDRESS_REQUIREMENTS :: struct { + LowestStartingAddress: rawptr, + HighestEndingAddress: rawptr, + Alignment: uint, +} + +MEM_EXTENDED_PARAMETER :: struct { + using DUMMYSTRUCTNAME: bit_field u64 { + Type: u64 | MEM_EXTENDED_PARAMETER_TYPE_BITS, + Reserved: u64 | 64 - MEM_EXTENDED_PARAMETER_TYPE_BITS, + }, + using DUMMYUNIONNAME: struct #raw_union { + ULong64: u64, + Pointer: rawptr, + Size: uint, + Handle: rawptr, + ULong: u32, + }, +} + + +MEM_COMMIT :: 0x00001000 +MEM_RESERVE :: 0x00002000 +MEM_RELEASE :: 0x00008000 + +PAGE_READWRITE :: 0x04 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + // `Size` must be a multiple of the page size. + rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc(nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE) + if result == nil { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // TODO: Windows has support for Large Pages, but its usage requires privilege escalation. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + sys_info: SYSTEM_INFO + GetSystemInfo(&sys_info) + if alignment <= int(sys_info.dwAllocationGranularity) { + // The alignment is less than or equal to the allocation granularity, + // which means it will automatically be aligned and any request for + // alignment less than the allocation granularity will result in + // ERROR_INVALID_PARAMETER. + return _allocate_virtual_memory(size) + } + addr_req := MEM_ADDRESS_REQUIREMENTS{ + LowestStartingAddress = nil, + HighestEndingAddress = nil, + Alignment = uint(alignment), + } + param := MEM_EXTENDED_PARAMETER{ + Type = MemExtendedParameterAddressRequirements, + Pointer = &addr_req, + } + // `Size` must be a multiple of the page size. 
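+	// For example, with a 4 KiB page, a 5000-byte request rounds up to 8192 bytes.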
+ rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc2(nil, nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE, ¶m, 1) + if result == nil { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + VirtualFree(ptr, 0, MEM_RELEASE) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // There is no system support for resizing addresses returned by VirtualAlloc. + // All we can do is request a new address, copy the data, and free the old. + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + VirtualFree(ptr, 0, MEM_RELEASE) + return result +} diff --git a/base/runtime/wasm_allocator.odin b/base/runtime/wasm_allocator.odin index 574f3dd06ef..c4799fcbe1c 100644 --- a/base/runtime/wasm_allocator.odin +++ b/base/runtime/wasm_allocator.odin @@ -1,3 +1,4 @@ +#+build !orca #+build wasm32, wasm64p32 package runtime @@ -869,3 +870,22 @@ aligned_realloc :: proc(a: ^WASM_Allocator, ptr: rawptr, alignment, size: uint, return newptr } + +heap_allocator :: default_wasm_allocator +heap_allocator_proc :: wasm_allocator_proc + +@(require_results) +heap_alloc :: proc(size: int, zero_memory: bool = true) -> (ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Alloc if zero_memory else .Alloc_Non_Zeroed, size, 0, nil, 0) + return raw_data(bytes) +} + +@(require_results) +heap_resize :: proc(old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Resize if zero_memory else .Resize_Non_Zeroed, new_size, 0, old_ptr, old_size) + return raw_data(bytes) +} + +heap_free :: proc(ptr: rawptr) { + wasm_allocator_proc(&global_default_wasm_allocator_data, .Free, 0, 0, ptr, 0) +} diff --git a/core/os/os2/env_linux.odin b/core/os/os2/env_linux.odin index e0ef0601058..132a5464e75 100644 --- a/core/os/os2/env_linux.odin +++ b/core/os/os2/env_linux.odin @@ -76,7 +76,7 @@ _set_env :: proc(key, v_new: string) -> bool { // wasn't in the environment in the first place. k_addr, v_addr := _kv_addr_from_val(v_curr, key) if len(v_new) > len(v_curr) { - k_addr = ([^]u8)(runtime.heap_resize(k_addr, kv_size)) + k_addr = ([^]u8)(runtime.heap_resize(k_addr, len(v_curr), kv_size)) if k_addr == nil { return false } diff --git a/tests/heap_allocator/libc/heap_allocator.odin b/tests/heap_allocator/libc/heap_allocator.odin new file mode 100644 index 00000000000..8710c59f0f3 --- /dev/null +++ b/tests/heap_allocator/libc/heap_allocator.odin @@ -0,0 +1,127 @@ +package tests_heap_allocator_libc + +import "base:intrinsics" +import "base:runtime" +import "core:mem" + +// This package contains the old libc malloc-based allocator, for comparison. 
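Because this package only exists as a baseline, the consuming test package can switch between it and the new default allocator by swapping `context.allocator`. A minimal sketch of that idea from the test package's point of view; `pick_allocator` and the boolean switch are illustrative and not the actual option handling in test_bench.odin:

```odin
package tests_heap_allocator

import "base:runtime"
import libc_allocator "libc"

// Illustrative only: the real harness parses command-line options; here the
// choice is reduced to a single boolean.
pick_allocator :: proc(use_libc_baseline: bool) -> runtime.Allocator {
	if use_libc_baseline {
		return libc_allocator.libc_allocator() // the old malloc-based allocator in this package
	}
	return runtime.heap_allocator() // the new default heap allocator (feoramalloc)
}

// Usage at the top of a benchmark procedure:
//     context.allocator = pick_allocator(use_libc_baseline)
```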
+ +Allocator :: runtime.Allocator +Allocator_Mode :: runtime.Allocator_Mode +Allocator_Mode_Set :: runtime.Allocator_Mode_Set +Allocator_Error :: runtime.Allocator_Error + +libc_allocator :: proc() -> Allocator { + return Allocator{ + procedure = libc_allocator_proc, + data = nil, + } +} + +libc_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + // + // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. + // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert + // padding. We also store the original pointer returned by heap_alloc right before + // the pointer we return to the user. + // + + aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { + // Not(flysand): We need to reserve enough space for alignment, which + // includes the user data itself, the space to store the pointer to + // allocation start, as well as the padding required to align both + // the user data and the pointer. + a := max(alignment, align_of(rawptr)) + space := a-1 + size_of(rawptr) + size + allocated_mem: rawptr + + force_copy := old_ptr != nil && alignment > align_of(rawptr) + + if old_ptr != nil && !force_copy { + original_old_ptr := ([^]rawptr)(old_ptr)[-1] + allocated_mem = heap_resize(original_old_ptr, space) + } else { + allocated_mem = heap_alloc(space, zero_memory) + } + aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) + + ptr := uintptr(aligned_mem) + aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) + if allocated_mem == nil { + aligned_free(old_ptr) + aligned_free(allocated_mem) + return nil, .Out_Of_Memory + } + + aligned_mem = rawptr(aligned_ptr) + ([^]rawptr)(aligned_mem)[-1] = allocated_mem + + if force_copy { + runtime.mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) + aligned_free(old_ptr) + } + + return mem.byte_slice(aligned_mem, size), nil + } + + aligned_free :: proc(p: rawptr) { + if p != nil { + heap_free(([^]rawptr)(p)[-1]) + } + } + + aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { + if p == nil { + return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + } + + new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return + + // NOTE: heap_resize does not zero the new memory, so we do it + if zero_memory && new_size > old_size { + new_region := raw_data(new_memory[old_size:]) + intrinsics.mem_zero(new_region, new_size - old_size) + } + return + } + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) + + case .Free: + aligned_free(old_memory) + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} + +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + return _heap_alloc(size, zero_memory) +} + +heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { + return 
_heap_resize(ptr, new_size) +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _heap_free(ptr) +} diff --git a/base/runtime/heap_allocator_orca.odin b/tests/heap_allocator/libc/heap_allocator_orca.odin similarity index 95% rename from base/runtime/heap_allocator_orca.odin rename to tests/heap_allocator/libc/heap_allocator_orca.odin index be032a20720..8fd13253db6 100644 --- a/base/runtime/heap_allocator_orca.odin +++ b/tests/heap_allocator/libc/heap_allocator_orca.odin @@ -1,6 +1,6 @@ #+build orca #+private -package runtime +package tests_heap_allocator_libc foreign { @(link_name="malloc") _orca_malloc :: proc "c" (size: int) -> rawptr --- diff --git a/base/runtime/heap_allocator_other.odin b/tests/heap_allocator/libc/heap_allocator_other.odin similarity index 94% rename from base/runtime/heap_allocator_other.odin rename to tests/heap_allocator/libc/heap_allocator_other.odin index 507dbf3181e..5118aa61221 100644 --- a/base/runtime/heap_allocator_other.odin +++ b/tests/heap_allocator/libc/heap_allocator_other.odin @@ -1,6 +1,6 @@ #+build js, wasi, freestanding, essence #+private -package runtime +package tests_heap_allocator_libc _heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { context = default_context() diff --git a/base/runtime/heap_allocator_unix.odin b/tests/heap_allocator/libc/heap_allocator_unix.odin similarity index 96% rename from base/runtime/heap_allocator_unix.odin rename to tests/heap_allocator/libc/heap_allocator_unix.odin index d4590d2dde8..49fb820bdc0 100644 --- a/base/runtime/heap_allocator_unix.odin +++ b/tests/heap_allocator/libc/heap_allocator_unix.odin @@ -1,6 +1,6 @@ #+build linux, darwin, freebsd, openbsd, netbsd, haiku #+private -package runtime +package tests_heap_allocator_libc when ODIN_OS == .Darwin { foreign import libc "system:System.framework" diff --git a/base/runtime/heap_allocator_windows.odin b/tests/heap_allocator/libc/heap_allocator_windows.odin similarity index 97% rename from base/runtime/heap_allocator_windows.odin rename to tests/heap_allocator/libc/heap_allocator_windows.odin index e07df755924..d5635d612d6 100644 --- a/base/runtime/heap_allocator_windows.odin +++ b/tests/heap_allocator/libc/heap_allocator_windows.odin @@ -1,4 +1,4 @@ -package runtime +package tests_heap_allocator_libc foreign import kernel32 "system:Kernel32.lib" diff --git a/tests/heap_allocator/test_bench.odin b/tests/heap_allocator/test_bench.odin new file mode 100644 index 00000000000..649c61b56e2 --- /dev/null +++ b/tests/heap_allocator/test_bench.odin @@ -0,0 +1,1493 @@ +// In order to test the heap allocator in a deterministic manner, we must run +// this program without the help of the test runner, because the runner itself +// makes use of heap allocation. +package tests_heap_allocator + +import "base:intrinsics" +import "base:runtime" +import "core:flags" +import "core:fmt" +import "core:log" +import "core:math/rand" +import "core:os" +import "core:sync" +import "core:thread" +import "core:time" +import "core:mem" + +import libc_allocator "libc" + +ODIN_DEBUG_HEAP :: runtime.ODIN_DEBUG_HEAP + +INTEGER_BITS :: 8 * size_of(int) +SECTOR_BITS :: 8 * size_of(uint) + +// The tests are specific to feoramalloc, but the benchmarks are general-purpose. 
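The utility procedures in the section below implement a deterministic fill-and-verify pattern: buffers are written from a seeded PRNG and later re-checked against the same seed, so corruption is detected without keeping a reference copy of the data. A minimal usage sketch, relying on the utilities and imports defined in this file; `check_roundtrip` itself is hypothetical:

```odin
// Uses the verify_zeroed/randomize_bytes/verify_integrity utilities defined
// below and this file's base:intrinsics import; check_roundtrip is illustrative only.
check_roundtrip :: proc(bytes: []byte) {
	seed := u64(intrinsics.read_cycle_counter())
	verify_zeroed(bytes)          // freshly .Alloc'd memory must come back zeroed
	randomize_bytes(bytes, seed)  // fill with a reproducible pseudo-random stream
	verify_integrity(bytes, seed) // regenerate the same stream and compare byte-by-byte
}
```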
+ +// +// Utility +// + +expect :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) { + if !condition { + @(cold) + internal :: proc "contextless" (message: string, loc: runtime.Source_Code_Location) { + runtime.print_string("\n* Expectation failed: ") + runtime.print_string(message) + runtime.print_string(" @ ") + runtime.print_caller_location(loc) + runtime.print_string("\n\n") + when ODIN_DEBUG { + intrinsics.debug_trap() + } else { + intrinsics.trap() + } + } + internal(message, loc) + } +} + +verify_zeroed_slice :: proc(bytes: []byte, loc := #caller_location) { + for b in bytes { + expect(b == 0, loc = loc) + } +} + +verify_zeroed_ptr :: proc(ptr: [^]byte, size: int, loc := #caller_location) { + for i := 0; i < size; i += 1 { + expect(ptr[i] == 0, loc = loc) + } +} + +verify_zeroed :: proc { + verify_zeroed_slice, + verify_zeroed_ptr, +} + +verify_integrity_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + buf: [1]byte + rand.reset(seed) + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + expect(bytes[i+j] == buf[j], loc = loc) + } + } +} + +verify_integrity_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + verify_integrity_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +verify_integrity :: proc { + verify_integrity_slice, + verify_integrity_ptr, +} + + +randomize_bytes_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + rand.reset(seed) + buf: [1]byte + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + bytes[i+j] = buf[j] + } + } +} + +randomize_bytes_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + randomize_bytes_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +randomize_bytes :: proc { + randomize_bytes_slice, + randomize_bytes_ptr, +} + +dump_slabs :: proc() { + intrinsics.atomic_thread_fence(.Seq_Cst) + superpage := runtime.local_heap + for { + if superpage == nil { + return + } + log.infof("+++ Superpage at %p", superpage) + log.infof(" - Free slabs: %i", superpage.free_slabs) + log.infof(" - Next free slab index: %i", superpage.next_free_slab_index) + for i := 0; i < runtime.HEAP_SLAB_COUNT; /**/ { + slab := runtime.heap_superpage_index_slab(superpage, i) + if slab.bin_size == 0 { + // log.infof("Slab %i unused.", slab.index) + expect(i == slab.index) + } else { + log.infof("%#v", slab) + if slab.free_bins == 0 { + expect(intrinsics.atomic_load(&slab.is_full)) + } + } + if slab.bin_size > runtime.HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += runtime.heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + log.info("") + superpage = superpage.next + } +} + +validate_cache :: proc() { + cache := runtime.local_heap_cache + slab_map_terminated: [runtime.HEAP_BIN_RANKS]bool + superpages_with_free_slabs_terminated: bool + for { + // Validate the slab map. + for rank in 0.. 
0) + } + } + + next_cache := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Acquire) + if next_cache == nil { + break + } + cache = next_cache + } +} + +// +// Allocation API Testing +// + +Size_Strategy :: enum { + Adding, + Multiplying, + Randomizing, +} + +Free_Strategy :: enum { + Never, + At_The_End, // free at end of allocs + Interleaved, // free X after Y allocs +} + +Free_Direction :: enum { + Forward, // like a queue + Backward, // like a stack + Randomly, +} + +test_alloc_write_free :: proc( + object_count: int, + + starting_size: int, + final_size: int, + + size_strategy: Size_Strategy, + size_operand: int, + + allocs_per_free_operation: int, + free_operations_at_once: int, + free_strategy: Free_Strategy, + free_direction: Free_Direction, +) { + Allocation :: struct { + data: []byte, + seed: u64, + } + pointers := make([]Allocation, object_count, context.temp_allocator) + + allocator := context.allocator + size := starting_size + start_index := 0 + end_index := 0 + + allocs := 0 + + log.infof("AWF: %i objects. Size: [%i..=%i] %v by %i each allocation. %i freed every %i, %v and %v.", object_count, starting_size, final_size, size_strategy, size_operand, free_operations_at_once, allocs_per_free_operation, free_strategy, free_direction) + + for o in 1..=u64(object_count) { + seed := u64(intrinsics.read_cycle_counter()) * o + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + expect(alloc_err == nil) + pointers[end_index] = Allocation{ + data = bytes, + seed = seed, + } + end_index += 1 + allocs += 1 + + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + if size < final_size { + switch size_strategy { + case .Adding: size += size_operand + case .Multiplying: size *= size_operand + case .Randomizing: size = starting_size + rand.int_max(final_size - starting_size) + } + if final_size > starting_size { + size = min(size, final_size) + } else { + size = max(size, final_size) + } + } + + if allocs % allocs_per_free_operation != 0 { + continue + } + + switch free_strategy { + case .Never, .At_The_End: + break + case .Interleaved: + for _ in 0.. 0 && size % (runtime.HEAP_MAX_BIN_SIZE/8) == 0 { + log.infof("... %i ...", size) + } + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + // Allocate and free twice to make sure that the memory is truly zeroed. + // + // This works on the assumption that the allocator will return the same + // pointer if we allocate, free, then allocate again with the same + // characteristics. + // + // libc malloc does not guarantee this behavior, but feoramalloc does + // in non-parallel scenarios. 
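+		//
+		// If a recycled bin were handed back without being zeroed again, the
+		// second pass's verify_zeroed below would observe the random bytes
+		// written during the first pass and trap.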
+ old_ptr: rawptr + for i in 0..<2 { + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + if i == 0 { + old_ptr = raw_data(bytes) + } else { + if old_ptr != raw_data(bytes) { + different_pointers += 1 + } + } + expect(alloc_err == nil) + verify_zeroed(bytes) + randomize_bytes(bytes, u64(intrinsics.read_cycle_counter())) + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(bytes), 0) + expect(free_err == nil) + } + } + if different_pointers > 0 { + log.warnf("There were %i cases in which the allocator didn't return the same pointer after allocating, freeing, then allocating again with the same size.", different_pointers) + } + log.info("Done.") +} + +test_single_alloc_and_resize :: proc(start, target: int) { + log.infof("Testing allocation of %i bytes, resizing to %i, then resizing back.", start, target) + allocator := context.allocator + base_seed := u64(intrinsics.read_cycle_counter()) + + alignment := min(start, runtime.HEAP_MAX_ALIGNMENT) + seed := base_seed * (1+u64(start)) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, start, alignment, nil, 0) + expect(alloc_err == nil) + expect(len(bytes) == start) + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + resized_bytes_1, resize_1_err := allocator.procedure(allocator.data, .Resize, target, alignment, raw_data(bytes), start) + expect(resize_1_err == nil) + expect(len(resized_bytes_1) == target) + verify_integrity(resized_bytes_1[:min(start, target)], seed) + if target > start { + verify_zeroed(resized_bytes_1[start:]) + } + + resized_bytes_2, resize_2_err := allocator.procedure(allocator.data, .Resize, start, alignment, raw_data(resized_bytes_1), target) + expect(resize_2_err == nil) + expect(len(resized_bytes_2) == start) + verify_integrity(resized_bytes_2[:min(start, target)], seed) + if start > target { + verify_zeroed(resized_bytes_2[target:]) + } + + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(resized_bytes_2), 0) + expect(free_err == nil) +} + +/* +This test helped find an issue with the orphanage. +*/ +test_parallel_pointer_passing :: proc(thread_count: int) { + Data :: struct { + thread: ^thread.Thread, + ptr: ^^int, + sema: sync.Sema, + friend: ^sync.Sema, + wg: ^sync.Wait_Group, + } + + task :: proc(t: ^thread.Thread) { + data := cast(^Data)t.data + sync.wait(&data.sema) + expect(data.ptr != nil) + expect(data.ptr^ != nil) + expect(data.ptr^^ != 0) + free(data.ptr^) + data.ptr^ = new(int) + expect(data.ptr^^ == 0) + data.ptr^^ = int(intrinsics.read_cycle_counter()) + if data.friend != nil { + sync.post(data.friend) + } + sync.wait_group_done(data.wg) + } + + data := new(int) + data^ = int(intrinsics.read_cycle_counter()) + tasks := make([]Data, thread_count, context.temp_allocator) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + for i in 0.. 0) + expect(slab.bin_size > 0) + expect(intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) == 0) + + list := make([]^int, slab.max_bins) + list[0] = o + + for i in 0..= intrinsics.atomic_load_explicit(data.all_pointers_len, .Acquire) { + // Spinlock. 
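+			// i.e. wait until the producer has published the pointer for this
+			// ticket; cpu_relax is a pause hint while busy-waiting.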
+ intrinsics.cpu_relax() + } + intrinsics.atomic_thread_fence(.Seq_Cst) + + ptr := data.all_pointers[ticket] + + expect(ptr != nil) + i_ptr := cast(^i64)ptr + expect(i_ptr^ == 0) + val := intrinsics.read_cycle_counter() + i_ptr^ = val + expect(i_ptr^ == val) + free(ptr) + } + + sync.wait_group_done(data.wg) + } + + start_time: time.Time + + barrier: sync.Barrier + sync.barrier_init(&barrier, thread_count) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + // non-atomic + all_pointers := make([]rawptr, thread_count*allocs_per_thread) + defer delete(all_pointers) + // atomic + all_pointers_len := 0 + all_pointers_ticket := 0 + + consumers := make([]Consumer_Data, thread_count) + defer delete(consumers) + + for i in 0.. Slab + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_SLAB_SIZE) + test_single_alloc_and_resize(runtime.HEAP_SLAB_SIZE, runtime.HEAP_MAX_BIN_SIZE) + + // Bin <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE) + + // Slab <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE + 1, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE + 1) + + // Inter-huge tests. + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 2, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1, runtime.SUPERPAGE_SIZE) + + // Larger-than-superpage tests. + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE, runtime.SUPERPAGE_SIZE * 3) + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE * 3, runtime.SUPERPAGE_SIZE) + + // Brute-force tests. 
+ test_individual_allocation_and_free(runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + test_continuous_allocation_of_size_n(16, runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + + test_alloc_write_free( + object_count = 400, + starting_size = 16, final_size = 16, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 4096, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 32768, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 8096, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .Interleaved, + free_direction = .Backward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 8096, final_size = 2, + size_strategy = .Adding, size_operand = -2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 65535, final_size = 65535, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 300000, + starting_size = 8, final_size = 8, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = runtime.HEAP_SLAB_COUNT*2, + starting_size = runtime.HEAP_SLAB_SIZE/2, final_size = runtime.HEAP_SLAB_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 2, + starting_size = runtime.SUPERPAGE_SIZE/2, final_size = runtime.SUPERPAGE_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + // This is a lengthy test and won't tell us much more than any other test will. 
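+		// It is therefore only run when opt.long is set.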
+ if opt.long { + test_single_alloc_and_resize_incremental(0, runtime.HEAP_SLAB_SIZE) + } + + runtime.compact_heap() + + test_serial_bin_sanity() + + runtime.compact_heap() + } + + if opt.serial_benchmarks { + log.info("--- Single-threaded benchmarks ---") + + log.info("* Freeing forwards ...") + bench_alloc_n_then_free_n(1_000_000, int) + bench_alloc_n_then_free_n(1_000_000, Struct_16) + bench_alloc_n_then_free_n(1_000_000, Struct_32) + bench_alloc_n_then_free_n(1_000_000, Struct_64) + bench_alloc_n_then_free_n(1_000_000, Struct_512) + bench_alloc_n_then_free_n(10_000, [8192]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing backwards ...") + bench_alloc_n_then_free_n_backwards(1_000_000, int) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_16) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_32) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_64) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_512) + bench_alloc_n_then_free_n_backwards(10_000, [8192]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n_backwards(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing randomly ...") + bench_alloc_n_then_free_n_randomly(1_000_000, int) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_16) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_32) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_64) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_512) + bench_alloc_n_then_free_n_randomly(10_000, [8192]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE-runtime.HEAP_SLAB_ALLOCATION_BOOK_KEEPING]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE*2]u8) + bench_alloc_n_then_free_n_randomly(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Allocating and freeing repeatedly ...") + bench_alloc_1_then_free_1_repeatedly(100_000, int) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_16) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_32) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_64) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_512) + bench_alloc_1_then_free_1_repeatedly(10_000, [8192]u8) + } + + if opt.parallel_benchmarks { + log.info("--- Multi-threaded benchmarks ---") + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_16) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_32) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_64) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_512) + + bench_1_producer_n_consumer_for_m_alloc(1, 1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(2, 
1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(4, 1_000, [8192]u8) + + bench_1_producer_n_consumer_for_m_alloc(4, 10, [runtime.HEAP_SLAB_SIZE]u8) + + when .Thread not_in ODIN_SANITIZER_FLAGS { + // NOTE: TSan doesn't work well with excessive thread counts, + // in my experience. + bench_1_producer_n_consumer_for_m_alloc(32, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(64, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(128, 1_000, Struct_32) + } + } + + } + + log.info("Tests complete.") + + if opt.compact { + runtime.compact_heap() + log.info("The main thread's heap has been compacted.") + } + + if opt.coverage { + when runtime.ODIN_DEBUG_HEAP { + log.info("--- Code coverage report ---") + for key in runtime.Heap_Code_Coverage_Type { + value := intrinsics.atomic_load_explicit(&runtime.heap_global_code_coverage[key], .Acquire) + log.infof("%s%v: %v", ">>> " if value == 0 else "", key, value) + } + + log.infof("Full code coverage: %v", runtime._check_heap_code_coverage()) + } else { + log.error("ODIN_DEBUG_HEAP is not enabled, thus coverage cannot be calculated.") + } + } + + if opt.info { + heap_info := runtime.get_local_heap_info() + log.infof("%#v", heap_info) + } + + // if .Dump_Slabs in params { + // dump_slabs() + // } + + if opt.trap { + intrinsics.debug_trap() + } +}