From 8504094ea8bc6ceb60e1ea0097d6a97dbc4f527d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:00:12 -0500 Subject: [PATCH 01/21] Add `get_current_thread_id` to `base` --- ...{thread_management.odin => threading.odin} | 14 ++++++++++++ base/runtime/threading_darwin.odin | 17 ++++++++++++++ base/runtime/threading_freebsd.odin | 19 ++++++++++++++++ base/runtime/threading_linux.odin | 22 +++++++++++++++++++ base/runtime/threading_netbsd.odin | 11 ++++++++++ base/runtime/threading_openbsd.odin | 11 ++++++++++ base/runtime/threading_other.odin | 12 ++++++++++ base/runtime/threading_windows.odin | 13 +++++++++++ 8 files changed, 119 insertions(+) rename base/runtime/{thread_management.odin => threading.odin} (68%) create mode 100644 base/runtime/threading_darwin.odin create mode 100644 base/runtime/threading_freebsd.odin create mode 100644 base/runtime/threading_linux.odin create mode 100644 base/runtime/threading_netbsd.odin create mode 100644 base/runtime/threading_openbsd.odin create mode 100644 base/runtime/threading_other.odin create mode 100644 base/runtime/threading_windows.odin diff --git a/base/runtime/thread_management.odin b/base/runtime/threading.odin similarity index 68% rename from base/runtime/thread_management.odin rename to base/runtime/threading.odin index cabd4691cee..778d7e90061 100644 --- a/base/runtime/thread_management.odin +++ b/base/runtime/threading.odin @@ -5,6 +5,20 @@ Thread_Local_Cleaner :: #type proc "odin" () @(private="file") thread_local_cleaners: [8]Thread_Local_Cleaner +// The thread ID is cached here to save time from making syscalls with every +// call. +@(thread_local) local_thread_id: int + +// Return the current thread ID. +@(require_results) +get_current_thread_id :: proc "contextless" () -> int { + if local_thread_id == 0 { + local_thread_id = _get_current_thread_id() + assert_contextless(local_thread_id != 0, "_get_current_thread_id returned 0.") + } + return local_thread_id +} + // Add a procedure that will be run at the end of a thread for the purpose of // deallocating state marked as `thread_local`. 
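A minimal usage sketch, not part of this patch, assuming `get_current_thread_id` is wired up exactly as above; the first call on a thread performs the platform-specific lookup and caches it, so later calls on the same thread return the cached value without another syscall:

	example_thread_id_caching :: proc "contextless" () {
		tid := get_current_thread_id()                      // first call fills `local_thread_id`
		assert_contextless(tid != 0)
		assert_contextless(tid == get_current_thread_id())  // served from the thread-local cache
	}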
// diff --git a/base/runtime/threading_darwin.odin b/base/runtime/threading_darwin.odin new file mode 100644 index 00000000000..8dd76f15528 --- /dev/null +++ b/base/runtime/threading_darwin.odin @@ -0,0 +1,17 @@ +#+private +package runtime + +foreign import lib "system:System.framework" + +foreign lib { + pthread_threadid_np :: proc "c" (rawptr, ^u64) -> i32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + tid: u64 + result := pthread_threadid_np(nil, &tid) + if result != 0 { + panic_contextless("Failed to get current thread ID.") + } + return int(tid) +} diff --git a/base/runtime/threading_freebsd.odin b/base/runtime/threading_freebsd.odin new file mode 100644 index 00000000000..053e636401d --- /dev/null +++ b/base/runtime/threading_freebsd.odin @@ -0,0 +1,19 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_thr_self :: uintptr(432) + +_get_current_thread_id :: proc "contextless" () -> int { + when size_of(rawptr) == 4 { + id: i32 + } else { + id: i64 + } + _, ok := intrinsics.syscall_bsd(SYS_thr_self, uintptr(&id)) + if !ok { + panic_contextless("Failed to get current thread ID.") + } + return int(id) +} diff --git a/base/runtime/threading_linux.odin b/base/runtime/threading_linux.odin new file mode 100644 index 00000000000..3fc7aa993ae --- /dev/null +++ b/base/runtime/threading_linux.odin @@ -0,0 +1,22 @@ +#+private +package runtime + +import "base:intrinsics" + +when ODIN_ARCH == .amd64 { + SYS_gettid :: uintptr(186) +} else when ODIN_ARCH == .arm32 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .arm64 { + SYS_gettid :: uintptr(178) +} else when ODIN_ARCH == .i386 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .riscv64 { + SYS_gettid :: uintptr(178) +} else { + #panic("Syscall numbers related to threading are missing for this Linux architecture.") +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(intrinsics.syscall(SYS_gettid)) +} diff --git a/base/runtime/threading_netbsd.odin b/base/runtime/threading_netbsd.odin new file mode 100644 index 00000000000..0ba6fd0fd3e --- /dev/null +++ b/base/runtime/threading_netbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS__lwp_self :: uintptr(311) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS__lwp_self) + return int(result) +} diff --git a/base/runtime/threading_openbsd.odin b/base/runtime/threading_openbsd.odin new file mode 100644 index 00000000000..8bfb3de9b24 --- /dev/null +++ b/base/runtime/threading_openbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_getthrid :: uintptr(299) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS_getthrid) + return int(result) +} diff --git a/base/runtime/threading_other.odin b/base/runtime/threading_other.odin new file mode 100644 index 00000000000..4bad1a24240 --- /dev/null +++ b/base/runtime/threading_other.odin @@ -0,0 +1,12 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +_get_current_thread_id :: proc "contextless" () -> int { + unimplemented_contextless("This platform does not support multithreading.") +} diff --git a/base/runtime/threading_windows.odin b/base/runtime/threading_windows.odin new file mode 100644 index 00000000000..9671350614d --- /dev/null +++ b/base/runtime/threading_windows.odin @@ -0,0 +1,13 @@ +#+private +package runtime + +foreign 
import Kernel32 "system:Kernel32.lib" + +@(default_calling_convention="system") +foreign Kernel32 { + GetCurrentThreadId :: proc() -> u32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(GetCurrentThreadId()) +} From 3dc9b10f869c3ed859c373d3a2dfdfc08917c9ce Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:30:55 -0500 Subject: [PATCH 02/21] Add virtual memory procedures to `base` --- base/runtime/virtual_memory.odin | 95 ++++++++++++++ base/runtime/virtual_memory_darwin.odin | 94 ++++++++++++++ base/runtime/virtual_memory_freebsd.odin | 135 ++++++++++++++++++++ base/runtime/virtual_memory_linux.odin | 156 +++++++++++++++++++++++ base/runtime/virtual_memory_netbsd.odin | 72 +++++++++++ base/runtime/virtual_memory_openbsd.odin | 104 +++++++++++++++ base/runtime/virtual_memory_other.odin | 30 +++++ base/runtime/virtual_memory_windows.odin | 149 ++++++++++++++++++++++ 8 files changed, 835 insertions(+) create mode 100644 base/runtime/virtual_memory.odin create mode 100644 base/runtime/virtual_memory_darwin.odin create mode 100644 base/runtime/virtual_memory_freebsd.odin create mode 100644 base/runtime/virtual_memory_linux.odin create mode 100644 base/runtime/virtual_memory_netbsd.odin create mode 100644 base/runtime/virtual_memory_openbsd.odin create mode 100644 base/runtime/virtual_memory_other.odin create mode 100644 base/runtime/virtual_memory_windows.odin diff --git a/base/runtime/virtual_memory.odin b/base/runtime/virtual_memory.odin new file mode 100644 index 00000000000..e8f6763506a --- /dev/null +++ b/base/runtime/virtual_memory.odin @@ -0,0 +1,95 @@ +package runtime + +ODIN_VIRTUAL_MEMORY_SUPPORTED :: VIRTUAL_MEMORY_SUPPORTED + +// Virtually all MMUs supported by Odin should have a 4KiB page size. +PAGE_SIZE :: 4 * Kilobyte + +when ODIN_ARCH == .arm32 { + SUPERPAGE_SIZE :: 1 * Megabyte +} else { + // All other architectures should have support for 2MiB pages. + // i386 supports it in PAE mode. + // amd64, arm64, and riscv64 support it by default. + SUPERPAGE_SIZE :: 2 * Megabyte +} + +#assert(SUPERPAGE_SIZE & (SUPERPAGE_SIZE-1) == 0, "SUPERPAGE_SIZE must be a power of two.") + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to point to data that is at least `size` +bytes large but may be larger, due to rounding `size` to the page size of the +system. +*/ +@(require_results) +allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + return _allocate_virtual_memory(size) +} + +/* +Allocate a superpage of virtual memory from the operating system. + +This is a contiguous block of memory larger than what is normally distributed +by the operating system, sometimes with special performance properties related +to the Translation Lookaside Buffer. + +The address will be a multiple of the `SUPERPAGE_SIZE` constant, and the memory +pointed to will be at least as long as that very same constant. + +The name derives from the superpage concept on the *BSD operating systems, +where it is known as huge pages on Linux and large pages on Windows. +*/ +@(require_results) +allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + return _allocate_virtual_memory_superpage() +} + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to be a multiple of `alignment` and point to +data that is at least `size` bytes large but may be larger, due to rounding +`size` to the page size of the system. 
+ +`alignment` must be a power of two. +*/ +@(require_results) +allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + assert_contextless(is_power_of_two(alignment)) + return _allocate_virtual_memory_aligned(size, alignment) +} + +/* +Free virtual memory allocated by any of the `allocate_*` procs. +*/ +free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + _free_virtual_memory(ptr, size) +} + +/* +Resize virtual memory allocated by `allocate_virtual_memory`. + +**Caveats:** + +- `new_size` must not be zero. +- If `old_size` and `new_size` are the same, nothing happens. +- The resulting behavior is undefined if `old_size` is incorrect. +- If the address is changed, `alignment` will be ensured. +- `alignment` should be the same value used when the memory was allocated. +- Resizing memory returned by `allocate_virtual_memory_superpage` is not + well-defined. The memory may be resized, but it may no longer be backed by a + superpage. +*/ +@(require_results) +resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int = 0) -> rawptr { + // * This is due to a restriction of mremap on Linux. + assert_contextless(new_size != 0, "Cannot resize virtual memory address to zero.") + // * The statement about undefined behavior of incorrect `old_size` is due to + // how VirtualFree works on Windows. + if old_size == new_size { + return ptr + } + return _resize_virtual_memory(ptr, old_size, new_size, alignment) +} diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin new file mode 100644 index 00000000000..a3d3c61adef --- /dev/null +++ b/base/runtime/virtual_memory_darwin.odin @@ -0,0 +1,94 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +foreign import lib "system:System.framework" + +foreign lib { + mach_task_self :: proc() -> u32 --- + mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- + mach_vm_map :: proc( + target_task: u32, + address: ^u64, + size: u64, + mask: u64, + flags: i32, + object: i32, + offset: uintptr, + copy: b32, + cur_protection, + max_protection: i32, + inheritance: u32, + ) -> i32 --- +} + +// The following features are specific to Darwin only. +VM_FLAGS_ANYWHERE :: 0x0001 + +SUPERPAGE_SIZE_ANY :: 1 +SUPERPAGE_SIZE_2MB :: 2 + +VM_FLAGS_SUPERPAGE_SHIFT :: 16 +VM_FLAGS_SUPERPAGE_SIZE_ANY :: SUPERPAGE_SIZE_ANY << VM_FLAGS_SUPERPAGE_SHIFT +VM_FLAGS_SUPERPAGE_SIZE_2MB :: SUPERPAGE_SIZE_2MB << VM_FLAGS_SUPERPAGE_SHIFT + +MEMORY_OBJECT_NULL :: 0 +VM_PROT_READ :: 0x01 +VM_PROT_WRITE :: 0x02 +VM_INHERIT_SHARE :: 0 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + address: u64 + result := mach_vm_map(mach_task_self(), &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + address: u64 + flags: i32 = VM_FLAGS_ANYWHERE + when SUPERPAGE_SIZE == 2 * Megabyte { + flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB + } else { + flags |= VM_FLAGS_SUPERPAGE_SIZE_ANY + } + alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. 
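	// A worked illustration, assuming the default 2MiB SUPERPAGE_SIZE: the
	// mask is 0x1F_FFFF, which asks Mach for an address whose low 21 bits are
	// clear, i.e. a 2MiB-aligned address; the assertion below re-checks this.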
+ result := mach_vm_map(mach_task_self(), &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + if result != 0 { + return nil + } + assert_contextless(address % SUPERPAGE_SIZE == 0) + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + address: u64 + alignment_mask: u64 = u64(alignment) - 1 + result := mach_vm_map(mach_task_self(), &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // NOTE(Feoramund): mach_vm_remap does not permit resizing, as far as I understand it. + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_freebsd.odin b/base/runtime/virtual_memory_freebsd.odin new file mode 100644 index 00000000000..76acfeebdce --- /dev/null +++ b/base/runtime/virtual_memory_freebsd.odin @@ -0,0 +1,135 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(477) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to FreeBSD only. +/* + * Request specific alignment (n == log2 of the desired alignment). + * + * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does + * not enforce a specific alignment. + */ +// #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) +MAP_ALIGNMENT_SHIFT :: 24 +MAP_ALIGNED_SUPER :: 1 << MAP_ALIGNMENT_SHIFT + +SUPERPAGE_MAP_FLAGS :: (intrinsics.constant_log2(SUPERPAGE_SIZE) << MAP_ALIGNMENT_SHIFT) | MAP_ALIGNED_SUPER + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|SUPERPAGE_MAP_FLAGS, ~uintptr(0), 0) + if !ok { + // It may be the case that FreeBSD couldn't fulfill our alignment + // request, but it could still give us some memory. + return _allocate_virtual_memory_manually_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // This procedure uses the `MAP_ALIGNED` API provided by FreeBSD and falls + // back to manually aligned addresses, if that fails. 
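	// A worked illustration of the MAP_ALIGNED(n) encoding, assuming a 64KiB
	// alignment request: count_trailing_zeros(65536) == 16, so the value
	// OR'd into the flags is 16 << MAP_ALIGNMENT_SHIFT, i.e. MAP_ALIGNED(16).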
+ map_aligned_n: uintptr + if alignment >= PAGE_SIZE { + map_aligned_n = intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + } + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + _allocate_virtual_memory_manually_aligned(size, alignment) + } + return rawptr(result) +} + +_allocate_virtual_memory_manually_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // FreeBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_linux.odin b/base/runtime/virtual_memory_linux.odin new file mode 100644 index 00000000000..94eea7a1373 --- /dev/null +++ b/base/runtime/virtual_memory_linux.odin @@ -0,0 +1,156 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +when ODIN_ARCH == .amd64 { + SYS_mmap :: uintptr(9) + SYS_munmap :: uintptr(11) + SYS_mremap :: uintptr(25) +} else when ODIN_ARCH == .arm32 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .arm64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else when ODIN_ARCH == .i386 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .riscv64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else { + #panic("Syscall numbers related to virtual memory are missing for this Linux architecture.") +} + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x02 +MAP_ANONYMOUS :: 0x20 + +MREMAP_MAYMOVE :: 0x01 + +ENOMEM :: ~uintptr(11) + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // This depends on Transparent HugePage Support being enabled. + result := intrinsics.syscall(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + if uintptr(result) % SUPERPAGE_SIZE != 0 { + // If THP support is not enabled, we may receive an address aligned to a + // page boundary instead, in which case, we must manually align a new + // address. + _free_virtual_memory(rawptr(result), SUPERPAGE_SIZE) + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) + } + // We must over-allocate then adjust the address. + mmap_result := intrinsics.syscall(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(mmap_result) < 0 { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. 
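	// A worked illustration, assuming a 64KiB alignment request and an mmap
	// result of 0x7F00_0001_2000: modulo is 0x2000, delta is 0x1_0000 - 0x2000
	// = 0xE000, and the adjusted address 0x7F00_0002_0000 is 64KiB-aligned;
	// the 0xE000 bytes in front of it (already a whole number of pages) are
	// unmapped below.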
+ delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for mremap and munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), MREMAP_MAYMOVE) + if int(result) < 0 { + return nil + } + return rawptr(result) + } else { + // First, let's try to mremap without MREMAP_MAYMOVE. We might get + // lucky and the operating system could expand (or shrink, as the case + // may be) the pages in place, which means we don't have to allocate a + // whole new chunk of memory. + mremap_result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if mremap_result != ENOMEM { + // We got lucky. + return rawptr(mremap_result) + } + + // mremap failed to resize the memory in place, which means we must + // allocate an entirely new aligned chunk of memory, copy the old data, + // and free the old pointer before returning the new one. + // + // This is costly but unavoidable with the API available to us. + result := _allocate_virtual_memory_aligned(new_size, alignment) + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + _free_virtual_memory(ptr, old_size) + return result + } +} diff --git a/base/runtime/virtual_memory_netbsd.odin b/base/runtime/virtual_memory_netbsd.odin new file mode 100644 index 00000000000..46c475d9a6f --- /dev/null +++ b/base/runtime/virtual_memory_netbsd.odin @@ -0,0 +1,72 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(197) +SYS_mremap :: uintptr(411) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to NetBSD only. +/* + * Alignment (expressed in log2). Must be >= log2(PAGE_SIZE) and + * < # bits in a pointer (32 or 64). 
+ */ +// #define MAP_ALIGNED(n) ((int)((unsigned int)(n) << MAP_ALIGNMENT_SHIFT)) +MAP_ALIGNMENT_SHIFT :: 24 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if NetBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // NOTE: Unlike FreeBSD, the NetBSD man pages do not indicate that + // `MAP_ALIGNED` can cause this to fail, so we don't try a second time with + // manual alignment. + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if !ok { + return nil + } + return rawptr(result) + } else { + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), map_aligned_n) + if !ok { + return nil + } + return rawptr(result) + } +} diff --git a/base/runtime/virtual_memory_openbsd.odin b/base/runtime/virtual_memory_openbsd.odin new file mode 100644 index 00000000000..6a37d4ae670 --- /dev/null +++ b/base/runtime/virtual_memory_openbsd.odin @@ -0,0 +1,104 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(49) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if OpenBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. 
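	// For example, any page-aligned address X has X % 4096 == 0, and therefore
	// also X % 2048 == 0, X % 1024 == 0, and so on down to 1, so no adjustment
	// is needed for these smaller alignments.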
+ result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // OpenBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_other.odin b/base/runtime/virtual_memory_other.odin new file mode 100644 index 00000000000..2c686051050 --- /dev/null +++ b/base/runtime/virtual_memory_other.odin @@ -0,0 +1,30 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +VIRTUAL_MEMORY_SUPPORTED :: false + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} diff --git a/base/runtime/virtual_memory_windows.odin b/base/runtime/virtual_memory_windows.odin new file mode 100644 index 00000000000..cd39a3170b0 --- /dev/null +++ b/base/runtime/virtual_memory_windows.odin @@ -0,0 +1,149 @@ +#+private +package runtime + +import "base:intrinsics" + +foreign import Kernel32 "system:Kernel32.lib" +foreign import OneCore "system:OneCore.lib" + +VIRTUAL_MEMORY_SUPPORTED :: true + +// NOTE: Windows makes a distinction between page size and allocation +// granularity. Addresses returned from the allocation procedures are rounded +// to allocation granularity boundaries, at a minimum, not page sizes. 
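A minimal sketch, not part of this patch, of how the distinction can be observed with the `GetSystemInfo` binding declared below; on typical x86-64 Windows installations `dwPageSize` is 4096 while `dwAllocationGranularity` is 65536:

	example_query_granularity :: proc "contextless" () -> (page_size, granularity: u32) {
		sys_info: SYSTEM_INFO
		GetSystemInfo(&sys_info)
		return sys_info.dwPageSize, sys_info.dwAllocationGranularity
	}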
+ +@(default_calling_convention="system") +foreign Kernel32 { + GetLastError :: proc() -> u32 --- + GetSystemInfo :: proc(lpSystemInfo: ^SYSTEM_INFO) --- +} + +@(default_calling_convention="system") +foreign OneCore { + VirtualAlloc :: proc(lpAddress: rawptr, dwSize: uint, flAllocationType: u32, flProtect: u32) -> rawptr --- + VirtualAlloc2 :: proc( + Process: rawptr, + BaseAddress: rawptr, + Size: uint, + AllocationType: u32, + PageProtection: u32, + ExtendedParameters: [^]MEM_EXTENDED_PARAMETER, + ParameterCount: u32) -> rawptr --- + VirtualFree :: proc(lpAddress: rawptr, dwSize: uint, dwFreeType: u32) -> b32 --- +} + +SYSTEM_INFO :: struct { + using DUMMYUNIONNAME: struct #raw_union { + dwOemId: u32, + using DUMMYSTRUCTNAME:struct { + wProcessorArchitecture: u16, + wReserved: u16, + }, + }, + dwPageSize: u32, + lpMinimumApplicationAddress: rawptr, + lpMaximumApplicationAddress: rawptr, + dwActiveProcessorMask: uint, + dwNumberOfProcessors: u32, + dwProcessorType: u32, + dwAllocationGranularity: u32, + wProcessorLevel: u16, + wProcessorRevision: u16, +} + +MemExtendedParameterAddressRequirements :: 0x01 +MEM_EXTENDED_PARAMETER_TYPE_BITS :: 8 + +MEM_ADDRESS_REQUIREMENTS :: struct { + LowestStartingAddress: rawptr, + HighestEndingAddress: rawptr, + Alignment: uint, +} + +MEM_EXTENDED_PARAMETER :: struct { + using DUMMYSTRUCTNAME: bit_field u64 { + Type: u64 | MEM_EXTENDED_PARAMETER_TYPE_BITS, + Reserved: u64 | 64 - MEM_EXTENDED_PARAMETER_TYPE_BITS, + }, + using DUMMYUNIONNAME: struct #raw_union { + ULong64: u64, + Pointer: rawptr, + Size: uint, + Handle: rawptr, + ULong: u32, + }, +} + + +MEM_COMMIT :: 0x00001000 +MEM_RESERVE :: 0x00002000 +MEM_RELEASE :: 0x00008000 + +PAGE_READWRITE :: 0x04 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + // `Size` must be a multiple of the page size. + rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc(nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE) + if result == nil { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // TODO: Windows has support for Large Pages, but its usage requires privilege escalation. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + sys_info: SYSTEM_INFO + GetSystemInfo(&sys_info) + if alignment <= int(sys_info.dwAllocationGranularity) { + // The alignment is less than or equal to the allocation granularity, + // which means it will automatically be aligned and any request for + // alignment less than the allocation granularity will result in + // ERROR_INVALID_PARAMETER. + return _allocate_virtual_memory(size) + } + addr_req := MEM_ADDRESS_REQUIREMENTS{ + LowestStartingAddress = nil, + HighestEndingAddress = nil, + Alignment = uint(alignment), + } + param := MEM_EXTENDED_PARAMETER{ + Type = MemExtendedParameterAddressRequirements, + Pointer = &addr_req, + } + // `Size` must be a multiple of the page size. 
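	// A worked illustration: a request for 5000 bytes is not a multiple of the
	// 4KiB page size, so it is rounded up to (5000 / 4096 + 1) * 4096 = 8192.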
+ rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc2(nil, nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE, ¶m, 1) + if result == nil { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + VirtualFree(ptr, 0, MEM_RELEASE) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // There is no system support for resizing addresses returned by VirtualAlloc. + // All we can do is request a new address, copy the data, and free the old. + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + VirtualFree(ptr, 0, MEM_RELEASE) + return result +} From 754e5a28f1e1e020faebda36656811f996f9488d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:38:09 -0500 Subject: [PATCH 03/21] Add native heap allocator - Add the test bench for the allocator - Move old allocator code to the test bench - Fix `heap_resize` usage in `os2/env_linux.odin` to fit new API requiring `old_size` --- base/runtime/heap_allocator.odin | 134 +- base/runtime/heap_allocator_control.odin | 89 + base/runtime/heap_allocator_debugging.odin | 93 + .../heap_allocator_implementation.odin | 1880 +++++++++++++++++ base/runtime/heap_allocator_info.odin | 156 ++ core/os/os2/env_linux.odin | 2 +- tests/heap_allocator/libc/heap_allocator.odin | 127 ++ .../libc}/heap_allocator_orca.odin | 2 +- .../libc}/heap_allocator_other.odin | 2 +- .../libc}/heap_allocator_unix.odin | 2 +- .../libc}/heap_allocator_windows.odin | 2 +- tests/heap_allocator/test_bench.odin | 1493 +++++++++++++ 12 files changed, 3887 insertions(+), 95 deletions(-) create mode 100644 base/runtime/heap_allocator_control.odin create mode 100644 base/runtime/heap_allocator_debugging.odin create mode 100644 base/runtime/heap_allocator_implementation.odin create mode 100644 base/runtime/heap_allocator_info.odin create mode 100644 tests/heap_allocator/libc/heap_allocator.odin rename {base/runtime => tests/heap_allocator/libc}/heap_allocator_orca.odin (95%) rename {base/runtime => tests/heap_allocator/libc}/heap_allocator_other.odin (94%) rename {base/runtime => tests/heap_allocator/libc}/heap_allocator_unix.odin (96%) rename {base/runtime => tests/heap_allocator/libc}/heap_allocator_windows.odin (97%) create mode 100644 tests/heap_allocator/test_bench.odin diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 4b04dffef15..3fa8820a65c 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -9,111 +9,65 @@ heap_allocator :: proc() -> Allocator { } } -heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, - size, alignment: int, - old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { - // - // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. - // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert - // padding. We also store the original pointer returned by heap_alloc right before - // the pointer we return to the user. 
- // - - aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { - // Not(flysand): We need to reserve enough space for alignment, which - // includes the user data itself, the space to store the pointer to - // allocation start, as well as the padding required to align both - // the user data and the pointer. - a := max(alignment, align_of(rawptr)) - space := a-1 + size_of(rawptr) + size - allocated_mem: rawptr - - force_copy := old_ptr != nil && alignment > align_of(rawptr) - - if old_ptr != nil && !force_copy { - original_old_ptr := ([^]rawptr)(old_ptr)[-1] - allocated_mem = heap_resize(original_old_ptr, space) - } else { - allocated_mem = heap_alloc(space, zero_memory) - } - aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) - - ptr := uintptr(aligned_mem) - aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) - if allocated_mem == nil { - aligned_free(old_ptr) - aligned_free(allocated_mem) +heap_allocator_proc :: proc( + allocator_data: rawptr, + mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, + old_size: int, + loc := #caller_location, +) -> ([]byte, Allocator_Error) { + assert(alignment <= HEAP_MAX_ALIGNMENT, "Heap allocation alignment beyond HEAP_MAX_ALIGNMENT bytes is not supported.", loc = loc) + assert(alignment >= 0, "Alignment must be greater than or equal to zero.", loc = loc) + switch mode { + case .Alloc: + // All allocations are aligned to at least their size up to + // `HEAP_MAX_ALIGNMENT`, and by virtue of binary arithmetic, any + // address aligned to N will also be aligned to N>>1. + // + // Therefore, we have no book-keeping costs for alignment. + ptr := heap_alloc(max(size, alignment)) + if ptr == nil { return nil, .Out_Of_Memory } - - aligned_mem = rawptr(aligned_ptr) - ([^]rawptr)(aligned_mem)[-1] = allocated_mem - - if force_copy { - mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) - aligned_free(old_ptr) - } - - return byte_slice(aligned_mem, size), nil - } - - aligned_free :: proc(p: rawptr) { - if p != nil { - heap_free(([^]rawptr)(p)[-1]) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Alloc_Non_Zeroed: + ptr := heap_alloc(max(size, alignment), zero = false) + if ptr == nil { + return nil, .Out_Of_Memory } - } - - aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { - if p == nil { - return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize: + ptr := heap_resize(old_memory, old_size, max(size, alignment)) + if ptr == nil { + return nil, .Out_Of_Memory } - - new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return - - // NOTE: heap_resize does not zero the new memory, so we do it - if zero_memory && new_size > old_size { - new_region := raw_data(new_memory[old_size:]) - intrinsics.mem_zero(new_region, new_size - old_size) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize_Non_Zeroed: + ptr := heap_resize(old_memory, old_size, max(size, alignment), zero = false) + if ptr == nil { + return nil, .Out_Of_Memory } - return - } - - switch mode { - case .Alloc, .Alloc_Non_Zeroed: - return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) - + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil case .Free: - aligned_free(old_memory) - + 
heap_free(old_memory) case .Free_All: return nil, .Mode_Not_Implemented - - case .Resize, .Resize_Non_Zeroed: - return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) - case .Query_Features: set := (^Allocator_Mode_Set)(old_memory) if set != nil { - set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + set^ = { + .Alloc, + .Alloc_Non_Zeroed, + .Resize, + .Resize_Non_Zeroed, + .Free, + .Query_Features, + } } return nil, nil - case .Query_Info: return nil, .Mode_Not_Implemented } - return nil, nil } - - -heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { - return _heap_alloc(size, zero_memory) -} - -heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { - return _heap_resize(ptr, new_size) -} - -heap_free :: proc "contextless" (ptr: rawptr) { - _heap_free(ptr) -} \ No newline at end of file diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin new file mode 100644 index 00000000000..356e00477ae --- /dev/null +++ b/base/runtime/heap_allocator_control.odin @@ -0,0 +1,89 @@ +package runtime + +import "base:intrinsics" + +/* +Merge all remote frees then free as many slabs as possible. + +This bypasses any heuristics that keep slabs setup. + +Returns true if the superpage was emptied and freed. +*/ +@(private) +compact_superpage :: proc "contextless" (superpage: ^Heap_Superpage) -> (freed: bool) { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else if slab.bin_size > 0 { + i += 1 + } else { + i += 1 + continue + } + + slab_is_cached := slab.free_bins > 0 + heap_merge_remote_frees(slab) + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab_is_cached { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + heap_free_slab(superpage, slab) + } + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + freed = true + } + return +} + +/* +Merge all remote frees then free as many slabs and superpages as possible. + +This bypasses any heuristics that keep slabs setup. +*/ +compact_heap :: proc "contextless" () { + superpage := local_heap + for { + if superpage == nil { + return + } + next_superpage := superpage.next + compact_superpage(superpage) + superpage = next_superpage + } +} + +/* +Free any empty superpages in the orphanage. + +This procedure assumes there won't ever be more than 128 superpages in the +orphanage. This limitation is due to the avoidance of heap allocation. +*/ +compact_heap_orphanage :: proc "contextless" () { + // First, try to empty the orphanage so that we can evaluate each superpage. + buffer: [128]^Heap_Superpage + for i := 0; i < len(buffer); i += 1 { + buffer[i] = heap_pop_orphan() + if buffer[i] == nil { + break + } + } + + // Next, compact each superpage and push it back to the orphanage if it was + // not freed. 
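	// A minimal usage sketch, not part of this patch: a long-running program
	// that wants to hand unused memory back to the operating system at a
	// quiescent point could call both compaction entry points, e.g.
	//
	//	compact_heap()
	//	compact_heap_orphanage()
	//
	// compacting the calling thread's own heap and then any superpages left
	// behind in the orphanage by threads that have since exited.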
+ for superpage in buffer { + if !compact_superpage(superpage) { + heap_push_orphan(superpage) + } + } +} diff --git a/base/runtime/heap_allocator_debugging.odin b/base/runtime/heap_allocator_debugging.odin new file mode 100644 index 00000000000..98225fddb25 --- /dev/null +++ b/base/runtime/heap_allocator_debugging.odin @@ -0,0 +1,93 @@ +package runtime + +import "base:intrinsics" + +ODIN_DEBUG_HEAP :: #config(ODIN_DEBUG_HEAP, false) + +Heap_Code_Coverage_Type :: enum { + Alloc_Bin, + Alloc_Collected_Remote_Frees, + Alloc_Heap_Initialized, + Alloc_Huge, + Alloc_Slab_Wide, + Alloc_Slab_Wide_Needed_New_Superpage, + Alloc_Slab_Wide_Used_Available_Superpage, + Alloc_Zeroed_Memory, + Freed_Bin_Freed_Slab_Which_Was_Fully_Used, + Freed_Bin_Freed_Superpage, + Freed_Bin_Reopened_Full_Slab, + Freed_Bin_Updated_Slab_Next_Free_Sector, + Freed_Huge_Allocation, + Freed_Wide_Slab, + Heap_Expanded_Cache_Data, + Huge_Alloc_Size_Adjusted, + Huge_Alloc_Size_Set_To_Superpage, + Merged_Remote_Frees, + Orphaned_Superpage_Freed_Slab, + Orphaned_Superpage_Merged_Remote_Frees, + Remotely_Freed_Bin, + Remotely_Freed_Bin_Caused_Remote_Superpage_Caching, + Remotely_Freed_Wide_Slab, + Resize_Caused_Memory_Zeroing, + Resize_Crossed_Size_Categories, + Resize_Huge, + Resize_Huge_Caused_Memory_Zeroing, + Resize_Huge_Size_Adjusted, + Resize_Huge_Size_Set_To_Superpage, + Resize_Kept_Old_Pointer, + Resize_Wide_Slab_Caused_Memory_Zeroing, + Resize_Wide_Slab_Expanded_In_Place, + Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion, + Resize_Wide_Slab_From_Remote_Thread, + Resize_Wide_Slab_Kept_Old_Pointer, + Resize_Wide_Slab_Shrunk_In_Place, + Slab_Adjusted_For_Partial_Sector, + Superpage_Add_Remote_Free_Guarded_With_Masterless, + Superpage_Add_Remote_Free_Guarded_With_Set, + Superpage_Added_Remote_Free, + Superpage_Added_To_Open_Cache_By_Freeing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Slab, + Superpage_Adopted_From_Orphanage, + Superpage_Created_By_Empty_Orphanage, + Superpage_Freed_By_Exiting_Thread, + Superpage_Freed_By_Wide_Slab, + Superpage_Freed_On_Full_Orphanage, + Superpage_Linked, + Superpage_Cache_Block_Cleared, + Superpage_Orphaned_By_Exiting_Thread, + Superpage_Pushed_To_Orphanage, + Superpage_Registered_With_Free_Slabs, + Superpage_Registered_With_Slab_In_Use, + Superpage_Removed_From_Open_Cache_By_Slab, + Superpage_Unlinked_Non_Tail, + Superpage_Unlinked_Tail, + Superpage_Unregistered, + Superpage_Updated_Next_Free_Slab_Index, + Superpage_Updated_Next_Free_Slab_Index_As_Empty, +} + +when ODIN_DEBUG_HEAP { + heap_global_code_coverage: [Heap_Code_Coverage_Type]int // atomic +} + +@(private, disabled=!ODIN_DEBUG_HEAP) +heap_debug_cover :: #force_inline proc "contextless" (type: Heap_Code_Coverage_Type) { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_add_explicit(&heap_global_code_coverage[type], 1, .Release) + } +} + +_check_heap_code_coverage :: proc "contextless" () -> bool { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_thread_fence(.Seq_Cst) + for t in heap_global_code_coverage { + if t == 0 { + return false + } + } + return true + } else { + panic_contextless("ODIN_DEBUG_HEAP is not enabled, therefore the results of this procedure are meaningless.") + } +} diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin new file mode 100644 index 00000000000..ab7372e3c6a --- /dev/null +++ b/base/runtime/heap_allocator_implementation.odin @@ -0,0 +1,1880 @@ +package runtime + +import "base:intrinsics" + +/* 
+This is the dynamic heap allocator for the Odin runtime. + +**Features** + +- Lock-free guarantee: A thread cannot deadlock or obstruct the allocator; + some thread makes progress no matter the parallel conditions. + +- Thread-local heaps: There is no allocator-induced false sharing. + +- Global storage for unused memory: When a thread finishes cleanly, its memory + is sent to a global storage where other threads can use it. + +- Headerless: All bin-sized allocations (any power of two, less than or equal + to 32KiB, by default) consume no extra space, and as a result of being tightly + packed, enhance performance with cache locality for programs. + + +**Terminology** + +- Bin: an allocation of a fixed size, shared with others of the same size category. + +- Slab: a fixed-size block within a Superpage that is divided into a constant + number of Bins at runtime based on the needs of the program. + +- Sector: a variable-sized bitmap, used to track whether a Bin is free or not. + + +**Allocation Categories** + +- Huge: anything in excess of 3/4ths of a Superpage. +- Slab-wide: anything in excess of the largest Bin size. +- Bin: anything less than or equal to the largest bin size. +*/ + +// +// Tunables +// + +// NOTE: Adjusting this constant by itself is rarely enough; `HEAP_SUPERPAGE_CACHE_RATIO` +// will have to be changed as well. +HEAP_SLAB_SIZE :: #config(ODIN_HEAP_SLAB_SIZE, 64 * Kilobyte) +HEAP_MAX_BIN_SIZE :: #config(ODIN_HEAP_MAX_BIN_SIZE, HEAP_SLAB_SIZE / 2) +HEAP_MIN_BIN_SIZE :: #config(ODIN_HEAP_MIN_BIN_SIZE, 8 * Byte) +HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES :: #config(ODIN_HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES, 3) +HEAP_SUPERPAGE_CACHE_RATIO :: #config(ODIN_HEAP_SUPERPAGE_CACHE_RATIO, 20) +HEAP_PANIC_ON_DOUBLE_FREE :: #config(ODIN_HEAP_PANIC_ON_DOUBLE_FREE, true) +HEAP_PANIC_ON_FREE_NIL :: #config(ODIN_HEAP_PANIC_ON_FREE_NIL, false) + +// +// Constants +// + +HEAP_MAX_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MAX_BIN_SIZE) +HEAP_MIN_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MIN_BIN_SIZE) +HEAP_BIN_RANKS :: 1 + HEAP_MAX_BIN_SHIFT - HEAP_MIN_BIN_SHIFT +HEAP_HUGE_ALLOCATION_BOOK_KEEPING :: size_of(int) + HEAP_MAX_ALIGNMENT +HEAP_HUGE_ALLOCATION_THRESHOLD :: SUPERPAGE_SIZE / 4 * 3 +HEAP_MAX_ALIGNMENT :: 64 * Byte +HEAP_CACHE_SLAB_MAP_STRIDE :: HEAP_SUPERPAGE_CACHE_RATIO * HEAP_SLAB_COUNT +HEAP_SECTOR_TYPES :: 2 // { local_free, remote_free } +HEAP_SLAB_ALLOCATION_BOOK_KEEPING :: size_of(Heap_Slab) + HEAP_SECTOR_TYPES * size_of(uint) + HEAP_MAX_ALIGNMENT +HEAP_SLAB_COUNT :: SUPERPAGE_SIZE / HEAP_SLAB_SIZE - 1 + +@(private="file") INTEGER_BITS :: 8 * size_of(int) +@(private="file") SECTOR_BITS :: 8 * size_of(uint) + +// +// Sanity checking +// + +#assert(HEAP_MAX_BIN_SIZE & (HEAP_MAX_BIN_SIZE-1) == 0, "HEAP_MAX_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_BIN_SIZE < HEAP_SLAB_SIZE - size_of(Heap_Slab) - HEAP_SECTOR_TYPES * size_of(uint), "HEAP_MAX_BIN_SIZE must be able to fit into one Slab, including its free maps.") +#assert(HEAP_MAX_BIN_SIZE >= HEAP_MIN_BIN_SIZE, "HEAP_MAX_BIN_SIZE must be greater than or equal to HEAP_MIN_BIN_SIZE.") +#assert(HEAP_HUGE_ALLOCATION_THRESHOLD <= SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING, "HEAP_HUGE_ALLOCATION_THRESHOLD must be smaller than a Superpage, with enough space for HEAP_HUGE_ALLOCATION_BOOK_KEEPING.") +#assert(HEAP_MIN_BIN_SIZE & (HEAP_MIN_BIN_SIZE-1) == 0, "HEAP_MIN_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES >= 0, "HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES must be positive.") 
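// A worked check of the defaults above, assuming the common 2MiB
// SUPERPAGE_SIZE: HEAP_SLAB_COUNT is 2MiB/64KiB - 1 = 31, HEAP_MAX_BIN_SIZE is
// 64KiB/2 = 32KiB, and HEAP_BIN_RANKS is 1 + 15 - 3 = 13, i.e. bin sizes of
// 8, 16, 32, ..., 32768 bytes.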
+#assert(HEAP_MAX_ALIGNMENT & (HEAP_MAX_ALIGNMENT-1) == 0, "HEAP_MAX_ALIGNMENT must be a power of two.") +#assert(HEAP_SLAB_COUNT > 0, "HEAP_SLAB_COUNT must be greater than zero.") +#assert(HEAP_SLAB_SIZE & (HEAP_SLAB_SIZE-1) == 0, "HEAP_SLAB_SIZE must be a power of two.") +#assert(SUPERPAGE_SIZE >= 2 * HEAP_SLAB_SIZE, "SUPERPAGE_SIZE must be at least twice HEAP_SLAB_SIZE.") +#assert(size_of(Heap_Superpage) < HEAP_SLAB_SIZE, "The Superpage struct must not exceed the size of a Slab.") + +// +// Utility Procedures +// + +@(require_results, private) +round_to_nearest_power_of_two :: #force_inline proc "contextless" (n: uint) -> int { + assert_contextless(n > 1, "This procedure does not handle the edge case of n < 2.") + return 1 << (INTEGER_BITS - intrinsics.count_leading_zeros(n-1)) +} + +@(require_results) +heap_slabs_needed_for_size :: #force_inline proc "contextless" (size: int) -> (result: int) { + assert_contextless(size > 0) + assert_contextless(size > HEAP_MAX_BIN_SIZE) + size := size + size += HEAP_SLAB_ALLOCATION_BOOK_KEEPING + result = size / HEAP_SLAB_SIZE + (0 if size % HEAP_SLAB_SIZE == 0 else 1) + assert_contextless(result > 0) + assert_contextless(result < HEAP_SLAB_COUNT, "Calculated an overly-large Slab-wide allocation.") + return +} + +@(require_results) +heap_round_to_bin_size :: #force_inline proc "contextless" (n: int) -> int { + if n <= HEAP_MIN_BIN_SIZE { + return HEAP_MIN_BIN_SIZE + } + m := round_to_nearest_power_of_two(uint(n)) + assert_contextless(m & (m-1) == 0, "Internal rounding error.") + return m +} + +@(require_results) +heap_bin_size_to_rank :: #force_inline proc "contextless" (size: int) -> (rank: int) { + // By this point, a size of zero should've been rounded up to HEAP_MIN_BIN_SIZE. + assert_contextless(size > 0, "Size must be greater-than zero.") + assert_contextless(size & (size-1) == 0, "Size must be a power of two.") + rank = intrinsics.count_trailing_zeros(size >> HEAP_MIN_BIN_SHIFT) + assert_contextless(rank <= HEAP_BIN_RANKS, "Bin rank calculated incorrectly; it must be less-than-or-equal-to HEAP_BIN_RANKS.") + return +} + +@(require_results) +find_superpage_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Superpage { + return cast(^Heap_Superpage)(uintptr(ptr) & ~uintptr(SUPERPAGE_SIZE-1)) +} + +@(require_results) +find_slab_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Slab { + return cast(^Heap_Slab)(uintptr(ptr) & ~uintptr(HEAP_SLAB_SIZE-1)) +} + +/* +Get a specific slab by index from a superpage. + +The position deltas are all constant with respect to the origin, hence this +procedure should prove faster than accessing an array of pointers. +*/ +@(require_results) +heap_superpage_index_slab :: #force_inline proc "contextless" (superpage: ^Heap_Superpage, index: int) -> (slab: ^Heap_Slab) { + assert_contextless(index >= 0, "The heap allocator tried to index a negative slab index.") + assert_contextless(index < HEAP_SLAB_COUNT, "The heap allocator tried to index a slab beyond the configured maximum.") + return cast(^Heap_Slab)(uintptr(superpage) + HEAP_SLAB_SIZE * uintptr(1 + index)) +} + +// +// Data Structures +// + +/* +The **Slab** is a fixed-size division of a Superpage, configured at runtime to +contain fixed-size allocations. It uses two bitmaps to keep track of the +state of its bins: whether a bin is locally free or remotely free. + +It is a Slab allocator in its own right, hence the name. 
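As a concrete illustration (assuming the default tunables): a 24-byte request
is rounded up to a 32-byte bin, a 64KiB slab configured for 32-byte bins hands
out roughly (HEAP_SLAB_SIZE - book-keeping) / 32 of them, and any pointer `p`
served from the slab recovers its owning slab in constant time through
`find_slab_from_pointer(p)`, i.e. `p & ~(HEAP_SLAB_SIZE - 1)`.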
+ +Each allocation is self-aligned, up to an alignment size of `HEAP_MAX_ALIGNMENT` +(64 bytes by default). The slab itself is aligned to its size, allowing +allocations within to find their owning slab in constant time. + +Remote threads, in an atomic wait-free manner, flip bits across `remote_free` +to signal that they have freed memory which does not belong to them. The owning +thread will then collect those bits when the slab is approaching fullness. + + +**Fields**: + +`index` is used to make self-referencing easier. It must be the first field in +the Slab, as `heap_slab_clear_data` depends on this. + +`bin_size` tracks the precise byte size of the allocations. + +`is_dirty` indicates that the slab has been used in the past and its memory may +not be entirely zeroed. + +`is_full` is true at some point after `free_bins` == 0 and false otherwise. +It is atomic and used for inter-thread communication. + +`has_remote_frees` is true at some point after a bit has been flipped on in +`remote_free`. It is atomic and used for inter-thread communication. + +`max_bins` is set at initialization with the number of bins allotted. + +`free_bins` counts the exact number of free bins known to the allocating +thread. This value does not yet include any remote free bins. + +`dirty_bins` tracks which bins have been used and must be zeroed before being +handed out again. Because we always allocate linearly, this is simply an +integer indicating the greatest index the slab has ever distributed. + +`next_free_sector` points to which `uint` in `local_free` contains a free bit. +It is always the lowest index possible. + +`sectors` is the length of each bitmap. + +`local_free` tracks which bins are free. + +`remote_free` tracks which bins have been freed by other threads. +It is atomic. + +`data` points to the first bin and is used for calculating bin positions. +*/ +Heap_Slab :: struct { + index: int, // This field must be the first. + bin_size: int, + is_dirty: bool, + + max_bins: int, + free_bins: int, + dirty_bins: int, + next_free_sector: int, + + is_full: bool, // atomic + remote_free_bins_scheduled: int, // atomic + + sectors: int, + local_free: [^]uint, + remote_free: [^]uint, // data referenced is atomic + + // NOTE: `remote_free` and its contents should be fine with regards to + // alignment for the purposes of atomic access, as every field in a Slab is + // at least register-sized. + // + // Atomically accessing memory that is not aligned to the register size + // may cause an issue on some systems. + + data: uintptr, + + /* ... local_free's data ... */ + /* ... remote_free's data ... */ + /* ... allocation data ... */ +} + +/* +The **Superpage** is a single allocation from the operating system's virtual +memory subsystem, always with a fixed size at compile time, that is used to +store almost every allocation requested by the program in sub-allocators known +as Slabs. + +On almost every platform, this structure will be 2MiB by default. + +Depending on the operating system, addresses within the space occupied by the +superpage (and hence its allocations) may also have faster access times due to +leveraging properties of the Translation Lookaside Buffer. + +It is always aligned to its size, allowing any address allocated from its space +to look up the owning superpage in constant-time with bit masking. + +**Fields:** + +`huge_size` is precisely how many bytes have been allocated to the address at +which this structure resides, if it is a Huge allocation. It is zero for normal +superpages. 
+ +`prev` points to the previous superpage in the doubly-linked list of all +superpages for a single thread's heap. + +`next` points to the next superpage in the same doubly-linked list. It is +accessed with atomics only when engaging with the orphanage, as that is the +only time it should change in a parallel situation. For all other cases, the +thread which owns the superpage is the only one to read this field. + +`remote_free_set` is true if there might be a slab with remote frees in this superpage. + +`owner` is the value of `current_thread_id` for the thread which owns this superpage.. + +`master_cache_block` points to the `local_heap_cache` for the thread which owns this +superpage. This field is used to help synchronize remote freeing. + +`free_slabs` is the count of slabs which are ready to use for new bin ranks. + +`next_free_slab_index` is the lowest index of a free slab. +A value of `HEAP_SLAB_COUNT` means there are no free slabs. + +`cache_block` is the space where data for the heap's cache is stored, if this +superpage has been claimed by the heap. It should otherwise be all zero. +*/ +Heap_Superpage :: struct { + huge_size: int, // This field must be the first. + prev: ^Heap_Superpage, + next: ^Heap_Superpage, // atomic in orphanage, otherwise non-atomic + + remote_free_set: bool, // atomic + owner: int, // atomic + master_cache_block: ^Heap_Cache_Block, // atomic + + free_slabs: int, + next_free_slab_index: int, + + cache_block: Heap_Cache_Block, +} + +/* +`Heap_Cache_Block` is a structure that lives within each `Heap_Superpage` that has been +claimed by a thread's heap to store heap-relevant metadata for improving +allocator performance. This structure makes use of space that would have +otherwise been unused due to the particular alignment needs the allocator has. + +The number of superpages that each `Heap_Cache_Block` oversees is dictated by the +`HEAP_SUPERPAGE_CACHE_RATIO` constant. As a thread's heap accumulates more +superpages, the heap will need to claim additional superpages - at the rate of +that constant - to keep track of all the possible data. + +The arrays are configured in such a manner that they can never overflow, and +the heap will always have more than enough space to record the data needed. + + +The `HEAP_CACHE_SLAB_MAP_STRIDE` is of note, as it must contain the number of slabs +per superpage (31, by default) times the number of superpages overseen per +`Heap_Cache_Block`. This is largely where all the space is allocated, but it pays off +handsomely in allowing the allocator to find if a slab is available for any +particular bin size in constant time. + +In practice, this is an excessive amount of space allocated for a map of arrays +for this purpose, but it is entirely possible that the allocator could get into +a spot where this hash map could overflow if it used any less space. For +instance, if the allocator had as many superpages as possible for one +`Heap_Cache_Block`, filled all the slabs, then the program freed one bin from each +slab, the allocator would need every slot in the map. The cache has as much +space to prevent just that problem from arising. + +**Fields:** + +`in_use` will be true, if the parent `Heap_Superpage` is using the space occupied by +this struct. This indicates that the superpage should not be freed. + +`next_cache_block` is an atomic pointer to the next `Heap_Cache_Block`. Each +`Heap_Cache_Block` is linked together in this way to allow each heap to expand +its available space for tracking information. 
+ + +_The next three fields are only used in the `Heap_Superpage` pointed to by `local_heap`._ + +`length` is how many `Heap_Cache_Block` structs are in use across the local +thread's heap. + +`owned_superpages` is how many superpages are in use by the local thread's +heap. + +`remote_free_count` is an estimate of how many superpages have slabs with +remote frees available to merge. + + +_The next three fields constitute the main data used in this struct, and each +of them are like partitions, spread across a linked list._ + +`slab_map` is a hash map of arrays, keyed by bin rank, used to quickly find a +slab with free bins for a particular bin size. It uses linear probing. + +`superpages_with_free_slabs` is an array of superpages with `free_slabs` +greater than zero, used for quickly allocating new slabs when none can be found +in `slab_map`. + +`superpages_with_remote_frees` is an atomic set containing references to +superpages that may have slabs with remotely free bins. Superpages are only +added if the slab was full at the time, and an invasive flag keeps the +algorithm from having to search the entire span for an existence check. + +Keep in mind that even though a superpage is added to this set when a slab is +full and freed remotely, that state need not persist; the owning thread can +free a bin before the remote frees are merged, invalidating that part of the +heuristic. +*/ +Heap_Cache_Block :: struct { + in_use: bool, + next_cache_block: ^Heap_Cache_Block, // atomic + + // { only used in `local_heap` + length: int, + owned_superpages: int, + remote_free_count: int, // atomic + // } + + slab_map: [HEAP_CACHE_SLAB_MAP_STRIDE*HEAP_BIN_RANKS]^Heap_Slab, + superpages_with_free_slabs: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, + superpages_with_remote_frees: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, // atomic +} + +// +// Superstructure Allocation +// + +/* +Allocate memory for a Superpage from the operating system and do any initialization work. +*/ +@(require_results) +heap_make_superpage :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = cast(^Heap_Superpage)allocate_virtual_memory_superpage() + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.owner = get_current_thread_id() + superpage.free_slabs = HEAP_SLAB_COUNT + + // Each Slab is aligned to its size, so that finding which slab an + // allocated pointer is assigned to is a constant operation by bit masking. + // + // However, this means we must waste one Slab per Superpage so that the + // Superpage data can live nearby inside the chunk of virtual memory. + base := uintptr(superpage) + HEAP_SLAB_SIZE + for i in 0..= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_On_Full_Orphanage) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Pushed_To_Orphanage) + } +} + +/* +Remove a superpage from a heap's cache of superpages. +*/ +heap_cache_unregister_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage.cache_block.in_use { + return + } + local_heap_cache.owned_superpages -= 1 + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Release) + heap_debug_cover(.Superpage_Unregistered) +} + +/* +Add a superpage to a heap's cache of superpages. 
+ +This will take note if the superpage has free slabs, what are the contents of +its slabs, and it will merge any waiting remote free bins. +*/ +heap_cache_register_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + // Expand the heap cache's available space if needed. + local_heap_cache.owned_superpages += 1 + if local_heap_cache.owned_superpages / HEAP_SUPERPAGE_CACHE_RATIO > local_heap_cache.length { + tail := local_heap_cache + for /**/; tail.next_cache_block != nil; tail = tail.next_cache_block { } + heap_cache_expand(tail) + } + + // This must come after expansion to prevent a remote thread from running out of space. + // The cache is first expanded, _then_ other threads may know about it. + intrinsics.atomic_store_explicit(&superpage.master_cache_block, local_heap_cache, .Release) + + // Register the superpage for allocation. + if superpage.free_slabs > 0 { + // Register the superpage as having free slabs available. + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Registered_With_Free_Slabs) + } + + // Register slabs. + if superpage.free_slabs < HEAP_SLAB_COUNT { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else if slab.bin_size > 0 { + i += 1 + } else { + i += 1 + continue + } + + // When adopting a new Superpage, we take the opportunity to + // merge any remote frees. This is important because it's + // possible for another thread to remotely free memory while + // the thread which owned it is in limbo. + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + } + + // Free any empty Slabs and register the ones with free bins. + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + } else if slab.bin_size <= HEAP_MAX_BIN_SIZE && slab.free_bins > 0 { + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + + i += 1 + heap_debug_cover(.Superpage_Registered_With_Slab_In_Use) + } + } +} + +/* +Get a superpage, first by adopting any orphan, or allocating memory from the +operating system if one is not available. +*/ +@(require_results) +heap_get_superpage:: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = heap_pop_orphan() + if superpage == nil { + superpage = heap_make_superpage() + heap_debug_cover(.Superpage_Created_By_Empty_Orphanage) + } else { + heap_debug_cover(.Superpage_Adopted_From_Orphanage) + } + assert_contextless(superpage != nil, "The heap allocator failed to get a superpage.") + return +} + +/* +Make an allocation in the Huge category. +*/ +@(require_results) +heap_make_huge_allocation :: proc "contextless" (size: int) -> (ptr: rawptr) { + // NOTE: ThreadSanitizer may wrongly say that this is the source of a data + // race. This is because a virtual memory address has been allocated once, + // returned to the operating system, then given back to the process in a + // different thread. + // + // It is otherwise impossible for us to race on newly allocated memory. 
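+	//
+	// As a worked example of the layout set up below, assuming the default
+	// constants on a 64-bit target (book-keeping = 8 + 64 = 72 bytes,
+	// HEAP_MAX_ALIGNMENT = 64): the `huge_size` integer is readable at offset 0
+	// of the mapping, and the pointer handed to the user lands on the
+	// HEAP_MAX_ALIGNMENT boundary at offset 64.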
+ size := size + if size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + size = SUPERPAGE_SIZE + heap_debug_cover(.Huge_Alloc_Size_Set_To_Superpage) + } else { + size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Huge_Alloc_Size_Adjusted) + } + assert_contextless(size >= SUPERPAGE_SIZE, "Calculated incorrect Huge allocation size.") + + // All free operations assume every pointer has a Superpage at the + // Superpage boundary of the pointer, and a Huge allocation is no + // different. + // + // The size of the allocation is written as an integer at the beginning, + // but all other fields are left zero-initialized. We then align forward to + // `HEAP_MAX_ALIGNMENT` (64 bytes, by default) and give that to the user. + superpage := cast(^Heap_Superpage)allocate_virtual_memory_aligned(size, SUPERPAGE_SIZE) + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.huge_size = size + + u := uintptr(superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Huge allocation is not aligned to HEAP_MAX_ALIGNMENT.") + assert_contextless(find_superpage_from_pointer(ptr) == superpage, "Huge allocation reverse lookup failed.") + + return +} + +/* +Make an allocation that is at least one entire Slab wide from the provided superpage. + +This will return false if the Superpage lacks enough contiguous Slabs to fit the size. +*/ +@(require_results) +heap_make_slab_sized_allocation :: proc "contextless" (superpage: ^Heap_Superpage, size: int) -> (ptr: rawptr, ok: bool) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "Invalid next_free_slab_index.") + contiguous := heap_slabs_needed_for_size(size) + + find_run: for start := superpage.next_free_slab_index; start < HEAP_SLAB_COUNT-contiguous+1; /**/ { + n := contiguous + + for i := start; i < HEAP_SLAB_COUNT; /**/ { + bin_size := heap_superpage_index_slab(superpage, i).bin_size + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + start = i + heap_slabs_needed_for_size(bin_size) + continue find_run + } else if bin_size != 0 { + start = i + 1 + continue find_run + } + n -= 1 + if n > 0 { + i += 1 + continue + } + // Setup the Slab header. + // This will be a single-sector Slab that may span several Slabs. + slab := heap_superpage_index_slab(superpage, start) + + // Setup slab. + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = size + slab.is_full = true + slab.max_bins = 1 + slab.dirty_bins = 1 + slab.sectors = 1 + slab.local_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab)) + slab.remote_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab) + 1 * size_of(uint)) + data := uintptr(slab) + HEAP_SLAB_ALLOCATION_BOOK_KEEPING + ptr = rawptr(data - data & (HEAP_MAX_ALIGNMENT-1)) + slab.data = uintptr(ptr) + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Slab-wide allocation's data pointer is not correctly aligned.") + assert_contextless(int(uintptr(ptr) - uintptr(superpage)) + size < SUPERPAGE_SIZE, "Incorrectly calculated Slab-wide allocation exceeds Superpage end boundary.") + + // Wipe any non-zero data from slabs ahead of the header. 
+ for x in start+1..=i { + next_slab := heap_superpage_index_slab(superpage, x) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + + // Update statistics. + superpage.free_slabs -= contiguous + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a superpage's free_slabs to go negative.") + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } + // NOTE: Start from zero again, because we may have skipped a non-contiguous block. + heap_update_next_free_slab_index(superpage, 0) + + return ptr, true + } + } + + return +} + +// +// Slabs +// + +/* +Do everything that is needed to ready a Slab for a specific bin size. + +This involves a handful of space calculations and writing the header. +*/ +@(require_results) +heap_slab_setup :: proc "contextless" (superpage: ^Heap_Superpage, rounded_size: int) -> (slab: ^Heap_Slab) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "The heap allocator found a Superpage with an invalid next_free_slab_index.") + assert_contextless(superpage.free_slabs > 0, "The heap allocator tried to setup a Slab in an exhausted Superpage.") + + superpage.free_slabs -= 1 + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a Superpage's free_slabs to go negative.") + + slab = heap_superpage_index_slab(superpage, superpage.next_free_slab_index) + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = rounded_size + + // The book-keeping structures compete for the same space as the data, + // so we have to go back and forth with the math a bit. + bins := HEAP_SLAB_SIZE / rounded_size + sectors := bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + // We'll waste `2 * HEAP_MAX_ALIGNMENT` bytes per slab to simplify the math + // behind getting the pointer to the first bin to align properly. + // Otherwise we'd have to go back and forth even more. + bookkeeping_bin_cost := max(1, int(size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + 2 * HEAP_MAX_ALIGNMENT) / rounded_size) + bins -= bookkeeping_bin_cost + sectors = bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + slab.sectors = sectors + slab.free_bins = bins + slab.max_bins = bins + + base_alignment := uintptr(min(HEAP_MAX_ALIGNMENT, rounded_size)) + + pointer_padding := (uintptr(base_alignment) - (uintptr(slab) + size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint))) & uintptr(base_alignment - 1) + + // These bitmaps are placed at the end of the struct, one after the other. + slab.local_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab)) + slab.remote_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab) + uintptr(sectors) * size_of(uint)) + // This pointer is specifically aligned. + slab.data = (uintptr(slab) + size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + pointer_padding) + + assert_contextless(slab.data & (base_alignment-1) == 0, "Incorrect calculation for aligning Slab data pointer.") + assert_contextless(size_of(Heap_Slab) + HEAP_SECTOR_TYPES * sectors * size_of(uint) + bins * rounded_size < HEAP_SLAB_SIZE, "Slab internal allocation overlimit.") + assert_contextless(find_slab_from_pointer(rawptr(slab.data)) == slab, "Slab data pointer cannot be traced back to its Slab.") + + // Set all of the local free bits. 
+ { + full_sectors := bins / INTEGER_BITS + partial_sector := bins % INTEGER_BITS + for i in 0.. 0 { + slab.local_free[sectors-1] = (1 << uint(bins % INTEGER_BITS)) - 1 + heap_debug_cover(.Slab_Adjusted_For_Partial_Sector) + } + } + + // Update the next free slab. + heap_update_next_free_slab_index(superpage, superpage.next_free_slab_index + 1) + + // Make the slab known to the heap. + heap_cache_add_slab(slab, heap_bin_size_to_rank(rounded_size)) + + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Removed_From_Open_Cache_By_Slab) + } + + return +} + +/* +Set everything after the index field to zero in a Slab. +*/ +heap_slab_clear_data :: proc "contextless" (slab: ^Heap_Slab) { + intrinsics.mem_zero_volatile(rawptr(uintptr(slab) + size_of(int)), HEAP_SLAB_SIZE - size_of(int)) +} + +/* +Mark a Slab-wide allocation as no longer in use by the Superpage. +*/ +heap_free_wide_slab :: proc "contextless" (superpage: ^Heap_Superpage, slab: ^Heap_Slab) { + assert_contextless(slab.bin_size > HEAP_MAX_BIN_SIZE, "The heap allocator tried to wide-free a non-wide slab.") + // There is only one bit for a Slab-wide allocation, so this will be easy. + contiguous := heap_slabs_needed_for_size(slab.bin_size) + previously_full_superpage := superpage.free_slabs == 0 + superpage.free_slabs += contiguous + + for i in slab.index.. (bins_left: int) { + assert_contextless(slab.bin_size > 0, "The heap allocator tried to merge remote frees on an unused slab.") + + merged_bins := 0 + next_free_sector := slab.next_free_sector + + // Atomically merge in all of the bits set by other threads. + for i in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(bin_size) + } else { + i += 1 + } + } + + panic_contextless("The heap allocator was unable to find a free slab in a superpage with free_slabs > 0.") +} + +// +// The Heap Cache +// + +/* +Link a new superpage into the heap. +*/ +heap_link_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(superpage.prev == nil, "The heap allocator tried to link an already-linked superpage.") + superpage.prev = local_heap_tail + local_heap_tail.next = superpage + local_heap_tail = superpage + heap_debug_cover(.Superpage_Linked) +} + +/* +Unlink a superpage from the heap. + +There must always be at least one superpage in the heap after the first +non-Huge allocation, in order to track heap metadata. +*/ +heap_unlink_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage == local_heap_tail { + assert_contextless(superpage.next == nil, "The heap allocator's tail superpage has a next link.") + assert_contextless(superpage.prev != nil, "The heap allocator's tail superpage has no previous link.") + local_heap_tail = superpage.prev + // We never unlink all superpages, so no need to check validity here. + superpage.prev.next = nil + heap_debug_cover(.Superpage_Unlinked_Tail) + return + } + if superpage.prev != nil { + superpage.prev.next = superpage.next + } + if superpage.next != nil { + superpage.next.prev = superpage.prev + } + heap_debug_cover(.Superpage_Unlinked_Non_Tail) +} + +/* +Mark a superpage from another thread as having had a bin remotely freed. +*/ +heap_remote_cache_add_remote_free_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + // Already set. 
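+		// Another thread (or an earlier free from this one) has already queued
+		// this superpage for its owner, and the owner scans every slab of a
+		// queued superpage when it merges, so our freshly flipped bit will be
+		// picked up without adding a duplicate entry.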
+ heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Set) + return + } + // This uses `Consume` ordering, as we have a data dependency on the data + // pointed to the variable and can get away with a weaker order. + master := intrinsics.atomic_load_explicit(&superpage.master_cache_block, .Consume) + if master == nil { + // This superpage is not owned by anyone. + // Its remote frees will be acknowledged in whole when it's adopted. + heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Masterless) + return + } + defer heap_debug_cover(.Superpage_Added_Remote_Free) + + cache := master + for { + for i := 0; i < len(cache.superpages_with_remote_frees); i += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache.superpages_with_remote_frees[i], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "A remote thread found a duplicate of a superpage in a heap's superpages_with_remote_frees.") + if swapped { + intrinsics.atomic_add_explicit(&master.remote_free_count, 1, .Release) + return + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Consume) + assert_contextless(next_cache_block != nil, "A remote thread failed to find free space for a new entry in another heap's superpages_with_remote_frees.") + cache = next_cache_block + } +} + +/* +Claim an additional superpage for the heap cache. +*/ +heap_cache_expand :: proc "contextless" (cache: ^Heap_Cache_Block) { + superpage := find_superpage_from_pointer(cache) + assert_contextless(superpage.next != nil, "The heap allocator tried to expand its cache but has run out of linked superpages.") + new_next_cache_block := &(superpage.next).cache_block + assert_contextless(new_next_cache_block.in_use == false, "The heap allocator tried to expand its cache, but the candidate superpage is already using its cache block.") + new_next_cache_block.in_use = true + intrinsics.atomic_store_explicit(&cache.next_cache_block, new_next_cache_block, .Release) + local_heap_cache.length += 1 + heap_debug_cover(.Heap_Expanded_Cache_Data) +} + +/* +Add a slab to the heap's cache, keyed to the bin size rank of `rank`. +*/ +heap_cache_add_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + assert_contextless(slab != nil) + cache := local_heap_cache + for { + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + assert_contextless(cache.slab_map[i] != slab, "The heap allocator found a duplicate entry in its slab map.") + if cache.slab_map[i] == nil { + cache.slab_map[i] = slab + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Get a slab from the heap's cache for an allocation of `rounded_size` bytes. + +If no slabs are already available for the bin rank, one will be created. +*/ +@(require_results) +heap_cache_get_slab :: proc "contextless" (rounded_size: int) -> (slab: ^Heap_Slab) { + rank := heap_bin_size_to_rank(rounded_size) + slab = local_heap_cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE] + if slab == nil { + superpage := local_heap_cache.superpages_with_free_slabs[0] + if superpage == nil { + superpage = heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + } + assert_contextless(superpage.free_slabs > 0) + slab = heap_slab_setup(superpage, rounded_size) + } + return +} + +/* +Remove a slab with the corresponding bin rank from the heap's cache. 
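+
+The entry is removed by swapping it with the last occupied bucket of the rank's
+stride, following `next_cache_block` links if the stride spills into a later
+cache block, so that each rank's run of buckets stays contiguous for the linear
+probing used by `heap_cache_add_slab` and `heap_cache_get_slab`.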
+*/ +heap_cache_remove_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + cache := local_heap_cache + assert_contextless(cache.in_use) + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank))) + for { + assert_contextless(cache != nil) + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + if cache.slab_map[i] == slab { + // Swap with the tail. + source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < start+HEAP_CACHE_SLAB_MAP_STRIDE; j += 1 { + if target_cache.slab_map[j] == nil { + cache.slab_map[source_i] = target_cache.slab_map[j-1] + target_cache.slab_map[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the stride and we have + // run out of space to search. + cache.slab_map[source_i] = nil + assert_contextless(source_i % (HEAP_CACHE_SLAB_MAP_STRIDE-1) == 0, "The heap allocator tried to remove a non-terminal slab map entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.slab_map[start] == nil { + // The starting bucket in the next cache block is empty, + // so we terminate on the current stride's final bucket. + cache.slab_map[source_i] = target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] + target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = start + } + } + } + // Entry must be in the expanded cache blocks. + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Make an allocation using contiguous slabs as the backing. + +This procedure will check through the heap's cache for viable superpages to +support the allocation. +*/ +@(require_results) +heap_cache_get_contiguous_slabs :: proc "contextless" (size: int) -> (ptr: rawptr) { + contiguous := heap_slabs_needed_for_size(size) + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + // No superpages with free slabs left. + heap_debug_cover(.Alloc_Slab_Wide_Needed_New_Superpage) + for { + superpage := heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } else { + heap_debug_cover(.Alloc_Slab_Wide_Used_Available_Superpage) + superpage := cache.superpages_with_free_slabs[i] + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Add a superpage with free slabs to the heap's cache. 
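+
+The superpage is written into the first empty slot of
+`superpages_with_free_slabs`, walking the chain of cache blocks as needed; the
+cache is sized so that a free slot always exists.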
+*/ +heap_cache_add_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id(), "The heap allocator tried to cache a superpage that does not belong to it.") + cache := local_heap_cache + + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + cache.superpages_with_free_slabs[i] = superpage + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Remove a superpage from the heap's cache for superpages with free slabs. +*/ +heap_cache_remove_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == superpage { + // Swap with the tail. + source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < len(cache.superpages_with_free_slabs); j += 1 { + if target_cache.superpages_with_free_slabs[j] == nil { + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[j-1] + target_cache.superpages_with_free_slabs[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the list and we have run + // out of space to search. + cache.superpages_with_free_slabs[source_i] = nil + assert_contextless(source_i == len(cache.superpages_with_free_slabs), "The heap allocator tried to remove a non-terminal superpage with free slabs entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.superpages_with_free_slabs[0] == nil { + // The next list section is empty, so we have to end here. + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] + target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = 0 + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +// +// Superpage Orphanage +// + +// This construction is used to avoid the ABA problem. +// +// Virtually all systems we support should have a 64-bit CAS to make this work. +Tagged_Pointer :: bit_field u64 { + // Intel 5-level paging uses up to 56 bits of a pointer on x86-64. + // This should be enough to cover ARM64, too. + // + // We use an `i64` here to maintain sign extension on the upper bits for + // systems where this is relevant, hence the extra bit over 56. + pointer: i64 | 57, + // We only need so many bits that enough transactions don't happen so fast + // as to roll over the value to cause a situation where ABA can manifest. + version: u8 | 7, +} + +/* +Put a Superpage into the global orphanage. + +The caller is responsible for fetch-adding the count. +*/ +heap_push_orphan :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != 0, "The heap allocator tried to push an unowned superpage to the orphanage.") + + // The algorithm below is one of the well-known methods of resolving the + // ABA problem, known as a tagged pointer. The gist is that if another + // thread has changed the value, we'll be able to detect that by checking + // against the version bits. 
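+	//
+	// For example: thread A loads the head (superpage S, version v) and is
+	// preempted; thread B pops S, pushes some other superpage, then pushes S
+	// again. A plain pointer compare-exchange by A would now succeed against a
+	// list whose links no longer match what A read, but because every
+	// successful push or pop bumps the 7-bit version, A's compare-exchange
+	// sees (S, v+n), fails, and reloads.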
+ old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Seq_Cst) + // NOTE: This next instruction must not float above the previous one, as + // this superpage could host the `master_cache_block`. The order is important + // to keep other threads from trying to access it while we're clearing it. + if superpage.cache_block.in_use { + intrinsics.mem_zero_volatile(&superpage.cache_block, size_of(Heap_Cache_Block)) + heap_debug_cover(.Superpage_Cache_Block_Cleared) + } + superpage.prev = nil + intrinsics.atomic_store_explicit(&superpage.owner, 0, .Release) + for { + // NOTE: `next` is accessed atomically when pushing or popping from the + // orphanage, because this field must synchronize with other threads at + // this point. + // + // This has to do mainly with swinging the head's linking pointer. + // + // Beyond this point, the thread which owns the superpage will be the + // only one to read `next`, hence why it is not read atomically + // anywhere else. + intrinsics.atomic_store_explicit(&superpage.next, cast(^Heap_Superpage)uintptr(old_head.pointer), .Release) + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(superpage)) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } +} + +/* +Remove and return the first entry from the Superpage orphanage, which may be nil. +*/ +@(require_results) +heap_pop_orphan :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + for { + superpage = cast(^Heap_Superpage)uintptr(old_head.pointer) + if superpage == nil { + return + } + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(intrinsics.atomic_load_explicit(&superpage.next, .Acquire))) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + intrinsics.atomic_store_explicit(&superpage.next, nil, .Release) + intrinsics.atomic_store_explicit(&superpage.owner, get_current_thread_id(), .Release) + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Release) + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } + return +} + +// +// Globals +// + +@(init, private) +setup_superpage_orphanage :: proc "contextless" () { + when !VIRTUAL_MEMORY_SUPPORTED { + return + } + + // Upon a thread's clean exit, this procedure will compact its heap and + // distribute the superpages into the orphanage, if it has space. + add_thread_local_cleaner(proc "odin" () { + for superpage := local_heap; superpage != nil; /**/ { + next_superpage := superpage.next + // The following logic is a specialized case of the same found in + // `compact_heap` that ignores the cache since there's no need to + // update it when the thread's heap is being broken down. + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. 
+ i += heap_slabs_needed_for_size(slab.bin_size) + } else if slab.bin_size > 0 { + i += 1 + } else { + i += 1 + continue + } + + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + heap_debug_cover(.Orphaned_Superpage_Merged_Remote_Frees) + } + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + heap_debug_cover(.Orphaned_Superpage_Freed_Slab) + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT { + if intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Acq_Rel) >= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_By_Exiting_Thread) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + } else { + intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Release) + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + + superpage = next_superpage + } + }) +} + +// This is a lock-free, intrusively singly-linked list of Superpages that are +// not in any thread's heap. They are free for adoption by other threads +// needing memory. +heap_orphanage: Tagged_Pointer + +// This is an _estimate_ of the number of Superpages in the orphanage. It can +// never be entirely accurate due to the nature of the design. +heap_orphanage_count: int + +@(thread_local) local_heap: ^Heap_Superpage +@(thread_local) local_heap_tail: ^Heap_Superpage +@(thread_local) local_heap_cache: ^Heap_Cache_Block + +// +// API +// + +/* +Allocate an arbitrary amount of memory from the heap and optionally zero it. +*/ +@(require_results) +heap_alloc :: proc "contextless" (size: int, zero: bool = true) -> (ptr: rawptr) { + assert_contextless(size >= 0, "The heap allocator was given a negative size.") + + // Handle Huge allocations. + if size >= HEAP_HUGE_ALLOCATION_THRESHOLD { + heap_debug_cover(.Alloc_Huge) + return heap_make_huge_allocation(size) + } + + // Initialize the heap if needed. + if local_heap == nil { + local_heap = heap_get_superpage() + local_heap_tail = local_heap + local_heap_cache = &local_heap.cache_block + local_heap_cache.in_use = true + heap_cache_register_superpage(local_heap) + heap_debug_cover(.Alloc_Heap_Initialized) + } + + // Take care of any remote frees. + remote_free_count := intrinsics.atomic_load_explicit(&local_heap_cache.remote_free_count, .Acquire) + if remote_free_count > 0 { + cache := local_heap_cache + removed := 0 + counter := remote_free_count + // Go through all superpages in the cache for superpages with remote + // frees to see if any still have frees needing merged. + merge_loop: for { + consume_loop: for i in 0.. 0 && slab.free_bins == 0 && intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + bins_left := heap_merge_remote_frees(slab) + if bins_left != 0 { + // Here we've detected that there are still remote + // frees left, so the entry is left in the cache + // for the next round of merges. 
+ // + // NOTE: If we were to loop back and try to + // re-merge the unmerged bins, we could end up in a + // situation where this thread is stalled under + // heavy load. + should_remove = false + } + if slab.free_bins == 0 { + // No bins were freed at all, which is possible due + // to the parallel nature of this code. The freeing + // thread could have signalled its intent, but we + // merged before it had a chance to flip the + // necessary bit. + break merge_block + } + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + + if bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab.free_bins == slab.max_bins { + heap_free_slab(superpage, slab) + } else { + heap_cache_add_slab(slab, heap_bin_size_to_rank(bin_size)) + } + } + } + + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + j += heap_slabs_needed_for_size(bin_size) + } else { + j += 1 + } + } + + if should_remove { + intrinsics.atomic_store_explicit(&cache.superpages_with_remote_frees[i], nil, .Release) + intrinsics.atomic_store_explicit(&superpage.remote_free_set, false, .Release) + removed += 1 + } + + counter -= 1 + if counter == 0 { + break merge_loop + } + } + if cache.next_cache_block == nil { + break merge_loop + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } + intrinsics.atomic_sub_explicit(&local_heap_cache.remote_free_count, removed, .Release) + heap_debug_cover(.Alloc_Collected_Remote_Frees) + } + + // Handle slab-wide allocations. + if size > HEAP_MAX_BIN_SIZE { + heap_debug_cover(.Alloc_Slab_Wide) + return heap_cache_get_contiguous_slabs(size) + } + + // Get a suitable slab from the heap. + rounded_size := heap_round_to_bin_size(size) + slab := heap_cache_get_slab(rounded_size) + assert_contextless(slab.bin_size == rounded_size, "The heap allocator found a slab with the wrong bin size during allocation.") + + // Allocate a bin inside the slab. + sector := slab.next_free_sector + sector_bits := slab.local_free[sector] + assert_contextless(sector_bits != 0, "The heap allocator found a slab with a full next_free_sector.") + + // Select the lowest free bit. + index := uintptr(intrinsics.count_trailing_zeros(sector_bits)) + + // Convert the index to a pointer. + ptr = rawptr(slab.data + (uintptr(sector * SECTOR_BITS) + index) * uintptr(rounded_size)) + when !ODIN_DISABLE_ASSERT { + base_alignment := min(HEAP_MAX_ALIGNMENT, uintptr(rounded_size)) + assert_contextless(uintptr(ptr) & uintptr(base_alignment-1) == 0, "A pointer allocated by the heap is not well-aligned.") + } + + // Clear the free bit. + slab.local_free[sector] &~= (1 << index) + + // Zero the memory, if needed. + if zero && index < uintptr(slab.dirty_bins) { + // Ensure that the memory zeroing is not optimized out by the compiler. + intrinsics.mem_zero_volatile(ptr, rounded_size) + // NOTE: A full memory fence should not be needed for any newly-zeroed + // allocation, as each thread controls its own heap, and for one thread + // to pass a memory address to another implies some secondary + // synchronization method, such as a mutex, which would be the way by + // which the threads come to agree on the state of main memory. + heap_debug_cover(.Alloc_Zeroed_Memory) + } + + // Update statistics. + slab.dirty_bins = int(max(slab.dirty_bins, 1 + sector * SECTOR_BITS + int(index))) + slab.free_bins -= 1 + + // Remove the slab from the cache if the slab's full. + // Otherwise, update the next free sector if the sector's full. 
+ if slab.free_bins == 0 { + slab.next_free_sector = slab.sectors + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Seq_Cst) > 0 { + heap_merge_remote_frees(slab) + if slab.free_bins == 0 { + // We have come before the other thread, and the bit we needed to find was not set. + // Treat it as if it is full anyway. + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } else { + // No cache adjustment happens here; we know it's already in the cache. + } + } else { + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } + } else { + if slab.local_free[sector] == 0 { + sector += 1 + for /**/; sector < slab.sectors; sector += 1 { + if slab.local_free[sector] != 0 { + break + } + } + slab.next_free_sector = sector + } + } + heap_debug_cover(.Alloc_Bin) + return +} + +/* +Free memory returned by `heap_alloc`. +*/ +heap_free :: proc "contextless" (ptr: rawptr) { + // Check for nil. + if ptr == nil { + when HEAP_PANIC_ON_FREE_NIL { + panic_contextless("The heap allocator was given a nil pointer to free.") + } else { + return + } + } + + superpage := find_superpage_from_pointer(ptr) + + // Check if this is a huge allocation. + if superpage.huge_size > 0 { + // NOTE: If the allocator is passed a pointer that it does not own, + // which is a scenario that it cannot possibly detect (in any + // reasonably performant fashion), then the condition above may result + // in a segmentation violation. + // + // Regardless, the result of passing a pointer to this heap allocator + // which it did not return is undefined behavior. + free_virtual_memory(superpage, superpage.huge_size) + heap_debug_cover(.Freed_Huge_Allocation) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(ptr) + assert_contextless(slab.bin_size > 0, "The heap allocator tried to free a pointer belonging to an empty slab.") + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + heap_free_wide_slab(superpage, slab) + heap_debug_cover(.Freed_Wide_Slab) + } else { + // Atomically let the owner know there's a free slab. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + old := intrinsics.atomic_or_explicit(&slab.remote_free[0], 1, .Seq_Cst) + heap_remote_cache_add_remote_free_superpage(superpage) + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old == 1 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Wide_Slab) + } + return + } + + // Find which sector and bin this pointer refers to. + bin_number := int(uintptr(ptr) - slab.data) / slab.bin_size + sector := bin_number / INTEGER_BITS + index := uint(bin_number) % INTEGER_BITS + + assert_contextless(bin_number < slab.max_bins, "Calculated an incorrect bin number for slab.") + assert_contextless(sector < slab.sectors, "Calculated an incorrect sector for slab.") + + // See if we own the slab or not, then free the pointer. + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + when HEAP_PANIC_ON_DOUBLE_FREE { + if slab.local_free[sector] & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + // Mark the bin as free. 
+ slab.local_free[sector] |= 1 << index + if slab.free_bins == 0 { + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + // Put this slab back in the available list. + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_debug_cover(.Freed_Bin_Reopened_Full_Slab) + } + slab.free_bins += 1 + assert_contextless(slab.free_bins <= slab.max_bins, "A slab of the heap allocator overflowed its free bins.") + // Free the slab if it's empty and was at one point in time completely + // full. This is a heuristic to prevent a single repeated new/free + // operation in an otherwise empty slab from bogging down the + // allocator. + if slab.free_bins == slab.max_bins && slab.dirty_bins == slab.max_bins { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_free_slab(superpage, slab) + heap_debug_cover(.Freed_Bin_Freed_Slab_Which_Was_Fully_Used) + } else { + slab.next_free_sector = min(slab.next_free_sector, sector) + heap_debug_cover(.Freed_Bin_Updated_Slab_Next_Free_Sector) + } + // Free the entire superpage if it's empty. + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + heap_debug_cover(.Freed_Bin_Freed_Superpage) + } + } else { + // Atomically let the owner know there's a free bin. + + // NOTE: The order of operations here is important. + // + // 1. We must first check if the slab is full. If we wait until later, + // we risk causing a race. + is_full := intrinsics.atomic_load_explicit(&slab.is_full, .Acquire) + + // 2. We have to let the owner know that we intend to schedule a remote + // free. This way, it can keep the cached entry around if it isn't + // able to merge all of them due to timing differences on our part. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + + // (Technically, the compiler or processor is allowed to re-order the + // above two operations, and this is okay. However, the next one acts + // as a full barrier due to its Sequential Consistency ordering.) + + // 3. Finally, we flip the bit of the bin we want freed. This must be + // the final operation across all cores, because the owner could + // already be in the process of merging remote frees, and if our bin + // was the last one, it has the authorization to wipe the slab and + // possibly reallocate in the same block of memory. + // + // By deferring this to the end, we avoid any possibility of a race, + // even if multiple threads are in this section. + old := intrinsics.atomic_or_explicit(&slab.remote_free[sector], 1 << index, .Seq_Cst) + + // 4. Now we can safely check if the slab is full and add it to the + // owner's cache if needed. + if is_full { + heap_remote_cache_add_remote_free_superpage(superpage) + heap_debug_cover(.Remotely_Freed_Bin_Caused_Remote_Superpage_Caching) + } + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Bin) + + // NOTE: It is possible for two threads to be here at the same time, + // thus violating any sense of order among the actual number of bins + // free and the reported number. + // + // T1 & T2 could flip bits, + // T2 could increment the free count, then + // T3 could merge the free bins. + // + // This scenario would result in an inconsistent count no matter which + // way the above procedure is carried out, hence its unreliability. + } +} + +/* +Resize memory returned by `heap_alloc`. 
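+
+Resizing within the same size category avoids copying where possible: Huge
+allocations are resized through the virtual memory subsystem, wide slabs owned
+by the calling thread grow or shrink in place when contiguous slabs allow it,
+and bins keep their pointer as long as the rounded bin size does not change.
+Any other case, including a change of size category or a wide-slab resize on
+memory owned by another thread's heap, falls back to allocate, copy, and free.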
+*/ +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero: bool = true) -> (new_ptr: rawptr) { + Size_Category :: enum { + Unknown, + Bin, + Slab, + Huge, + } + + // We need to first determine if we're crossing size categories. + old_category: Size_Category + switch { + case old_size <= HEAP_MAX_BIN_SIZE: old_category = .Bin + case old_size < HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Slab + case old_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Huge + case: unreachable() + } + new_category: Size_Category + switch { + case new_size <= HEAP_MAX_BIN_SIZE: new_category = .Bin + case new_size < HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Slab + case new_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Huge + case: unreachable() + } + assert_contextless(old_category != .Unknown) + assert_contextless(new_category != .Unknown) + + if new_category != old_category { + // A change in size category cannot be optimized. + new_ptr = heap_alloc(new_size, zero) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Crossed_Size_Categories) + return + } + + // NOTE: Superpage owners are the only ones that can change the amount of + // memory that backs each pointer. Other threads have to allocate and copy. + + // Check if this is a huge allocation. + superpage := find_superpage_from_pointer(old_ptr) + if superpage.huge_size > 0 { + // This block follows the preamble in `heap_make_huge_allocation`. + new_real_size := new_size + if new_size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + new_real_size = SUPERPAGE_SIZE + heap_debug_cover(.Resize_Huge_Size_Set_To_Superpage) + } else { + new_real_size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Resize_Huge_Size_Adjusted) + } + resized_superpage := cast(^Heap_Superpage)resize_virtual_memory(superpage, superpage.huge_size, new_real_size, SUPERPAGE_SIZE) + assert_contextless(uintptr(resized_superpage) & (SUPERPAGE_SIZE-1) == 0, "After resizing a huge allocation, the pointer was no longer aligned to a superpage boundary.") + resized_superpage.huge_size = new_real_size + u := uintptr(resized_superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + new_ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + if zero && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(new_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Huge_Caused_Memory_Zeroing) + } + + heap_debug_cover(.Resize_Huge) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(old_ptr) + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + contiguous_old := heap_slabs_needed_for_size(slab.bin_size) + contiguous_new := heap_slabs_needed_for_size(new_size) + if contiguous_new == contiguous_old { + // We already have enough slabs to serve the request. + if zero && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Wide_Slab_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Wide_Slab_Kept_Old_Pointer) + return old_ptr + } + + if slab.index + contiguous_new >= HEAP_SLAB_COUNT { + // Expanding this slab would go beyond the Superpage. + // We need more memory. 
+ new_ptr = heap_alloc(new_size, zero) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return + } + + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != get_current_thread_id() { + // We are not the owner of this data, therefore none of the special + // optimized paths for wide slabs are available to us, as they all + // involve touching the Superpage. + // + // We must re-allocate. + new_ptr = heap_alloc(new_size, zero) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_From_Remote_Thread) + return + } + + // Can we shrink the wide slab, or can we expand it in-place? + if contiguous_new < contiguous_old { + previously_full_superpage := superpage.free_slabs == 0 + for i := slab.index + contiguous_new; i < slab.index + contiguous_old; i += 1 { + // Mark the latter slabs as unused. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = i + next_slab.is_dirty = true + next_slab.bin_size = 0 + } + superpage.next_free_slab_index = min(superpage.next_free_slab_index, slab.index + contiguous_new) + superpage.free_slabs += contiguous_old - contiguous_new + if previously_full_superpage { + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab) + } + heap_debug_cover(.Resize_Wide_Slab_Shrunk_In_Place) + } else { + // NOTE: We've already guarded against going beyond `HEAP_SLAB_COUNT` in the section above. + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + if heap_superpage_index_slab(superpage, i).bin_size != 0 { + // Contiguous space is unavailable. + new_ptr = heap_alloc(new_size, zero) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion) + return + } + } + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + // Wipe the index bits, and if needed, the rest of the data. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + superpage.free_slabs += contiguous_old - contiguous_new + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } else { + heap_update_next_free_slab_index(superpage, 0) + } + heap_debug_cover(.Resize_Wide_Slab_Expanded_In_Place) + } + + // The slab-wide allocation has been resized in-place. + slab.bin_size = new_size + + return old_ptr + } + + // See if a bin rank change is needed. + new_rounded_size := heap_round_to_bin_size(new_size) + if slab.bin_size == new_rounded_size { + if zero && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + // It could be argued that a full memory fence is necessary here, + // because one thread may resize an address known to other threads, + // but as is the case with zeroing during allocation, we treat this + // as if the state change is not independent of the allocator. + // + // That is to say, if one thread resizes an address in-place, it's + // expected that other threads will need to be notified of this by + // the program, as with any other synchronization. 
+ heap_debug_cover(.Resize_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Kept_Old_Pointer) + return old_ptr + } + + // Allocate and copy, as a last resort. + new_ptr = heap_alloc(new_size, zero) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return +} diff --git a/base/runtime/heap_allocator_info.odin b/base/runtime/heap_allocator_info.odin new file mode 100644 index 00000000000..3c69861dec6 --- /dev/null +++ b/base/runtime/heap_allocator_info.odin @@ -0,0 +1,156 @@ +package runtime + +import "base:intrinsics" + +/* +Heap_Info provides metrics on a single thread's heap memory usage. +*/ +Heap_Info :: struct { + total_memory_allocated_from_system: int `fmt:"M"`, + total_memory_used_for_book_keeping: int `fmt:"M"`, + total_memory_in_use: int `fmt:"M"`, + total_memory_free: int `fmt:"M"`, + total_memory_dirty: int `fmt:"M"`, + total_memory_remotely_free: int `fmt:"M"`, + + total_superpages: int, + total_superpages_dedicated_to_heap_cache: int, + total_huge_allocations: int, + total_slabs: int, + total_slabs_in_use: int, + + total_dirty_bins: int, + total_free_bins: int, + total_bins_in_use: int, + total_remote_free_bins: int, + + heap_slab_map_entries: int, + heap_superpages_with_free_slabs: int, + heap_slabs_with_remote_frees: int, +} + +/* +Get information about the current thread's heap. + +This will do additional sanity checking on the heap if assertions are enabled. +*/ +@(require_results) +get_local_heap_info :: proc "contextless" () -> (info: Heap_Info) { + if local_heap_cache != nil { + cache := local_heap_cache + slab_map_terminated: [HEAP_BIN_RANKS]bool + + for { + for rank := 0; rank < HEAP_BIN_RANKS; rank += 1 { + for i := 0; i < HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + slab := cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE + i] + if slab_map_terminated[rank] { + assert_contextless(slab == nil, "The heap allocator has a gap in its slab map.") + } else if slab == nil { + slab_map_terminated[rank] = true + } else { + info.heap_slab_map_entries += 1 + assert_contextless(slab.bin_size != 0, "The heap allocator has an empty slab in its slab map.") + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank)), "The heap allocator has a slab in the wrong sub-array of the slab map.") + } + } + } + for superpage in cache.superpages_with_free_slabs { + if superpage != nil { + info.heap_superpages_with_free_slabs += 1 + } + } + for i in 0.. 
0 { + info.total_huge_allocations += 1 + info.total_memory_allocated_from_system += superpage.huge_size + info.total_memory_in_use += superpage.huge_size - HEAP_HUGE_ALLOCATION_BOOK_KEEPING + info.total_memory_used_for_book_keeping += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + } else { + if superpage.cache_block.in_use { + info.total_superpages_dedicated_to_heap_cache += 1 + } + info.total_memory_allocated_from_system += SUPERPAGE_SIZE + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size != 0 { + info.total_slabs_in_use += 1 + info.total_memory_in_use += slab.bin_size * (slab.max_bins - slab.free_bins) + info.total_memory_free += slab.bin_size * slab.free_bins + info.total_memory_dirty += slab.bin_size * slab.dirty_bins + info.total_bins_in_use += slab.max_bins - slab.free_bins + info.total_free_bins += slab.free_bins + info.total_dirty_bins += slab.dirty_bins + assert_contextless(slab.dirty_bins >= slab.max_bins - slab.free_bins, "A slab of the heap allocator has a number of dirty bins which is not equivalent to the number of its total bins minus the number of free bins.") + // Account for the bitmaps used by the Slab. + info.total_memory_used_for_book_keeping += int(slab.data - uintptr(slab)) + // Account for the space not used by the bins or the bitmaps. + n := int(slab.data - uintptr(slab) + uintptr(slab.max_bins * slab.bin_size)) + if slab.bin_size > HEAP_MAX_BIN_SIZE { + info.total_memory_used_for_book_keeping += heap_slabs_needed_for_size(slab.bin_size) * HEAP_SLAB_SIZE - n + } else { + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE - n + } + remote_free_bins := 0 + for j in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + // Every superpage has to sacrifice one Slab's worth of space so + // that they're all aligned. + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE + info.total_slabs += HEAP_SLAB_COUNT + } + superpage = superpage.next + } + + assert_contextless(info.total_memory_allocated_from_system == info.total_memory_used_for_book_keeping + info.total_memory_in_use + info.total_memory_free, "The heap allocator's metrics for total memory in use, free, and used for book-keeping do not add up to the total memory allocated from the operating system.") + + if local_heap_cache != nil { + assert_contextless(info.total_superpages == local_heap_cache.owned_superpages) + } + return +} diff --git a/core/os/os2/env_linux.odin b/core/os/os2/env_linux.odin index e0ef0601058..132a5464e75 100644 --- a/core/os/os2/env_linux.odin +++ b/core/os/os2/env_linux.odin @@ -76,7 +76,7 @@ _set_env :: proc(key, v_new: string) -> bool { // wasn't in the environment in the first place. k_addr, v_addr := _kv_addr_from_val(v_curr, key) if len(v_new) > len(v_curr) { - k_addr = ([^]u8)(runtime.heap_resize(k_addr, kv_size)) + k_addr = ([^]u8)(runtime.heap_resize(k_addr, len(v_curr), kv_size)) if k_addr == nil { return false } diff --git a/tests/heap_allocator/libc/heap_allocator.odin b/tests/heap_allocator/libc/heap_allocator.odin new file mode 100644 index 00000000000..8710c59f0f3 --- /dev/null +++ b/tests/heap_allocator/libc/heap_allocator.odin @@ -0,0 +1,127 @@ +package tests_heap_allocator_libc + +import "base:intrinsics" +import "base:runtime" +import "core:mem" + +// This package contains the old libc malloc-based allocator, for comparison. 
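The `core:os/os2/env_linux.odin` hunk above shows the knock-on effect of the new `runtime.heap_resize` signature: callers must now pass the previous allocation size along with the new one. A minimal usage sketch of the updated API (the sizes here are arbitrary placeholders, not values taken from the patch):

	ptr := runtime.heap_alloc(64)             // zeroed by default
	ptr  = runtime.heap_resize(ptr, 64, 128)  // the old size is now required; the grown tail is zeroed
	runtime.heap_free(ptr)
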
+ +Allocator :: runtime.Allocator +Allocator_Mode :: runtime.Allocator_Mode +Allocator_Mode_Set :: runtime.Allocator_Mode_Set +Allocator_Error :: runtime.Allocator_Error + +libc_allocator :: proc() -> Allocator { + return Allocator{ + procedure = libc_allocator_proc, + data = nil, + } +} + +libc_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + // + // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. + // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert + // padding. We also store the original pointer returned by heap_alloc right before + // the pointer we return to the user. + // + + aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { + // Not(flysand): We need to reserve enough space for alignment, which + // includes the user data itself, the space to store the pointer to + // allocation start, as well as the padding required to align both + // the user data and the pointer. + a := max(alignment, align_of(rawptr)) + space := a-1 + size_of(rawptr) + size + allocated_mem: rawptr + + force_copy := old_ptr != nil && alignment > align_of(rawptr) + + if old_ptr != nil && !force_copy { + original_old_ptr := ([^]rawptr)(old_ptr)[-1] + allocated_mem = heap_resize(original_old_ptr, space) + } else { + allocated_mem = heap_alloc(space, zero_memory) + } + aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) + + ptr := uintptr(aligned_mem) + aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) + if allocated_mem == nil { + aligned_free(old_ptr) + aligned_free(allocated_mem) + return nil, .Out_Of_Memory + } + + aligned_mem = rawptr(aligned_ptr) + ([^]rawptr)(aligned_mem)[-1] = allocated_mem + + if force_copy { + runtime.mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) + aligned_free(old_ptr) + } + + return mem.byte_slice(aligned_mem, size), nil + } + + aligned_free :: proc(p: rawptr) { + if p != nil { + heap_free(([^]rawptr)(p)[-1]) + } + } + + aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { + if p == nil { + return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + } + + new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return + + // NOTE: heap_resize does not zero the new memory, so we do it + if zero_memory && new_size > old_size { + new_region := raw_data(new_memory[old_size:]) + intrinsics.mem_zero(new_region, new_size - old_size) + } + return + } + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) + + case .Free: + aligned_free(old_memory) + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} + +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + return _heap_alloc(size, zero_memory) +} + +heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { + return 
_heap_resize(ptr, new_size) +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _heap_free(ptr) +} diff --git a/base/runtime/heap_allocator_orca.odin b/tests/heap_allocator/libc/heap_allocator_orca.odin similarity index 95% rename from base/runtime/heap_allocator_orca.odin rename to tests/heap_allocator/libc/heap_allocator_orca.odin index be032a20720..8fd13253db6 100644 --- a/base/runtime/heap_allocator_orca.odin +++ b/tests/heap_allocator/libc/heap_allocator_orca.odin @@ -1,6 +1,6 @@ #+build orca #+private -package runtime +package tests_heap_allocator_libc foreign { @(link_name="malloc") _orca_malloc :: proc "c" (size: int) -> rawptr --- diff --git a/base/runtime/heap_allocator_other.odin b/tests/heap_allocator/libc/heap_allocator_other.odin similarity index 94% rename from base/runtime/heap_allocator_other.odin rename to tests/heap_allocator/libc/heap_allocator_other.odin index 507dbf3181e..5118aa61221 100644 --- a/base/runtime/heap_allocator_other.odin +++ b/tests/heap_allocator/libc/heap_allocator_other.odin @@ -1,6 +1,6 @@ #+build js, wasi, freestanding, essence #+private -package runtime +package tests_heap_allocator_libc _heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { context = default_context() diff --git a/base/runtime/heap_allocator_unix.odin b/tests/heap_allocator/libc/heap_allocator_unix.odin similarity index 96% rename from base/runtime/heap_allocator_unix.odin rename to tests/heap_allocator/libc/heap_allocator_unix.odin index d4590d2dde8..49fb820bdc0 100644 --- a/base/runtime/heap_allocator_unix.odin +++ b/tests/heap_allocator/libc/heap_allocator_unix.odin @@ -1,6 +1,6 @@ #+build linux, darwin, freebsd, openbsd, netbsd, haiku #+private -package runtime +package tests_heap_allocator_libc when ODIN_OS == .Darwin { foreign import libc "system:System.framework" diff --git a/base/runtime/heap_allocator_windows.odin b/tests/heap_allocator/libc/heap_allocator_windows.odin similarity index 97% rename from base/runtime/heap_allocator_windows.odin rename to tests/heap_allocator/libc/heap_allocator_windows.odin index e07df755924..d5635d612d6 100644 --- a/base/runtime/heap_allocator_windows.odin +++ b/tests/heap_allocator/libc/heap_allocator_windows.odin @@ -1,4 +1,4 @@ -package runtime +package tests_heap_allocator_libc foreign import kernel32 "system:Kernel32.lib" diff --git a/tests/heap_allocator/test_bench.odin b/tests/heap_allocator/test_bench.odin new file mode 100644 index 00000000000..d4beb5b1f96 --- /dev/null +++ b/tests/heap_allocator/test_bench.odin @@ -0,0 +1,1493 @@ +// In order to test the heap allocator in a deterministic manner, we must run +// this program without the help of the test runner, because the runner itself +// makes use of heap allocation. +package tests_heap_allocator + +import "base:intrinsics" +import "base:runtime" +import "core:flags" +import "core:fmt" +import "core:log" +import "core:math/rand" +import "core:os" +import "core:sync" +import "core:thread" +import "core:time" +import "core:mem" + +import libc_allocator "libc" + +ODIN_DEBUG_HEAP :: runtime.ODIN_DEBUG_HEAP + +INTEGER_BITS :: 8 * size_of(int) +SECTOR_BITS :: 8 * size_of(uint) + +// The tests are specific to feoramalloc, but the benchmarks are general-purpose. 
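Because the suite deliberately bypasses the test runner, it is driven directly through `odin run`. For reference, the CI jobs added later in this series invoke it along these lines (one representative invocation; the exact flags vary slightly per platform):

	odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests
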
+ +// +// Utility +// + +expect :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) { + if !condition { + @(cold) + internal :: proc "contextless" (message: string, loc: runtime.Source_Code_Location) { + runtime.print_string("\n* Expectation failed: ") + runtime.print_string(message) + runtime.print_string(" @ ") + runtime.print_caller_location(loc) + runtime.print_string("\n\n") + when ODIN_DEBUG { + intrinsics.debug_trap() + } else { + intrinsics.trap() + } + } + internal(message, loc) + } +} + +verify_zeroed_slice :: proc(bytes: []byte, loc := #caller_location) { + for b in bytes { + expect(b == 0, loc = loc) + } +} + +verify_zeroed_ptr :: proc(ptr: [^]byte, size: int, loc := #caller_location) { + for i := 0; i < size; i += 1 { + expect(ptr[i] == 0, loc = loc) + } +} + +verify_zeroed :: proc { + verify_zeroed_slice, + verify_zeroed_ptr, +} + +verify_integrity_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + buf: [1]byte + rand.reset(seed) + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + expect(bytes[i+j] == buf[j], loc = loc) + } + } +} + +verify_integrity_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + verify_integrity_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +verify_integrity :: proc { + verify_integrity_slice, + verify_integrity_ptr, +} + + +randomize_bytes_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + rand.reset(seed) + buf: [1]byte + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + bytes[i+j] = buf[j] + } + } +} + +randomize_bytes_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + randomize_bytes_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +randomize_bytes :: proc { + randomize_bytes_slice, + randomize_bytes_ptr, +} + +dump_slabs :: proc() { + intrinsics.atomic_thread_fence(.Seq_Cst) + superpage := runtime.local_heap + for { + if superpage == nil { + return + } + log.infof("+++ Superpage at %p", superpage) + log.infof(" - Free slabs: %i", superpage.free_slabs) + log.infof(" - Next free slab index: %i", superpage.next_free_slab_index) + for i := 0; i < runtime.HEAP_SLAB_COUNT; /**/ { + slab := runtime.heap_superpage_index_slab(superpage, i) + if slab.bin_size == 0 { + // log.infof("Slab %i unused.", slab.index) + expect(i == slab.index) + } else { + log.infof("%#v", slab) + if slab.free_bins == 0 { + expect(intrinsics.atomic_load(&slab.is_full)) + } + } + if slab.bin_size > runtime.HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += runtime.heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + log.info("") + superpage = superpage.next + } +} + +validate_cache :: proc() { + cache := runtime.local_heap_cache + slab_map_terminated: [runtime.HEAP_BIN_RANKS]bool + superpages_with_free_slabs_terminated: bool + for { + // Validate the slab map. + for rank in 0.. 
0) + } + } + + next_cache := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Consume) + if next_cache == nil { + break + } + cache = next_cache + } +} + +// +// Allocation API Testing +// + +Size_Strategy :: enum { + Adding, + Multiplying, + Randomizing, +} + +Free_Strategy :: enum { + Never, + At_The_End, // free at end of allocs + Interleaved, // free X after Y allocs +} + +Free_Direction :: enum { + Forward, // like a queue + Backward, // like a stack + Randomly, +} + +test_alloc_write_free :: proc( + object_count: int, + + starting_size: int, + final_size: int, + + size_strategy: Size_Strategy, + size_operand: int, + + allocs_per_free_operation: int, + free_operations_at_once: int, + free_strategy: Free_Strategy, + free_direction: Free_Direction, +) { + Allocation :: struct { + data: []byte, + seed: u64, + } + pointers := make([]Allocation, object_count, context.temp_allocator) + + allocator := context.allocator + size := starting_size + start_index := 0 + end_index := 0 + + allocs := 0 + + log.infof("AWF: %i objects. Size: [%i..=%i] %v by %i each allocation. %i freed every %i, %v and %v.", object_count, starting_size, final_size, size_strategy, size_operand, free_operations_at_once, allocs_per_free_operation, free_strategy, free_direction) + + for o in 1..=u64(object_count) { + seed := u64(intrinsics.read_cycle_counter()) * o + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + expect(alloc_err == nil) + pointers[end_index] = Allocation{ + data = bytes, + seed = seed, + } + end_index += 1 + allocs += 1 + + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + if size < final_size { + switch size_strategy { + case .Adding: size += size_operand + case .Multiplying: size *= size_operand + case .Randomizing: size = starting_size + rand.int_max(final_size - starting_size) + } + if final_size > starting_size { + size = min(size, final_size) + } else { + size = max(size, final_size) + } + } + + if allocs % allocs_per_free_operation != 0 { + continue + } + + switch free_strategy { + case .Never, .At_The_End: + break + case .Interleaved: + for _ in 0.. 0 && size % (runtime.HEAP_MAX_BIN_SIZE/8) == 0 { + log.infof("... %i ...", size) + } + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + // Allocate and free twice to make sure that the memory is truly zeroed. + // + // This works on the assumption that the allocator will return the same + // pointer if we allocate, free, then allocate again with the same + // characteristics. + // + // libc malloc does not guarantee this behavior, but feoramalloc does + // in non-parallel scenarios. 
+ old_ptr: rawptr + for i in 0..<2 { + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + if i == 0 { + old_ptr = raw_data(bytes) + } else { + if old_ptr != raw_data(bytes) { + different_pointers += 1 + } + } + expect(alloc_err == nil) + verify_zeroed(bytes) + randomize_bytes(bytes, u64(intrinsics.read_cycle_counter())) + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(bytes), 0) + expect(free_err == nil) + } + } + if different_pointers > 0 { + log.warnf("There were %i cases in which the allocator didn't return the same pointer after allocating, freeing, then allocating again with the same size.", different_pointers) + } + log.info("Done.") +} + +test_single_alloc_and_resize :: proc(start, target: int) { + log.infof("Testing allocation of %i bytes, resizing to %i, then resizing back.", start, target) + allocator := context.allocator + base_seed := u64(intrinsics.read_cycle_counter()) + + alignment := min(start, runtime.HEAP_MAX_ALIGNMENT) + seed := base_seed * (1+u64(start)) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, start, alignment, nil, 0) + expect(alloc_err == nil) + expect(len(bytes) == start) + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + resized_bytes_1, resize_1_err := allocator.procedure(allocator.data, .Resize, target, alignment, raw_data(bytes), start) + expect(resize_1_err == nil) + expect(len(resized_bytes_1) == target) + verify_integrity(resized_bytes_1[:min(start, target)], seed) + if target > start { + verify_zeroed(resized_bytes_1[start:]) + } + + resized_bytes_2, resize_2_err := allocator.procedure(allocator.data, .Resize, start, alignment, raw_data(resized_bytes_1), target) + expect(resize_2_err == nil) + expect(len(resized_bytes_2) == start) + verify_integrity(resized_bytes_2[:min(start, target)], seed) + if start > target { + verify_zeroed(resized_bytes_2[target:]) + } + + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(resized_bytes_2), 0) + expect(free_err == nil) +} + +/* +This test helped find an issue with the orphanage. +*/ +test_parallel_pointer_passing :: proc(thread_count: int) { + Data :: struct { + thread: ^thread.Thread, + ptr: ^^int, + sema: sync.Sema, + friend: ^sync.Sema, + wg: ^sync.Wait_Group, + } + + task :: proc(t: ^thread.Thread) { + data := cast(^Data)t.data + sync.wait(&data.sema) + expect(data.ptr != nil) + expect(data.ptr^ != nil) + expect(data.ptr^^ != 0) + free(data.ptr^) + data.ptr^ = new(int) + expect(data.ptr^^ == 0) + data.ptr^^ = int(intrinsics.read_cycle_counter()) + if data.friend != nil { + sync.post(data.friend) + } + sync.wait_group_done(data.wg) + } + + data := new(int) + data^ = int(intrinsics.read_cycle_counter()) + tasks := make([]Data, thread_count, context.temp_allocator) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + for i in 0.. 0) + expect(slab.bin_size > 0) + expect(intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) == 0) + + list := make([]^int, slab.max_bins) + list[0] = o + + for i in 0..= intrinsics.atomic_load_explicit(data.all_pointers_len, .Acquire) { + // Spinlock. 
+ intrinsics.cpu_relax() + } + intrinsics.atomic_thread_fence(.Seq_Cst) + + ptr := data.all_pointers[ticket] + + expect(ptr != nil) + i_ptr := cast(^i64)ptr + expect(i_ptr^ == 0) + val := intrinsics.read_cycle_counter() + i_ptr^ = val + expect(i_ptr^ == val) + free(ptr) + } + + sync.wait_group_done(data.wg) + } + + start_time: time.Time + + barrier: sync.Barrier + sync.barrier_init(&barrier, thread_count) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + // non-atomic + all_pointers := make([]rawptr, thread_count*allocs_per_thread) + defer delete(all_pointers) + // atomic + all_pointers_len := 0 + all_pointers_ticket := 0 + + consumers := make([]Consumer_Data, thread_count) + defer delete(consumers) + + for i in 0.. Slab + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_SLAB_SIZE) + test_single_alloc_and_resize(runtime.HEAP_SLAB_SIZE, runtime.HEAP_MAX_BIN_SIZE) + + // Bin <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE) + + // Slab <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE + 1, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE + 1) + + // Inter-huge tests. + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 2, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1, runtime.SUPERPAGE_SIZE) + + // Larger-than-superpage tests. + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE, runtime.SUPERPAGE_SIZE * 3) + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE * 3, runtime.SUPERPAGE_SIZE) + + // Brute-force tests. 
+ test_individual_allocation_and_free(runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + test_continuous_allocation_of_size_n(16, runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + + test_alloc_write_free( + object_count = 400, + starting_size = 16, final_size = 16, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 4096, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 32768, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 8096, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .Interleaved, + free_direction = .Backward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 8096, final_size = 2, + size_strategy = .Adding, size_operand = -2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 65535, final_size = 65535, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 300000, + starting_size = 8, final_size = 8, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = runtime.HEAP_SLAB_COUNT*2, + starting_size = runtime.HEAP_SLAB_SIZE/2, final_size = runtime.HEAP_SLAB_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 2, + starting_size = runtime.SUPERPAGE_SIZE/2, final_size = runtime.SUPERPAGE_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + // This is a lengthy test and won't tell us much more than any other test will. 
+ if opt.long { + test_single_alloc_and_resize_incremental(0, runtime.HEAP_SLAB_SIZE) + } + + runtime.compact_heap() + + test_serial_bin_sanity() + + runtime.compact_heap() + } + + if opt.serial_benchmarks { + log.info("--- Single-threaded benchmarks ---") + + log.info("* Freeing forwards ...") + bench_alloc_n_then_free_n(1_000_000, int) + bench_alloc_n_then_free_n(1_000_000, Struct_16) + bench_alloc_n_then_free_n(1_000_000, Struct_32) + bench_alloc_n_then_free_n(1_000_000, Struct_64) + bench_alloc_n_then_free_n(1_000_000, Struct_512) + bench_alloc_n_then_free_n(10_000, [8192]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing backwards ...") + bench_alloc_n_then_free_n_backwards(1_000_000, int) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_16) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_32) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_64) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_512) + bench_alloc_n_then_free_n_backwards(10_000, [8192]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n_backwards(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing randomly ...") + bench_alloc_n_then_free_n_randomly(1_000_000, int) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_16) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_32) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_64) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_512) + bench_alloc_n_then_free_n_randomly(10_000, [8192]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE-runtime.HEAP_SLAB_ALLOCATION_BOOK_KEEPING]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE*2]u8) + bench_alloc_n_then_free_n_randomly(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Allocating and freeing repeatedly ...") + bench_alloc_1_then_free_1_repeatedly(100_000, int) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_16) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_32) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_64) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_512) + bench_alloc_1_then_free_1_repeatedly(10_000, [8192]u8) + } + + if opt.parallel_benchmarks { + log.info("--- Multi-threaded benchmarks ---") + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_16) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_32) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_64) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_512) + + bench_1_producer_n_consumer_for_m_alloc(1, 1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(2, 
1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(4, 1_000, [8192]u8) + + bench_1_producer_n_consumer_for_m_alloc(4, 10, [runtime.HEAP_SLAB_SIZE]u8) + + when .Thread not_in ODIN_SANITIZER_FLAGS { + // NOTE: TSan doesn't work well with excessive thread counts, + // in my experience. + bench_1_producer_n_consumer_for_m_alloc(32, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(64, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(128, 1_000, Struct_32) + } + } + + } + + log.info("Tests complete.") + + if opt.compact { + runtime.compact_heap() + log.info("The main thread's heap has been compacted.") + } + + if opt.coverage { + when runtime.ODIN_DEBUG_HEAP { + log.info("--- Code coverage report ---") + for key in runtime.Heap_Code_Coverage_Type { + value := intrinsics.atomic_load_explicit(&runtime.heap_global_code_coverage[key], .Acquire) + log.infof("%s%v: %v", ">>> " if value == 0 else "", key, value) + } + + log.infof("Full code coverage: %v", runtime._check_heap_code_coverage()) + } else { + log.error("ODIN_DEBUG_HEAP is not enabled, thus coverage cannot be calculated.") + } + } + + if opt.info { + heap_info := runtime.get_local_heap_info() + log.infof("%#v", heap_info) + } + + // if .Dump_Slabs in params { + // dump_slabs() + // } + + if opt.trap { + intrinsics.debug_trap() + } +} From 4f3c518a26065e23fe11c829383d0b84bbf46ce8 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:40:07 -0500 Subject: [PATCH 04/21] Add heap allocator tests to CI --- .github/workflows/ci.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc4240b0414..a32d7e68040 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_arm64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_amd64 -no-entry-point ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_arm64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -65,6 +66,7 @@ jobs: gmake -C vendor/miniaudio/src ./odin check examples/all -vet -strict-style -disallow-do -target:freebsd_amd64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:freebsd_amd64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false 
-define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -123,6 +125,8 @@ jobs: run: ./odin check examples/all -strict-style -vet -disallow-do - name: Odin check vendor/sdl3 run: ./odin check vendor/sdl3 -strict-style -vet -disallow-do -no-entry-point + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -sanitize:thread -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true - name: Optimized Core library tests @@ -211,6 +215,11 @@ jobs: run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat odin check vendor/sdl3 -vet -strict-style -disallow-do -no-entry-point + - name: Odin heap allocator tests + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat + odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Core library tests shell: cmd run: | @@ -305,6 +314,9 @@ jobs: - name: Odin run -debug run: ./odin run examples/demo -debug -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests + - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath From 3a24df1f27203243ddf61ad65a67b1320e414457 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 27 Jan 2025 18:48:56 -0500 Subject: [PATCH 05/21] Fix indentation --- base/runtime/heap_allocator.odin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 3fa8820a65c..2a6029284be 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -12,8 +12,8 @@ heap_allocator :: proc() -> Allocator { heap_allocator_proc :: proc( allocator_data: rawptr, mode: Allocator_Mode, - size, alignment: int, - old_memory: rawptr, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location, ) -> ([]byte, Allocator_Error) { From 3326d6c56c06f9be2110573b326eda13cc0e5b19 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 27 Jan 2025 19:05:58 -0500 Subject: [PATCH 06/21] Strengthen `Consume` to `Acquire` in heap allocator --- base/runtime/heap_allocator_implementation.odin | 8 +++----- tests/heap_allocator/test_bench.odin | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index ab7372e3c6a..04683aba794 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ 
b/base/runtime/heap_allocator_implementation.odin @@ -922,9 +922,7 @@ heap_remote_cache_add_remote_free_superpage :: proc "contextless" (superpage: ^H heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Set) return } - // This uses `Consume` ordering, as we have a data dependency on the data - // pointed to the variable and can get away with a weaker order. - master := intrinsics.atomic_load_explicit(&superpage.master_cache_block, .Consume) + master := intrinsics.atomic_load_explicit(&superpage.master_cache_block, .Acquire) if master == nil { // This superpage is not owned by anyone. // Its remote frees will be acknowledged in whole when it's adopted. @@ -943,7 +941,7 @@ heap_remote_cache_add_remote_free_superpage :: proc "contextless" (superpage: ^H return } } - next_cache_block := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Consume) + next_cache_block := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Acquire) assert_contextless(next_cache_block != nil, "A remote thread failed to find free space for a new entry in another heap's superpages_with_remote_frees.") cache = next_cache_block } @@ -1373,7 +1371,7 @@ heap_alloc :: proc "contextless" (size: int, zero: bool = true) -> (ptr: rawptr) // frees to see if any still have frees needing merged. merge_loop: for { consume_loop: for i in 0.. Date: Mon, 27 Jan 2025 19:11:39 -0500 Subject: [PATCH 07/21] Hoist reused calculations out into variables --- base/runtime/heap_allocator_implementation.odin | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index 04683aba794..5e21bfa148a 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -704,16 +704,18 @@ heap_slab_setup :: proc "contextless" (superpage: ^Heap_Superpage, rounded_size: base_alignment := uintptr(min(HEAP_MAX_ALIGNMENT, rounded_size)) - pointer_padding := (uintptr(base_alignment) - (uintptr(slab) + size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint))) & uintptr(base_alignment - 1) + slab_bitmap_base := uintptr(slab) + size_of(Heap_Slab) + total_byte_size_of_all_bitmaps := HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + pointer_padding := (uintptr(base_alignment) - (slab_bitmap_base + total_byte_size_of_all_bitmaps)) & uintptr(base_alignment - 1) // These bitmaps are placed at the end of the struct, one after the other. - slab.local_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab)) - slab.remote_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab) + uintptr(sectors) * size_of(uint)) + slab.local_free = cast([^]uint)(slab_bitmap_base) + slab.remote_free = cast([^]uint)(slab_bitmap_base + uintptr(sectors) * size_of(uint)) // This pointer is specifically aligned. 
- slab.data = (uintptr(slab) + size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + pointer_padding) + slab.data = (slab_bitmap_base + total_byte_size_of_all_bitmaps + pointer_padding) assert_contextless(slab.data & (base_alignment-1) == 0, "Incorrect calculation for aligning Slab data pointer.") - assert_contextless(size_of(Heap_Slab) + HEAP_SECTOR_TYPES * sectors * size_of(uint) + bins * rounded_size < HEAP_SLAB_SIZE, "Slab internal allocation overlimit.") + assert_contextless(size_of(Heap_Slab) + int(total_byte_size_of_all_bitmaps) + bins * rounded_size < HEAP_SLAB_SIZE, "Slab internal allocation overlimit.") assert_contextless(find_slab_from_pointer(rawptr(slab.data)) == slab, "Slab data pointer cannot be traced back to its Slab.") // Set all of the local free bits. From 4fe98cbd11146e58d36491b5f90e98ae9c626bf6 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 27 Jan 2025 19:41:22 -0500 Subject: [PATCH 08/21] Rearrange slab iteration conditionals --- base/runtime/heap_allocator_control.odin | 6 +++--- base/runtime/heap_allocator_implementation.odin | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin index 356e00477ae..737870c0475 100644 --- a/base/runtime/heap_allocator_control.odin +++ b/base/runtime/heap_allocator_control.odin @@ -17,11 +17,11 @@ compact_superpage :: proc "contextless" (superpage: ^Heap_Superpage) -> (freed: if slab.bin_size > HEAP_MAX_BIN_SIZE { // Skip contiguous slabs. i += heap_slabs_needed_for_size(slab.bin_size) - } else if slab.bin_size > 0 { - i += 1 } else { i += 1 - continue + if slab.bin_size == 0 { + continue + } } slab_is_cached := slab.free_bins > 0 diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index 5e21bfa148a..d7148d8d23a 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -490,11 +490,11 @@ heap_cache_register_superpage :: proc "contextless" (superpage: ^Heap_Superpage) if slab.bin_size > HEAP_MAX_BIN_SIZE { // Skip contiguous slabs. i += heap_slabs_needed_for_size(slab.bin_size) - } else if slab.bin_size > 0 { - i += 1 } else { i += 1 - continue + if slab.bin_size == 0 { + continue + } } // When adopting a new Superpage, we take the opportunity to @@ -1274,11 +1274,11 @@ setup_superpage_orphanage :: proc "contextless" () { if slab.bin_size > HEAP_MAX_BIN_SIZE { // Skip contiguous slabs. i += heap_slabs_needed_for_size(slab.bin_size) - } else if slab.bin_size > 0 { - i += 1 } else { i += 1 - continue + if slab.bin_size == 0 { + continue + } } if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { From 3e450ade13bb25cfa00d96aa39777111e9e869d1 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:13:04 -0500 Subject: [PATCH 09/21] Restrict superpage allocation on Darwin to AMD64 and 2MiB There's a possibility that Darwin does not support anything else, in terms of size or architecture. 
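As a worked illustration of the restriction described above (not part of the patch itself): with the one configuration that remains supported, the alignment mask passed to `mach_vm_map` in the hunk below comes out to

	SUPERPAGE_SIZE = 2 * Megabyte        = 0x20_0000
	alignment_mask = SUPERPAGE_SIZE - 1  = 0x1F_FFFF

so any address the kernel hands back satisfies `addr & 0x1F_FFFF == 0`, i.e. it is 2 MiB-aligned, which is the same invariant asserted elsewhere in this series via `uintptr(ptr) & (SUPERPAGE_SIZE-1) == 0`.
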
--- base/runtime/virtual_memory_darwin.odin | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index a3d3c61adef..1f17635d6fc 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ b/base/runtime/virtual_memory_darwin.odin @@ -52,10 +52,14 @@ _allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { _allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { address: u64 flags: i32 = VM_FLAGS_ANYWHERE - when SUPERPAGE_SIZE == 2 * Megabyte { - flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB - } else { - flags |= VM_FLAGS_SUPERPAGE_SIZE_ANY + when ODIN_ARCH == .amd64 { + // NOTE(Feoramund): As far as we are aware, Darwin only supports + // explicit superpage allocation on AMD64 with a 2MiB parameter. + when SUPERPAGE_SIZE == 2 * Megabyte { + flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB + } else { + #panic("An unsupported superpage size has been configured for AMD64 Darwin; only 2MB is supported.") + } } alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. result := mach_vm_map(mach_task_self(), &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) From fe3bf9de47ab94a31e7735e216e1e485fbed4489 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:19:23 -0500 Subject: [PATCH 10/21] Use `VM_INHERIT_COPY` instead This aligns with the use of `MAP_PRIVATE` in `mmap`-based implementations. --- base/runtime/virtual_memory_darwin.odin | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index 1f17635d6fc..60c59eac912 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ b/base/runtime/virtual_memory_darwin.odin @@ -38,11 +38,11 @@ VM_FLAGS_SUPERPAGE_SIZE_2MB :: SUPERPAGE_SIZE_2MB << VM_FLAGS_SUPERPAGE_SHIFT MEMORY_OBJECT_NULL :: 0 VM_PROT_READ :: 0x01 VM_PROT_WRITE :: 0x02 -VM_INHERIT_SHARE :: 0 +VM_INHERIT_COPY :: 1 _allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { address: u64 - result := mach_vm_map(mach_task_self(), &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + result := mach_vm_map(mach_task_self(), &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } @@ -62,7 +62,7 @@ _allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { } } alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. 
- result := mach_vm_map(mach_task_self(), &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + result := mach_vm_map(mach_task_self(), &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } @@ -73,7 +73,7 @@ _allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { _allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { address: u64 alignment_mask: u64 = u64(alignment) - 1 - result := mach_vm_map(mach_task_self(), &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_SHARE) + result := mach_vm_map(mach_task_self(), &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } From b66734e3e5421bb4aa26f605dca6b9fb226749e9 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:49:44 -0500 Subject: [PATCH 11/21] Add heap allocator exception for Orca This is a copy of the libc malloc-based allocator that we previously had. It's preferable to use the Orca runtime allocator over our own. --- base/runtime/heap_allocator.odin | 1 + base/runtime/heap_allocator_control.odin | 1 + base/runtime/heap_allocator_debugging.odin | 1 + base/runtime/heap_allocator_fallback.odin | 138 ++++++++++++++++++ .../heap_allocator_implementation.odin | 1 + base/runtime/heap_allocator_info.odin | 1 + 6 files changed, 143 insertions(+) create mode 100644 base/runtime/heap_allocator_fallback.odin diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 2a6029284be..fba56c10a1c 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -1,3 +1,4 @@ +#+build !orca package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin index 737870c0475..f7ead8f57f6 100644 --- a/base/runtime/heap_allocator_control.odin +++ b/base/runtime/heap_allocator_control.odin @@ -1,3 +1,4 @@ +#+build !orca package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_debugging.odin b/base/runtime/heap_allocator_debugging.odin index 98225fddb25..e2315c37a82 100644 --- a/base/runtime/heap_allocator_debugging.odin +++ b/base/runtime/heap_allocator_debugging.odin @@ -1,3 +1,4 @@ +#+build !orca package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_fallback.odin b/base/runtime/heap_allocator_fallback.odin new file mode 100644 index 00000000000..129cd9d864d --- /dev/null +++ b/base/runtime/heap_allocator_fallback.odin @@ -0,0 +1,138 @@ +#+build orca +package runtime + +// This is the libc malloc-based Odin allocator, used as a fallback for +// platforms that do not support virtual memory, superpages, or aligned +// allocation, or for platforms where we would prefer to use the system's +// built-in allocator through the malloc interface. 
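The wrappers later in this file do not rely on libc for alignment; they over-allocate and stash the pointer that actually came from `malloc`/`calloc` immediately before the address handed to the caller. Schematically, from lowest to highest address (an illustration only, not part of the patch; the amount of padding depends on the requested alignment):

	[ padding ][ original malloc pointer ][ user data: `size` bytes ]

`allocated_mem` points at the very start of the block; `aligned_mem` (the value returned to the caller) points at the user data, and `([^]rawptr)(aligned_mem)[-1]` reads the stored original pointer back out so that `Free` and `Resize` can hand the real block back to libc.
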
+ +import "base:intrinsics" + +foreign { + @(link_name="malloc") _libc_malloc :: proc "c" (size: int) -> rawptr --- + @(link_name="calloc") _libc_calloc :: proc "c" (num, size: int) -> rawptr --- + @(link_name="free") _libc_free :: proc "c" (ptr: rawptr) --- + @(link_name="realloc") _libc_realloc :: proc "c" (ptr: rawptr, size: int) -> rawptr --- +} + +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + if size <= 0 { + return nil + } + if zero_memory { + return _libc_calloc(1, size) + } else { + return _libc_malloc(size) + } +} + +heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { + return _libc_realloc(ptr, new_size) +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _libc_free(ptr) +} + +heap_allocator :: proc() -> Allocator { + return { + data = nil, + procedure = heap_allocator_proc, + } +} + +heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + // + // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. + // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert + // padding. We also store the original pointer returned by heap_alloc right before + // the pointer we return to the user. + // + + aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { + // NOTE(flysand): We need to reserve enough space for alignment, which + // includes the user data itself, the space to store the pointer to + // allocation start, as well as the padding required to align both + // the user data and the pointer. + a := max(alignment, align_of(rawptr)) + space := a-1 + size_of(rawptr) + size + allocated_mem: rawptr + + force_copy := old_ptr != nil && alignment > align_of(rawptr) + + if old_ptr != nil && !force_copy { + original_old_ptr := ([^]rawptr)(old_ptr)[-1] + allocated_mem = heap_resize(original_old_ptr, space) + } else { + allocated_mem = heap_alloc(space, zero_memory) + } + aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) + + ptr := uintptr(aligned_mem) + aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) + if allocated_mem == nil { + aligned_free(old_ptr) + aligned_free(allocated_mem) + return nil, .Out_Of_Memory + } + + aligned_mem = rawptr(aligned_ptr) + ([^]rawptr)(aligned_mem)[-1] = allocated_mem + + if force_copy { + mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) + aligned_free(old_ptr) + } + + return byte_slice(aligned_mem, size), nil + } + + aligned_free :: proc(p: rawptr) { + if p != nil { + heap_free(([^]rawptr)(p)[-1]) + } + } + + aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { + if p == nil { + return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + } + + new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return + + // NOTE: heap_resize does not zero the new memory, so we do it + if zero_memory && new_size > old_size { + new_region := raw_data(new_memory[old_size:]) + intrinsics.mem_zero(new_region, new_size - old_size) + } + return + } + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) + + case .Free: + aligned_free(old_memory) + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + return 
aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index d7148d8d23a..59738fcf6f8 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -1,3 +1,4 @@ +#+build !orca package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_info.odin b/base/runtime/heap_allocator_info.odin index 3c69861dec6..8fcc9cfb6da 100644 --- a/base/runtime/heap_allocator_info.odin +++ b/base/runtime/heap_allocator_info.odin @@ -1,3 +1,4 @@ +#+build !orca package runtime import "base:intrinsics" From 8c197937759aa58365c3a93e38cf7f96e9458a47 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:52:33 -0500 Subject: [PATCH 12/21] Don't build feoramalloc on web platforms Their allocation pathways are already tuned for their respective platforms. --- base/runtime/heap_allocator.odin | 2 ++ base/runtime/heap_allocator_control.odin | 2 ++ base/runtime/heap_allocator_debugging.odin | 2 ++ base/runtime/heap_allocator_implementation.odin | 2 ++ base/runtime/heap_allocator_info.odin | 2 ++ 5 files changed, 10 insertions(+) diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index fba56c10a1c..0fb16034740 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -1,4 +1,6 @@ +#+build !js #+build !orca +#+build !wasi package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin index f7ead8f57f6..4224058aef8 100644 --- a/base/runtime/heap_allocator_control.odin +++ b/base/runtime/heap_allocator_control.odin @@ -1,4 +1,6 @@ +#+build !js #+build !orca +#+build !wasi package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_debugging.odin b/base/runtime/heap_allocator_debugging.odin index e2315c37a82..d055f2ce902 100644 --- a/base/runtime/heap_allocator_debugging.odin +++ b/base/runtime/heap_allocator_debugging.odin @@ -1,4 +1,6 @@ +#+build !js #+build !orca +#+build !wasi package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index 59738fcf6f8..be3a4097fd6 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -1,4 +1,6 @@ +#+build !js #+build !orca +#+build !wasi package runtime import "base:intrinsics" diff --git a/base/runtime/heap_allocator_info.odin b/base/runtime/heap_allocator_info.odin index 8fcc9cfb6da..9249845b486 100644 --- a/base/runtime/heap_allocator_info.odin +++ b/base/runtime/heap_allocator_info.odin @@ -1,4 +1,6 @@ +#+build !js #+build !orca +#+build !wasi package runtime import "base:intrinsics" From 7a3e5478f1e1a8614a49cb738bafb6c5ce341528 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 2 Feb 2025 16:45:58 -0500 Subject: [PATCH 13/21] Simplify loop --- base/runtime/heap_allocator_control.odin | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin index 4224058aef8..4f5ae68b468 100644 --- a/base/runtime/heap_allocator_control.odin +++ b/base/runtime/heap_allocator_control.odin @@ -75,9 +75,9 @@ orphanage. This limitation is due to the avoidance of heap allocation. compact_heap_orphanage :: proc "contextless" () { // First, try to empty the orphanage so that we can evaluate each superpage. buffer: [128]^Heap_Superpage - for i := 0; i < len(buffer); i += 1 { - buffer[i] = heap_pop_orphan() - if buffer[i] == nil { + for &b in buffer { + b = heap_pop_orphan() + if b == nil { break } } From 4a77f2dbf3795b96bc983a2d653c8615e84b9481 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:31:00 -0500 Subject: [PATCH 14/21] Strengthen order to prevent reordering --- base/runtime/heap_allocator_implementation.odin | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index be3a4097fd6..3faacbe314b 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -1438,8 +1438,11 @@ heap_alloc :: proc "contextless" (size: int, zero: bool = true) -> (ptr: rawptr) } if should_remove { + // NOTE: The order of operations here is important to keep + // the cache from overflowing. The entry must first be + // removed, then the superpage has its flag cleared. intrinsics.atomic_store_explicit(&cache.superpages_with_remote_frees[i], nil, .Release) - intrinsics.atomic_store_explicit(&superpage.remote_free_set, false, .Release) + intrinsics.atomic_store_explicit(&superpage.remote_free_set, false, .Seq_Cst) removed += 1 } From 7b739f49fdf9d6c756d86c59ddc8b10662b43881 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Thu, 6 Feb 2025 19:20:30 -0500 Subject: [PATCH 15/21] Unify heap allocators - Simplify the malloc-based allocator - Correct the type signatures on malloc/calloc/realloc - Make `heap_alloc/heap_free/heap_resize` API consistent across platforms --- base/runtime/default_allocators_general.odin | 3 - base/runtime/heap_allocator.odin | 4 +- base/runtime/heap_allocator_fallback.odin | 139 +++++++++--------- .../heap_allocator_implementation.odin | 22 +-- base/runtime/wasm_allocator.odin | 20 +++ 5 files changed, 101 insertions(+), 87 deletions(-) diff --git a/base/runtime/default_allocators_general.odin b/base/runtime/default_allocators_general.odin index 64af6c90455..cbaf4d22a31 100644 --- a/base/runtime/default_allocators_general.odin +++ b/base/runtime/default_allocators_general.odin @@ -6,9 +6,6 @@ when ODIN_DEFAULT_TO_NIL_ALLOCATOR { } else when ODIN_DEFAULT_TO_PANIC_ALLOCATOR { default_allocator_proc :: panic_allocator_proc default_allocator :: panic_allocator -} else when ODIN_OS != .Orca && (ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32) { - default_allocator :: default_wasm_allocator - default_allocator_proc :: wasm_allocator_proc } else { default_allocator :: heap_allocator default_allocator_proc :: heap_allocator_proc diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 0fb16034740..7fc1c45c635 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -35,7 +35,7 @@ heap_allocator_proc :: proc( } return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil case .Alloc_Non_Zeroed: - ptr := heap_alloc(max(size, alignment), 
zero = false) + ptr := heap_alloc(max(size, alignment), zero_memory = false) if ptr == nil { return nil, .Out_Of_Memory } @@ -47,7 +47,7 @@ heap_allocator_proc :: proc( } return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil case .Resize_Non_Zeroed: - ptr := heap_resize(old_memory, old_size, max(size, alignment), zero = false) + ptr := heap_resize(old_memory, old_size, max(size, alignment), zero_memory = false) if ptr == nil { return nil, .Out_Of_Memory } diff --git a/base/runtime/heap_allocator_fallback.odin b/base/runtime/heap_allocator_fallback.odin index 129cd9d864d..25e906d7e3d 100644 --- a/base/runtime/heap_allocator_fallback.odin +++ b/base/runtime/heap_allocator_fallback.odin @@ -9,25 +9,39 @@ package runtime import "base:intrinsics" foreign { - @(link_name="malloc") _libc_malloc :: proc "c" (size: int) -> rawptr --- - @(link_name="calloc") _libc_calloc :: proc "c" (num, size: int) -> rawptr --- + @(link_name="malloc") _libc_malloc :: proc "c" (size: uint) -> rawptr --- + @(link_name="calloc") _libc_calloc :: proc "c" (num, size: uint) -> rawptr --- @(link_name="free") _libc_free :: proc "c" (ptr: rawptr) --- - @(link_name="realloc") _libc_realloc :: proc "c" (ptr: rawptr, size: int) -> rawptr --- + @(link_name="realloc") _libc_realloc :: proc "c" (ptr: rawptr, size: uint) -> rawptr --- } +@(require_results) heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { if size <= 0 { return nil } if zero_memory { - return _libc_calloc(1, size) + return _libc_calloc(1, uint(size)) } else { - return _libc_malloc(size) + return _libc_malloc(uint(size)) } } -heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { - return _libc_realloc(ptr, new_size) +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + new_ptr = _libc_realloc(old_ptr, uint(new_size)) + + // Section 7.22.3.5.2 of the C17 standard: "The contents of the new object + // shall be the same as that of the old object prior to deallocation, up to + // the lesser of the new and old sizes. Any bytes in the new object beyond + // the size of the old object have indeterminate values." + // + // Therefore, we zero the memory ourselves. + if zero_memory && new_size > old_size { + intrinsics.mem_zero(rawptr(uintptr(new_ptr) + uintptr(old_size)), new_size - old_size) + } + + return } heap_free :: proc "contextless" (ptr: rawptr) { @@ -44,84 +58,67 @@ heap_allocator :: proc() -> Allocator { heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, size, alignment: int, old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { - // - // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. - // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert - // padding. We also store the original pointer returned by heap_alloc right before - // the pointer we return to the user. - // - - aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { - // NOTE(flysand): We need to reserve enough space for alignment, which - // includes the user data itself, the space to store the pointer to - // allocation start, as well as the padding required to align both - // the user data and the pointer. 
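
Both the code being removed here and its replacement further down rely on the same pattern: over-allocate by max(alignment, size_of(rawptr)), round the address up, and stash the pointer the allocator actually returned just behind the address handed to the user. A minimal sketch of that pattern in isolation (illustration only, not part of the diff), using the `_libc_malloc` binding declared at the top of this file for simplicity:

	aligned_malloc_sketch :: proc "contextless" (size, alignment: int) -> rawptr {
		// Over-allocate so there is room both to align the user address and to
		// store the pointer that malloc actually returned just before it.
		padding := max(alignment, size_of(rawptr))
		raw := _libc_malloc(uint(size + padding))
		if raw == nil {
			return nil
		}
		// In Odin, `&` binds tighter than `-`, so this computes padding - (raw mod padding).
		shift := uintptr(padding) - uintptr(raw) & uintptr(padding-1)
		aligned := rawptr(uintptr(raw) + shift)
		([^]rawptr)(aligned)[-1] = raw // the matching free is _libc_free(([^]rawptr)(aligned)[-1])
		return aligned
	}
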
- a := max(alignment, align_of(rawptr)) - space := a-1 + size_of(rawptr) + size - allocated_mem: rawptr - - force_copy := old_ptr != nil && alignment > align_of(rawptr) - if old_ptr != nil && !force_copy { - original_old_ptr := ([^]rawptr)(old_ptr)[-1] - allocated_mem = heap_resize(original_old_ptr, space) - } else { - allocated_mem = heap_alloc(space, zero_memory) - } - aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) + // Because malloc does not support alignment requests, and aligned_alloc + // has specific requirements for what sizes it supports, this allocator + // over-allocates by the alignment requested and stores the original + // pointer behind the address returned to the user. - ptr := uintptr(aligned_mem) - aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) - if allocated_mem == nil { - aligned_free(old_ptr) - aligned_free(allocated_mem) + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + padding := max(alignment, size_of(rawptr)) + ptr := heap_alloc(size + padding, mode == .Alloc) + if ptr == nil { return nil, .Out_Of_Memory } + shift := uintptr(padding) - uintptr(ptr) & uintptr(padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr + return byte_slice(aligned_ptr, size), nil - aligned_mem = rawptr(aligned_ptr) - ([^]rawptr)(aligned_mem)[-1] = allocated_mem - - if force_copy { - mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) - aligned_free(old_ptr) + case .Free: + if old_memory != nil { + heap_free(([^]rawptr)(old_memory)[-1]) } - return byte_slice(aligned_mem, size), nil - } - - aligned_free :: proc(p: rawptr) { - if p != nil { - heap_free(([^]rawptr)(p)[-1]) - } - } + case .Free_All: + return nil, .Mode_Not_Implemented - aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { - if p == nil { - return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) - } + case .Resize, .Resize_Non_Zeroed: + new_padding := max(alignment, size_of(rawptr)) + original_ptr := ([^]rawptr)(old_memory)[-1] + ptr: rawptr + + if alignment > align_of(rawptr) { + // The alignment is in excess of what malloc/realloc will return + // for address alignment per the C standard, so we must reallocate + // manually in order to guarantee alignment for the user. + // + // Resizing through realloc simply won't work because it's possible + // that our target address originally only needed a padding of 8 + // bytes, but if we expand the memory used and the address is moved, + // we may then need 16 bytes for proper alignment, for example. + // + // We'll copy the old data later. 
+ ptr = heap_alloc(size + new_padding, mode == .Resize) - new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return + } else { + real_old_size := size_of(rawptr) + old_size + real_new_size := new_padding + size - // NOTE: heap_resize does not zero the new memory, so we do it - if zero_memory && new_size > old_size { - new_region := raw_data(new_memory[old_size:]) - intrinsics.mem_zero(new_region, new_size - old_size) + ptr = heap_resize(original_ptr, real_old_size, real_new_size, mode == .Resize) } - return - } - switch mode { - case .Alloc, .Alloc_Non_Zeroed: - return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) - - case .Free: - aligned_free(old_memory) + shift := uintptr(new_padding) - uintptr(ptr) & uintptr(new_padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr - case .Free_All: - return nil, .Mode_Not_Implemented + if alignment > align_of(rawptr) { + intrinsics.mem_copy_non_overlapping(aligned_ptr, old_memory, min(size, old_size)) + heap_free(original_ptr) + } - case .Resize, .Resize_Non_Zeroed: - return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) + return byte_slice(aligned_ptr, size), nil case .Query_Features: set := (^Allocator_Mode_Set)(old_memory) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index 3faacbe314b..5a940688c6c 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -1347,7 +1347,7 @@ heap_orphanage_count: int Allocate an arbitrary amount of memory from the heap and optionally zero it. */ @(require_results) -heap_alloc :: proc "contextless" (size: int, zero: bool = true) -> (ptr: rawptr) { +heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: rawptr) { assert_contextless(size >= 0, "The heap allocator was given a negative size.") // Handle Huge allocations. @@ -1491,7 +1491,7 @@ heap_alloc :: proc "contextless" (size: int, zero: bool = true) -> (ptr: rawptr) slab.local_free[sector] &~= (1 << index) // Zero the memory, if needed. - if zero && index < uintptr(slab.dirty_bins) { + if zero_memory && index < uintptr(slab.dirty_bins) { // Ensure that the memory zeroing is not optimized out by the compiler. intrinsics.mem_zero_volatile(ptr, rounded_size) // NOTE: A full memory fence should not be needed for any newly-zeroed @@ -1694,7 +1694,7 @@ heap_free :: proc "contextless" (ptr: rawptr) { Resize memory returned by `heap_alloc`. */ @(require_results) -heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero: bool = true) -> (new_ptr: rawptr) { +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { Size_Category :: enum { Unknown, Bin, @@ -1722,7 +1722,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int if new_category != old_category { // A change in size category cannot be optimized. 
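
Several branches in this hunk and the ones below fall back to the same move when a resize cannot be done in place: allocate a new block, copy the smaller of the two sizes, free the old block. Pulled out on its own it would look roughly like this (sketch only, reusing the file's existing procedures and `base:intrinsics` import):

	resize_by_move :: proc "contextless" (old_ptr: rawptr, old_size, new_size: int, zero_memory: bool) -> rawptr {
		// Allocate a fresh block, copy what fits, then release the old block.
		new_ptr := heap_alloc(new_size, zero_memory)
		if new_ptr != nil {
			intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size))
			heap_free(old_ptr)
		}
		return new_ptr
	}
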
- new_ptr = heap_alloc(new_size, zero) + new_ptr = heap_alloc(new_size, zero_memory) intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) heap_free(old_ptr) heap_debug_cover(.Resize_Crossed_Size_Categories) @@ -1750,7 +1750,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int u := uintptr(resized_superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING new_ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) - if zero && new_size > old_size { + if zero_memory && new_size > old_size { intrinsics.mem_zero_volatile( rawptr(uintptr(new_ptr) + uintptr(old_size)), new_size - old_size, @@ -1771,7 +1771,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int contiguous_new := heap_slabs_needed_for_size(new_size) if contiguous_new == contiguous_old { // We already have enough slabs to serve the request. - if zero && new_size > old_size { + if zero_memory && new_size > old_size { intrinsics.mem_zero_volatile( rawptr(uintptr(old_ptr) + uintptr(old_size)), new_size - old_size, @@ -1785,7 +1785,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int if slab.index + contiguous_new >= HEAP_SLAB_COUNT { // Expanding this slab would go beyond the Superpage. // We need more memory. - new_ptr = heap_alloc(new_size, zero) + new_ptr = heap_alloc(new_size, zero_memory) intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) heap_free(old_ptr) return @@ -1797,7 +1797,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int // involve touching the Superpage. // // We must re-allocate. - new_ptr = heap_alloc(new_size, zero) + new_ptr = heap_alloc(new_size, zero_memory) intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) heap_free(old_ptr) heap_debug_cover(.Resize_Wide_Slab_From_Remote_Thread) @@ -1826,7 +1826,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { if heap_superpage_index_slab(superpage, i).bin_size != 0 { // Contiguous space is unavailable. - new_ptr = heap_alloc(new_size, zero) + new_ptr = heap_alloc(new_size, zero_memory) intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) heap_free(old_ptr) heap_debug_cover(.Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion) @@ -1859,7 +1859,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int // See if a bin rank change is needed. new_rounded_size := heap_round_to_bin_size(new_size) if slab.bin_size == new_rounded_size { - if zero && new_size > old_size { + if zero_memory && new_size > old_size { intrinsics.mem_zero_volatile( rawptr(uintptr(old_ptr) + uintptr(old_size)), new_size - old_size, @@ -1879,7 +1879,7 @@ heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int } // Allocate and copy, as a last resort. 
- new_ptr = heap_alloc(new_size, zero) + new_ptr = heap_alloc(new_size, zero_memory) intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) heap_free(old_ptr) return diff --git a/base/runtime/wasm_allocator.odin b/base/runtime/wasm_allocator.odin index 574f3dd06ef..c4799fcbe1c 100644 --- a/base/runtime/wasm_allocator.odin +++ b/base/runtime/wasm_allocator.odin @@ -1,3 +1,4 @@ +#+build !orca #+build wasm32, wasm64p32 package runtime @@ -869,3 +870,22 @@ aligned_realloc :: proc(a: ^WASM_Allocator, ptr: rawptr, alignment, size: uint, return newptr } + +heap_allocator :: default_wasm_allocator +heap_allocator_proc :: wasm_allocator_proc + +@(require_results) +heap_alloc :: proc(size: int, zero_memory: bool = true) -> (ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Alloc if zero_memory else .Alloc_Non_Zeroed, size, 0, nil, 0) + return raw_data(bytes) +} + +@(require_results) +heap_resize :: proc(old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Resize if zero_memory else .Resize_Non_Zeroed, new_size, 0, old_ptr, old_size) + return raw_data(bytes) +} + +heap_free :: proc(ptr: rawptr) { + wasm_allocator_proc(&global_default_wasm_allocator_data, .Free, 0, 0, ptr, 0) +} From a08fef0eb8c211e81640d400348a1bc4273406a5 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:11:31 -0500 Subject: [PATCH 16/21] Optimize virtual memory resizing on Darwin --- base/runtime/virtual_memory_darwin.odin | 56 +++++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index 60c59eac912..963d227c5b6 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ b/base/runtime/virtual_memory_darwin.odin @@ -9,6 +9,7 @@ foreign import lib "system:System.framework" foreign lib { mach_task_self :: proc() -> u32 --- + mach_vm_allocate :: proc(target: u32, address: ^u64, size: u64, flags: i32) -> i32 --- mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- mach_vm_map :: proc( target_task: u32, @@ -23,6 +24,19 @@ foreign lib { max_protection: i32, inheritance: u32, ) -> i32 --- + mach_vm_remap :: proc( + target_task: u32, + target_address: ^u64, + size: u64, + mask: u64, + flags: i32, + src_task: u32, + src_address: u64, + copy: b32, + cur_protection, + max_protection: ^i32, + inheritance: u32, + ) -> i32 --- } // The following features are specific to Darwin only. @@ -85,14 +99,40 @@ _free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { } _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { - // NOTE(Feoramund): mach_vm_remap does not permit resizing, as far as I understand it. 
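
The new Darwin resize path below reasons in whole pages, rounding both the old and new byte sizes up to a page count before comparing them. As a compact illustration of that rounding (not part of the patch), a ceiling division gives the same page count as the divide-then-increment used in the hunk, assuming the runtime's existing PAGE_SIZE constant:

	pages_needed :: proc "contextless" (size: int) -> int {
		// Ceiling division: same result as dividing and adding one on a remainder.
		return (size + PAGE_SIZE - 1) / PAGE_SIZE
	}
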
- result: rawptr = --- - if alignment == 0 { - result = _allocate_virtual_memory(new_size) + old_size_pages := old_size / PAGE_SIZE + new_size_pages := new_size / PAGE_SIZE + if old_size % PAGE_SIZE != 0 { + old_size_pages += 1 + } + if new_size % PAGE_SIZE != 0 { + new_size_pages += 1 + } + + if new_size_pages == old_size_pages { + return ptr + } else if new_size_pages > old_size_pages { + new_address: u64 + result_alloc := mach_vm_allocate(mach_task_self(), &new_address, u64(new_size), VM_FLAGS_ANYWHERE) + if result_alloc != 0 { + return nil + } + + alignment_mask: u64 + if alignment != 0 { + alignment_mask = u64(alignment) - 1 + } + + cur_protection, max_protection: i32 + result_remap := mach_vm_remap(mach_task_self(), &new_address, u64(old_size), alignment_mask, VM_FLAGS_ANYWHERE, mach_task_self(), u64(uintptr(ptr)), true, &cur_protection, &max_protection, VM_INHERIT_COPY) + if result_remap != 0 { + return nil + } + mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(old_size)) + return rawptr(uintptr(new_address)) } else { - result = _allocate_virtual_memory_aligned(new_size, alignment) + new_size_boundary := new_size_pages * PAGE_SIZE + shrink_by := u64(old_size - new_size) + mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr) + uintptr(new_size_boundary)), shrink_by) + return rawptr(uintptr(ptr)) } - intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) - mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(old_size)) - return result } From 5f00ecc88b1b502f2624c35371b94cb0324fe009 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:18:33 -0500 Subject: [PATCH 17/21] Favor rescheduling superpage cache entry instead This should be more robust during merges that happen simultaneously with remote frees. Previously, we would preserve the cache entry, but there is an edge case where a slab of a lower index could be remotely freed and missed during a merge. --- .../heap_allocator_implementation.odin | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index 5a940688c6c..f7acd5bce32 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -1380,14 +1380,23 @@ heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: if superpage == nil { continue consume_loop } - // `should_remove` determines whether or not we keep the cache - // entry around, pending comparison of the consistency of the - // number of free bins we expect to have versus the free bins - // we do merge. + + // First, we will remove the cache entry, to avoid issues with + // freeing operations that happen simultaneously while we're + // merging. + // + // For example, if we merge the freed bins in one slab while a + // lower-index slab receives a remote free, we would otherwise + // be unaware of that, and if the superpage was still marked as + // being in the cache, the freeing thread would not schedule + // it. // - // This is a heuristic to help with mergers that happen during - // simultaneous remote frees. - should_remove := true + // The performance hit should be negligible in the event that + // we have to reschedule the superpage. + intrinsics.atomic_store_explicit(&cache.superpages_with_remote_frees[i], nil, .Release) + // Seq_Cst, to prevent these two from being seen out of order. 
+ intrinsics.atomic_store_explicit(&superpage.remote_free_set, false, .Seq_Cst) + should_reschedule := false for j := 0; j < HEAP_SLAB_COUNT; /**/ { slab := heap_superpage_index_slab(superpage, j) @@ -1396,24 +1405,14 @@ heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: // signals that it has remote frees. This is for the sake // of speed, given how large the bitmaps can be. merge_block: if bin_size > 0 && slab.free_bins == 0 && intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { - bins_left := heap_merge_remote_frees(slab) - if bins_left != 0 { - // Here we've detected that there are still remote - // frees left, so the entry is left in the cache - // for the next round of merges. - // - // NOTE: If we were to loop back and try to - // re-merge the unmerged bins, we could end up in a - // situation where this thread is stalled under - // heavy load. - should_remove = false - } + heap_merge_remote_frees(slab) if slab.free_bins == 0 { // No bins were freed at all, which is possible due // to the parallel nature of this code. The freeing // thread could have signalled its intent, but we // merged before it had a chance to flip the // necessary bit. + should_reschedule = true break merge_block } intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) @@ -1437,12 +1436,25 @@ heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: } } - if should_remove { - // NOTE: The order of operations here is important to keep - // the cache from overflowing. The entry must first be - // removed, then the superpage has its flag cleared. - intrinsics.atomic_store_explicit(&cache.superpages_with_remote_frees[i], nil, .Release) - intrinsics.atomic_store_explicit(&superpage.remote_free_set, false, .Seq_Cst) + if should_reschedule { + // This is the logic found in `heap_remote_cache_add_remote_free_superpage`, simplified. 
+ if !intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + cache_reschedule := local_heap_cache + reschedule_loop: for { + for j := 0; j < len(cache_reschedule.superpages_with_remote_frees); j += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache_reschedule.superpages_with_remote_frees[j], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "The heap allocator found a duplicate of a superpage in its superpages_with_remote_frees while rescheduling.") + if swapped { + intrinsics.atomic_add_explicit(&local_heap_cache.remote_free_count, 1, .Release) + break reschedule_loop + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache_reschedule.next_cache_block, .Acquire) + assert_contextless(next_cache_block != nil, "The heap allocator failed to find free space for a new entry in its superpages_with_remote_frees cache.") + cache_reschedule = next_cache_block + } + } + } else { removed += 1 } From 06ef45720859465cf5cdebcc187da23069a071a1 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:06:58 -0500 Subject: [PATCH 18/21] Use `mach_task_self_` global instead --- base/runtime/virtual_memory_darwin.odin | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index 963d227c5b6..4642edf8437 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ b/base/runtime/virtual_memory_darwin.odin @@ -8,7 +8,7 @@ VIRTUAL_MEMORY_SUPPORTED :: true foreign import lib "system:System.framework" foreign lib { - mach_task_self :: proc() -> u32 --- + mach_task_self_: u32 mach_vm_allocate :: proc(target: u32, address: ^u64, size: u64, flags: i32) -> i32 --- mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- mach_vm_map :: proc( @@ -56,7 +56,7 @@ VM_INHERIT_COPY :: 1 _allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { address: u64 - result := mach_vm_map(mach_task_self(), &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + result := mach_vm_map(mach_task_self_, &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } @@ -76,7 +76,7 @@ _allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { } } alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. 
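
A note on the mask convention used throughout this file: for a power-of-two alignment A, the code passes A-1 as the alignment mask to the Mach calls, and an address satisfies that alignment exactly when addr & (A-1) == 0. A one-line check in the same style (illustration only, not part of the diff):

	is_aligned :: proc "contextless" (addr: uintptr, alignment: uintptr) -> bool {
		// Meaningful only for power-of-two alignments, as the assertion noted above guarantees.
		return addr & (alignment - 1) == 0
	}
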
- result := mach_vm_map(mach_task_self(), &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + result := mach_vm_map(mach_task_self_, &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } @@ -87,7 +87,7 @@ _allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { _allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { address: u64 alignment_mask: u64 = u64(alignment) - 1 - result := mach_vm_map(mach_task_self(), &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + result := mach_vm_map(mach_task_self_, &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) if result != 0 { return nil } @@ -95,7 +95,7 @@ _allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: in } _free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { - mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(size)) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(size)) } _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { @@ -112,7 +112,7 @@ _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_si return ptr } else if new_size_pages > old_size_pages { new_address: u64 - result_alloc := mach_vm_allocate(mach_task_self(), &new_address, u64(new_size), VM_FLAGS_ANYWHERE) + result_alloc := mach_vm_allocate(mach_task_self_, &new_address, u64(new_size), VM_FLAGS_ANYWHERE) if result_alloc != 0 { return nil } @@ -123,16 +123,16 @@ _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_si } cur_protection, max_protection: i32 - result_remap := mach_vm_remap(mach_task_self(), &new_address, u64(old_size), alignment_mask, VM_FLAGS_ANYWHERE, mach_task_self(), u64(uintptr(ptr)), true, &cur_protection, &max_protection, VM_INHERIT_COPY) + result_remap := mach_vm_remap(mach_task_self_, &new_address, u64(old_size), alignment_mask, VM_FLAGS_ANYWHERE, mach_task_self_, u64(uintptr(ptr)), true, &cur_protection, &max_protection, VM_INHERIT_COPY) if result_remap != 0 { return nil } - mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr)), u64(old_size)) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(old_size)) return rawptr(uintptr(new_address)) } else { new_size_boundary := new_size_pages * PAGE_SIZE shrink_by := u64(old_size - new_size) - mach_vm_deallocate(mach_task_self(), u64(uintptr(ptr) + uintptr(new_size_boundary)), shrink_by) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr) + uintptr(new_size_boundary)), shrink_by) return rawptr(uintptr(ptr)) } } From 21c61167e96dd93097f25185e66476b2f2adeb50 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:14:46 -0500 Subject: [PATCH 19/21] Ensure remapped memory has desired protection on Darwin --- base/runtime/virtual_memory_darwin.odin | 3 +++ 1 file changed, 3 insertions(+) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index 4642edf8437..548b5ff0438 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ 
b/base/runtime/virtual_memory_darwin.odin @@ -11,6 +11,7 @@ foreign lib { mach_task_self_: u32 mach_vm_allocate :: proc(target: u32, address: ^u64, size: u64, flags: i32) -> i32 --- mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- + mach_vm_protect :: proc(target_task: u32, address: u64, size: u64, set_maximum: b32, new_protection: i32) -> i32 --- mach_vm_map :: proc( target_task: u32, address: ^u64, @@ -122,6 +123,8 @@ _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_si alignment_mask = u64(alignment) - 1 } + mach_vm_protect(mach_task_self_, new_address, u64(new_size), true, VM_PROT_READ|VM_PROT_WRITE) + mach_vm_protect(mach_task_self_, new_address, u64(new_size), false, VM_PROT_READ|VM_PROT_WRITE) cur_protection, max_protection: i32 result_remap := mach_vm_remap(mach_task_self_, &new_address, u64(old_size), alignment_mask, VM_FLAGS_ANYWHERE, mach_task_self_, u64(uintptr(ptr)), true, &cur_protection, &max_protection, VM_INHERIT_COPY) if result_remap != 0 { From c625bc97e8d9d657e259c0aa26021b747d2cd201 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:03:18 -0500 Subject: [PATCH 20/21] Revert to unoptimized behavior on Darwin --- base/runtime/virtual_memory_darwin.odin | 43 ++++--------------------- 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin index 548b5ff0438..77b46dc826a 100644 --- a/base/runtime/virtual_memory_darwin.odin +++ b/base/runtime/virtual_memory_darwin.odin @@ -100,42 +100,13 @@ _free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { } _resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { - old_size_pages := old_size / PAGE_SIZE - new_size_pages := new_size / PAGE_SIZE - if old_size % PAGE_SIZE != 0 { - old_size_pages += 1 - } - if new_size % PAGE_SIZE != 0 { - new_size_pages += 1 - } - - if new_size_pages == old_size_pages { - return ptr - } else if new_size_pages > old_size_pages { - new_address: u64 - result_alloc := mach_vm_allocate(mach_task_self_, &new_address, u64(new_size), VM_FLAGS_ANYWHERE) - if result_alloc != 0 { - return nil - } - - alignment_mask: u64 - if alignment != 0 { - alignment_mask = u64(alignment) - 1 - } - - mach_vm_protect(mach_task_self_, new_address, u64(new_size), true, VM_PROT_READ|VM_PROT_WRITE) - mach_vm_protect(mach_task_self_, new_address, u64(new_size), false, VM_PROT_READ|VM_PROT_WRITE) - cur_protection, max_protection: i32 - result_remap := mach_vm_remap(mach_task_self_, &new_address, u64(old_size), alignment_mask, VM_FLAGS_ANYWHERE, mach_task_self_, u64(uintptr(ptr)), true, &cur_protection, &max_protection, VM_INHERIT_COPY) - if result_remap != 0 { - return nil - } - mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(old_size)) - return rawptr(uintptr(new_address)) + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) } else { - new_size_boundary := new_size_pages * PAGE_SIZE - shrink_by := u64(old_size - new_size) - mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr) + uintptr(new_size_boundary)), shrink_by) - return rawptr(uintptr(ptr)) + result = _allocate_virtual_memory_aligned(new_size, alignment) } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(old_size)) + return result } From 
6e0f5184f177cc953084a22fa750cbe101efcd6f Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:12:14 -0500 Subject: [PATCH 21/21] Remove `HEAP_PANIC_ON_FREE_NIL` --- base/runtime/heap_allocator_implementation.odin | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin index f7acd5bce32..cf3f2d795c3 100644 --- a/base/runtime/heap_allocator_implementation.odin +++ b/base/runtime/heap_allocator_implementation.odin @@ -52,7 +52,6 @@ HEAP_MIN_BIN_SIZE :: #config(ODIN_HEAP_MIN_BIN_SIZE, 8 * Byte) HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES :: #config(ODIN_HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES, 3) HEAP_SUPERPAGE_CACHE_RATIO :: #config(ODIN_HEAP_SUPERPAGE_CACHE_RATIO, 20) HEAP_PANIC_ON_DOUBLE_FREE :: #config(ODIN_HEAP_PANIC_ON_DOUBLE_FREE, true) -HEAP_PANIC_ON_FREE_NIL :: #config(ODIN_HEAP_PANIC_ON_FREE_NIL, false) // // Constants @@ -1557,11 +1556,7 @@ Free memory returned by `heap_alloc`. heap_free :: proc "contextless" (ptr: rawptr) { // Check for nil. if ptr == nil { - when HEAP_PANIC_ON_FREE_NIL { - panic_contextless("The heap allocator was given a nil pointer to free.") - } else { - return - } + return } superpage := find_superpage_from_pointer(ptr)
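
With `HEAP_PANIC_ON_FREE_NIL` removed above, `heap_free(nil)` is unconditionally a quiet no-op, in line with C's `free(NULL)` behaviour. A minimal usage sketch (illustration only, not part of the patch):

	free_nil_is_ok :: proc "contextless" () {
		p: rawptr // zero value is nil
		heap_free(p) // simply returns; no panic and no configuration flag involved
	}
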