diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc4240b0414..a32d7e68040 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_arm64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_amd64 -no-entry-point ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:netbsd_arm64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -65,6 +66,7 @@ jobs: gmake -C vendor/miniaudio/src ./odin check examples/all -vet -strict-style -disallow-do -target:freebsd_amd64 ./odin check vendor/sdl3 -vet -strict-style -disallow-do -target:freebsd_amd64 -no-entry-point + ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true @@ -123,6 +125,8 @@ jobs: run: ./odin check examples/all -strict-style -vet -disallow-do - name: Odin check vendor/sdl3 run: ./odin check vendor/sdl3 -strict-style -vet -disallow-do -no-entry-point + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -sanitize:thread -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true - name: Optimized Core library tests @@ -211,6 +215,11 @@ jobs: run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat odin check vendor/sdl3 -vet -strict-style -disallow-do -no-entry-point + - name: Odin heap allocator tests + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat + odin run tests/heap_allocator -vet -strict-style -disallow-do -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests - name: Core library tests shell: cmd run: | @@ -305,6 +314,9 @@ jobs: - name: Odin run -debug run: ./odin run examples/demo -debug -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath + - name: Odin heap allocator tests + run: ./odin run tests/heap_allocator -vet -strict-style -disallow-do -target:linux_riscv64 
-extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath -define:ODIN_DEBUG_HEAP=true -- -allocator=feoramalloc -vmem-tests -serial-tests -parallel-tests + - name: Normal Core library tests run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static" -no-rpath diff --git a/base/runtime/default_allocators_general.odin b/base/runtime/default_allocators_general.odin index 64af6c90455..cbaf4d22a31 100644 --- a/base/runtime/default_allocators_general.odin +++ b/base/runtime/default_allocators_general.odin @@ -6,9 +6,6 @@ when ODIN_DEFAULT_TO_NIL_ALLOCATOR { } else when ODIN_DEFAULT_TO_PANIC_ALLOCATOR { default_allocator_proc :: panic_allocator_proc default_allocator :: panic_allocator -} else when ODIN_OS != .Orca && (ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32) { - default_allocator :: default_wasm_allocator - default_allocator_proc :: wasm_allocator_proc } else { default_allocator :: heap_allocator default_allocator_proc :: heap_allocator_proc diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index 4b04dffef15..7fc1c45c635 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -1,3 +1,6 @@ +#+build !js +#+build !orca +#+build !wasi package runtime import "base:intrinsics" @@ -9,111 +12,65 @@ heap_allocator :: proc() -> Allocator { } } -heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, - size, alignment: int, - old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { - // - // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. - // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert - // padding. We also store the original pointer returned by heap_alloc right before - // the pointer we return to the user. - // - - aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { - // Not(flysand): We need to reserve enough space for alignment, which - // includes the user data itself, the space to store the pointer to - // allocation start, as well as the padding required to align both - // the user data and the pointer. 
- a := max(alignment, align_of(rawptr)) - space := a-1 + size_of(rawptr) + size - allocated_mem: rawptr - - force_copy := old_ptr != nil && alignment > align_of(rawptr) - - if old_ptr != nil && !force_copy { - original_old_ptr := ([^]rawptr)(old_ptr)[-1] - allocated_mem = heap_resize(original_old_ptr, space) - } else { - allocated_mem = heap_alloc(space, zero_memory) - } - aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) - - ptr := uintptr(aligned_mem) - aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) - if allocated_mem == nil { - aligned_free(old_ptr) - aligned_free(allocated_mem) +heap_allocator_proc :: proc( + allocator_data: rawptr, + mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, + old_size: int, + loc := #caller_location, +) -> ([]byte, Allocator_Error) { + assert(alignment <= HEAP_MAX_ALIGNMENT, "Heap allocation alignment beyond HEAP_MAX_ALIGNMENT bytes is not supported.", loc = loc) + assert(alignment >= 0, "Alignment must be greater than or equal to zero.", loc = loc) + switch mode { + case .Alloc: + // All allocations are aligned to at least their size up to + // `HEAP_MAX_ALIGNMENT`, and by virtue of binary arithmetic, any + // address aligned to N will also be aligned to N>>1. + // + // Therefore, we have no book-keeping costs for alignment. + ptr := heap_alloc(max(size, alignment)) + if ptr == nil { return nil, .Out_Of_Memory } - - aligned_mem = rawptr(aligned_ptr) - ([^]rawptr)(aligned_mem)[-1] = allocated_mem - - if force_copy { - mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) - aligned_free(old_ptr) - } - - return byte_slice(aligned_mem, size), nil - } - - aligned_free :: proc(p: rawptr) { - if p != nil { - heap_free(([^]rawptr)(p)[-1]) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Alloc_Non_Zeroed: + ptr := heap_alloc(max(size, alignment), zero_memory = false) + if ptr == nil { + return nil, .Out_Of_Memory } - } - - aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { - if p == nil { - return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize: + ptr := heap_resize(old_memory, old_size, max(size, alignment)) + if ptr == nil { + return nil, .Out_Of_Memory } - - new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return - - // NOTE: heap_resize does not zero the new memory, so we do it - if zero_memory && new_size > old_size { - new_region := raw_data(new_memory[old_size:]) - intrinsics.mem_zero(new_region, new_size - old_size) + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil + case .Resize_Non_Zeroed: + ptr := heap_resize(old_memory, old_size, max(size, alignment), zero_memory = false) + if ptr == nil { + return nil, .Out_Of_Memory } - return - } - - switch mode { - case .Alloc, .Alloc_Non_Zeroed: - return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) - + return transmute([]byte)Raw_Slice{ data = ptr, len = size }, nil case .Free: - aligned_free(old_memory) - + heap_free(old_memory) case .Free_All: return nil, .Mode_Not_Implemented - - case .Resize, .Resize_Non_Zeroed: - return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) - case .Query_Features: set := (^Allocator_Mode_Set)(old_memory) if set != nil { - set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + set^ = { + .Alloc, 
+ .Alloc_Non_Zeroed, + .Resize, + .Resize_Non_Zeroed, + .Free, + .Query_Features, + } } return nil, nil - case .Query_Info: return nil, .Mode_Not_Implemented } - return nil, nil } - - -heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { - return _heap_alloc(size, zero_memory) -} - -heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { - return _heap_resize(ptr, new_size) -} - -heap_free :: proc "contextless" (ptr: rawptr) { - _heap_free(ptr) -} \ No newline at end of file diff --git a/base/runtime/heap_allocator_control.odin b/base/runtime/heap_allocator_control.odin new file mode 100644 index 00000000000..4f5ae68b468 --- /dev/null +++ b/base/runtime/heap_allocator_control.odin @@ -0,0 +1,92 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +Merge all remote frees then free as many slabs as possible. + +This bypasses any heuristics that keep slabs setup. + +Returns true if the superpage was emptied and freed. +*/ +@(private) +compact_superpage :: proc "contextless" (superpage: ^Heap_Superpage) -> (freed: bool) { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + slab_is_cached := slab.free_bins > 0 + heap_merge_remote_frees(slab) + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab_is_cached { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + heap_free_slab(superpage, slab) + } + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + freed = true + } + return +} + +/* +Merge all remote frees then free as many slabs and superpages as possible. + +This bypasses any heuristics that keep slabs setup. +*/ +compact_heap :: proc "contextless" () { + superpage := local_heap + for { + if superpage == nil { + return + } + next_superpage := superpage.next + compact_superpage(superpage) + superpage = next_superpage + } +} + +/* +Free any empty superpages in the orphanage. + +This procedure assumes there won't ever be more than 128 superpages in the +orphanage. This limitation is due to the avoidance of heap allocation. +*/ +compact_heap_orphanage :: proc "contextless" () { + // First, try to empty the orphanage so that we can evaluate each superpage. + buffer: [128]^Heap_Superpage + for &b in buffer { + b = heap_pop_orphan() + if b == nil { + break + } + } + + // Next, compact each superpage and push it back to the orphanage if it was + // not freed. 
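+ // Superpages that came back completely empty were already handed off by
+ // compact_superpage (heap_free_superpage either returns them to the
+ // operating system or re-orphans them), so only superpages that still
+ // hold live allocations are pushed back here.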
+ for superpage in buffer { + if !compact_superpage(superpage) { + heap_push_orphan(superpage) + } + } +} diff --git a/base/runtime/heap_allocator_debugging.odin b/base/runtime/heap_allocator_debugging.odin new file mode 100644 index 00000000000..d055f2ce902 --- /dev/null +++ b/base/runtime/heap_allocator_debugging.odin @@ -0,0 +1,96 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +ODIN_DEBUG_HEAP :: #config(ODIN_DEBUG_HEAP, false) + +Heap_Code_Coverage_Type :: enum { + Alloc_Bin, + Alloc_Collected_Remote_Frees, + Alloc_Heap_Initialized, + Alloc_Huge, + Alloc_Slab_Wide, + Alloc_Slab_Wide_Needed_New_Superpage, + Alloc_Slab_Wide_Used_Available_Superpage, + Alloc_Zeroed_Memory, + Freed_Bin_Freed_Slab_Which_Was_Fully_Used, + Freed_Bin_Freed_Superpage, + Freed_Bin_Reopened_Full_Slab, + Freed_Bin_Updated_Slab_Next_Free_Sector, + Freed_Huge_Allocation, + Freed_Wide_Slab, + Heap_Expanded_Cache_Data, + Huge_Alloc_Size_Adjusted, + Huge_Alloc_Size_Set_To_Superpage, + Merged_Remote_Frees, + Orphaned_Superpage_Freed_Slab, + Orphaned_Superpage_Merged_Remote_Frees, + Remotely_Freed_Bin, + Remotely_Freed_Bin_Caused_Remote_Superpage_Caching, + Remotely_Freed_Wide_Slab, + Resize_Caused_Memory_Zeroing, + Resize_Crossed_Size_Categories, + Resize_Huge, + Resize_Huge_Caused_Memory_Zeroing, + Resize_Huge_Size_Adjusted, + Resize_Huge_Size_Set_To_Superpage, + Resize_Kept_Old_Pointer, + Resize_Wide_Slab_Caused_Memory_Zeroing, + Resize_Wide_Slab_Expanded_In_Place, + Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion, + Resize_Wide_Slab_From_Remote_Thread, + Resize_Wide_Slab_Kept_Old_Pointer, + Resize_Wide_Slab_Shrunk_In_Place, + Slab_Adjusted_For_Partial_Sector, + Superpage_Add_Remote_Free_Guarded_With_Masterless, + Superpage_Add_Remote_Free_Guarded_With_Set, + Superpage_Added_Remote_Free, + Superpage_Added_To_Open_Cache_By_Freeing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab, + Superpage_Added_To_Open_Cache_By_Slab, + Superpage_Adopted_From_Orphanage, + Superpage_Created_By_Empty_Orphanage, + Superpage_Freed_By_Exiting_Thread, + Superpage_Freed_By_Wide_Slab, + Superpage_Freed_On_Full_Orphanage, + Superpage_Linked, + Superpage_Cache_Block_Cleared, + Superpage_Orphaned_By_Exiting_Thread, + Superpage_Pushed_To_Orphanage, + Superpage_Registered_With_Free_Slabs, + Superpage_Registered_With_Slab_In_Use, + Superpage_Removed_From_Open_Cache_By_Slab, + Superpage_Unlinked_Non_Tail, + Superpage_Unlinked_Tail, + Superpage_Unregistered, + Superpage_Updated_Next_Free_Slab_Index, + Superpage_Updated_Next_Free_Slab_Index_As_Empty, +} + +when ODIN_DEBUG_HEAP { + heap_global_code_coverage: [Heap_Code_Coverage_Type]int // atomic +} + +@(private, disabled=!ODIN_DEBUG_HEAP) +heap_debug_cover :: #force_inline proc "contextless" (type: Heap_Code_Coverage_Type) { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_add_explicit(&heap_global_code_coverage[type], 1, .Release) + } +} + +_check_heap_code_coverage :: proc "contextless" () -> bool { + when ODIN_DEBUG_HEAP { + intrinsics.atomic_thread_fence(.Seq_Cst) + for t in heap_global_code_coverage { + if t == 0 { + return false + } + } + return true + } else { + panic_contextless("ODIN_DEBUG_HEAP is not enabled, therefore the results of this procedure are meaningless.") + } +} diff --git a/base/runtime/heap_allocator_fallback.odin b/base/runtime/heap_allocator_fallback.odin new file mode 100644 index 00000000000..25e906d7e3d --- /dev/null +++ b/base/runtime/heap_allocator_fallback.odin @@ -0,0 +1,135 @@ +#+build orca +package 
runtime + +// This is the libc malloc-based Odin allocator, used as a fallback for +// platforms that do not support virtual memory, superpages, or aligned +// allocation, or for platforms where we would prefer to use the system's +// built-in allocator through the malloc interface. + +import "base:intrinsics" + +foreign { + @(link_name="malloc") _libc_malloc :: proc "c" (size: uint) -> rawptr --- + @(link_name="calloc") _libc_calloc :: proc "c" (num, size: uint) -> rawptr --- + @(link_name="free") _libc_free :: proc "c" (ptr: rawptr) --- + @(link_name="realloc") _libc_realloc :: proc "c" (ptr: rawptr, size: uint) -> rawptr --- +} + +@(require_results) +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + if size <= 0 { + return nil + } + if zero_memory { + return _libc_calloc(1, uint(size)) + } else { + return _libc_malloc(uint(size)) + } +} + +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + new_ptr = _libc_realloc(old_ptr, uint(new_size)) + + // Section 7.22.3.5.2 of the C17 standard: "The contents of the new object + // shall be the same as that of the old object prior to deallocation, up to + // the lesser of the new and old sizes. Any bytes in the new object beyond + // the size of the old object have indeterminate values." + // + // Therefore, we zero the memory ourselves. + if zero_memory && new_size > old_size { + intrinsics.mem_zero(rawptr(uintptr(new_ptr) + uintptr(old_size)), new_size - old_size) + } + + return +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _libc_free(ptr) +} + +heap_allocator :: proc() -> Allocator { + return { + data = nil, + procedure = heap_allocator_proc, + } +} + +heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + + // Because malloc does not support alignment requests, and aligned_alloc + // has specific requirements for what sizes it supports, this allocator + // over-allocates by the alignment requested and stores the original + // pointer behind the address returned to the user. + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + padding := max(alignment, size_of(rawptr)) + ptr := heap_alloc(size + padding, mode == .Alloc) + if ptr == nil { + return nil, .Out_Of_Memory + } + shift := uintptr(padding) - uintptr(ptr) & uintptr(padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr + return byte_slice(aligned_ptr, size), nil + + case .Free: + if old_memory != nil { + heap_free(([^]rawptr)(old_memory)[-1]) + } + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + new_padding := max(alignment, size_of(rawptr)) + original_ptr := ([^]rawptr)(old_memory)[-1] + ptr: rawptr + + if alignment > align_of(rawptr) { + // The alignment is in excess of what malloc/realloc will return + // for address alignment per the C standard, so we must reallocate + // manually in order to guarantee alignment for the user. + // + // Resizing through realloc simply won't work because it's possible + // that our target address originally only needed a padding of 8 + // bytes, but if we expand the memory used and the address is moved, + // we may then need 16 bytes for proper alignment, for example. + // + // We'll copy the old data later. 
+ ptr = heap_alloc(size + new_padding, mode == .Resize) + + } else { + real_old_size := size_of(rawptr) + old_size + real_new_size := new_padding + size + + ptr = heap_resize(original_ptr, real_old_size, real_new_size, mode == .Resize) + } + + shift := uintptr(new_padding) - uintptr(ptr) & uintptr(new_padding-1) + aligned_ptr := rawptr(uintptr(ptr) + shift) + ([^]rawptr)(aligned_ptr)[-1] = ptr + + if alignment > align_of(rawptr) { + intrinsics.mem_copy_non_overlapping(aligned_ptr, old_memory, min(size, old_size)) + heap_free(original_ptr) + } + + return byte_slice(aligned_ptr, size), nil + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} diff --git a/base/runtime/heap_allocator_implementation.odin b/base/runtime/heap_allocator_implementation.odin new file mode 100644 index 00000000000..cf3f2d795c3 --- /dev/null +++ b/base/runtime/heap_allocator_implementation.odin @@ -0,0 +1,1893 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +This is the dynamic heap allocator for the Odin runtime. + +**Features** + +- Lock-free guarantee: A thread cannot deadlock or obstruct the allocator; + some thread makes progress no matter the parallel conditions. + +- Thread-local heaps: There is no allocator-induced false sharing. + +- Global storage for unused memory: When a thread finishes cleanly, its memory + is sent to a global storage where other threads can use it. + +- Headerless: All bin-sized allocations (any power of two, less than or equal + to 32KiB, by default) consume no extra space, and as a result of being tightly + packed, enhance performance with cache locality for programs. + + +**Terminology** + +- Bin: an allocation of a fixed size, shared with others of the same size category. + +- Slab: a fixed-size block within a Superpage that is divided into a constant + number of Bins at runtime based on the needs of the program. + +- Sector: a variable-sized bitmap, used to track whether a Bin is free or not. + + +**Allocation Categories** + +- Huge: anything in excess of 3/4ths of a Superpage. +- Slab-wide: anything in excess of the largest Bin size. +- Bin: anything less than or equal to the largest bin size. +*/ + +// +// Tunables +// + +// NOTE: Adjusting this constant by itself is rarely enough; `HEAP_SUPERPAGE_CACHE_RATIO` +// will have to be changed as well. 
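+// As a rough orientation for the defaults below (a sketch, assuming the usual
+// 2 MiB SUPERPAGE_SIZE and a 64-bit target): HEAP_SLAB_SIZE = 64 KiB gives
+// HEAP_MAX_BIN_SIZE = 32 KiB and 31 slabs per superpage, so a request is
+// served from a bin when it is at most 32 KiB, from a run of contiguous slabs
+// when it is larger than that but below roughly 3/4 of a superpage (1.5 MiB),
+// and from its own virtual memory mapping (a Huge allocation) beyond that.
+// For example, through the default context allocator (hypothetical sizes,
+// illustrative only):
+//
+//	small  := make([]byte,   4 * Kilobyte) // bin allocation
+//	medium := make([]byte, 256 * Kilobyte) // slab-wide allocation
+//	large  := make([]byte,   4 * Megabyte) // huge allocation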
+HEAP_SLAB_SIZE :: #config(ODIN_HEAP_SLAB_SIZE, 64 * Kilobyte) +HEAP_MAX_BIN_SIZE :: #config(ODIN_HEAP_MAX_BIN_SIZE, HEAP_SLAB_SIZE / 2) +HEAP_MIN_BIN_SIZE :: #config(ODIN_HEAP_MIN_BIN_SIZE, 8 * Byte) +HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES :: #config(ODIN_HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES, 3) +HEAP_SUPERPAGE_CACHE_RATIO :: #config(ODIN_HEAP_SUPERPAGE_CACHE_RATIO, 20) +HEAP_PANIC_ON_DOUBLE_FREE :: #config(ODIN_HEAP_PANIC_ON_DOUBLE_FREE, true) + +// +// Constants +// + +HEAP_MAX_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MAX_BIN_SIZE) +HEAP_MIN_BIN_SHIFT :: intrinsics.constant_log2(HEAP_MIN_BIN_SIZE) +HEAP_BIN_RANKS :: 1 + HEAP_MAX_BIN_SHIFT - HEAP_MIN_BIN_SHIFT +HEAP_HUGE_ALLOCATION_BOOK_KEEPING :: size_of(int) + HEAP_MAX_ALIGNMENT +HEAP_HUGE_ALLOCATION_THRESHOLD :: SUPERPAGE_SIZE / 4 * 3 +HEAP_MAX_ALIGNMENT :: 64 * Byte +HEAP_CACHE_SLAB_MAP_STRIDE :: HEAP_SUPERPAGE_CACHE_RATIO * HEAP_SLAB_COUNT +HEAP_SECTOR_TYPES :: 2 // { local_free, remote_free } +HEAP_SLAB_ALLOCATION_BOOK_KEEPING :: size_of(Heap_Slab) + HEAP_SECTOR_TYPES * size_of(uint) + HEAP_MAX_ALIGNMENT +HEAP_SLAB_COUNT :: SUPERPAGE_SIZE / HEAP_SLAB_SIZE - 1 + +@(private="file") INTEGER_BITS :: 8 * size_of(int) +@(private="file") SECTOR_BITS :: 8 * size_of(uint) + +// +// Sanity checking +// + +#assert(HEAP_MAX_BIN_SIZE & (HEAP_MAX_BIN_SIZE-1) == 0, "HEAP_MAX_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_BIN_SIZE < HEAP_SLAB_SIZE - size_of(Heap_Slab) - HEAP_SECTOR_TYPES * size_of(uint), "HEAP_MAX_BIN_SIZE must be able to fit into one Slab, including its free maps.") +#assert(HEAP_MAX_BIN_SIZE >= HEAP_MIN_BIN_SIZE, "HEAP_MAX_BIN_SIZE must be greater than or equal to HEAP_MIN_BIN_SIZE.") +#assert(HEAP_HUGE_ALLOCATION_THRESHOLD <= SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING, "HEAP_HUGE_ALLOCATION_THRESHOLD must be smaller than a Superpage, with enough space for HEAP_HUGE_ALLOCATION_BOOK_KEEPING.") +#assert(HEAP_MIN_BIN_SIZE & (HEAP_MIN_BIN_SIZE-1) == 0, "HEAP_MIN_BIN_SIZE must be a power of two.") +#assert(HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES >= 0, "HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES must be positive.") +#assert(HEAP_MAX_ALIGNMENT & (HEAP_MAX_ALIGNMENT-1) == 0, "HEAP_MAX_ALIGNMENT must be a power of two.") +#assert(HEAP_SLAB_COUNT > 0, "HEAP_SLAB_COUNT must be greater than zero.") +#assert(HEAP_SLAB_SIZE & (HEAP_SLAB_SIZE-1) == 0, "HEAP_SLAB_SIZE must be a power of two.") +#assert(SUPERPAGE_SIZE >= 2 * HEAP_SLAB_SIZE, "SUPERPAGE_SIZE must be at least twice HEAP_SLAB_SIZE.") +#assert(size_of(Heap_Superpage) < HEAP_SLAB_SIZE, "The Superpage struct must not exceed the size of a Slab.") + +// +// Utility Procedures +// + +@(require_results, private) +round_to_nearest_power_of_two :: #force_inline proc "contextless" (n: uint) -> int { + assert_contextless(n > 1, "This procedure does not handle the edge case of n < 2.") + return 1 << (INTEGER_BITS - intrinsics.count_leading_zeros(n-1)) +} + +@(require_results) +heap_slabs_needed_for_size :: #force_inline proc "contextless" (size: int) -> (result: int) { + assert_contextless(size > 0) + assert_contextless(size > HEAP_MAX_BIN_SIZE) + size := size + size += HEAP_SLAB_ALLOCATION_BOOK_KEEPING + result = size / HEAP_SLAB_SIZE + (0 if size % HEAP_SLAB_SIZE == 0 else 1) + assert_contextless(result > 0) + assert_contextless(result < HEAP_SLAB_COUNT, "Calculated an overly-large Slab-wide allocation.") + return +} + +@(require_results) +heap_round_to_bin_size :: #force_inline proc "contextless" (n: int) -> int { + if n <= HEAP_MIN_BIN_SIZE { + return HEAP_MIN_BIN_SIZE + } + m := 
round_to_nearest_power_of_two(uint(n)) + assert_contextless(m & (m-1) == 0, "Internal rounding error.") + return m +} + +@(require_results) +heap_bin_size_to_rank :: #force_inline proc "contextless" (size: int) -> (rank: int) { + // By this point, a size of zero should've been rounded up to HEAP_MIN_BIN_SIZE. + assert_contextless(size > 0, "Size must be greater-than zero.") + assert_contextless(size & (size-1) == 0, "Size must be a power of two.") + rank = intrinsics.count_trailing_zeros(size >> HEAP_MIN_BIN_SHIFT) + assert_contextless(rank <= HEAP_BIN_RANKS, "Bin rank calculated incorrectly; it must be less-than-or-equal-to HEAP_BIN_RANKS.") + return +} + +@(require_results) +find_superpage_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Superpage { + return cast(^Heap_Superpage)(uintptr(ptr) & ~uintptr(SUPERPAGE_SIZE-1)) +} + +@(require_results) +find_slab_from_pointer :: #force_inline proc "contextless" (ptr: rawptr) -> ^Heap_Slab { + return cast(^Heap_Slab)(uintptr(ptr) & ~uintptr(HEAP_SLAB_SIZE-1)) +} + +/* +Get a specific slab by index from a superpage. + +The position deltas are all constant with respect to the origin, hence this +procedure should prove faster than accessing an array of pointers. +*/ +@(require_results) +heap_superpage_index_slab :: #force_inline proc "contextless" (superpage: ^Heap_Superpage, index: int) -> (slab: ^Heap_Slab) { + assert_contextless(index >= 0, "The heap allocator tried to index a negative slab index.") + assert_contextless(index < HEAP_SLAB_COUNT, "The heap allocator tried to index a slab beyond the configured maximum.") + return cast(^Heap_Slab)(uintptr(superpage) + HEAP_SLAB_SIZE * uintptr(1 + index)) +} + +// +// Data Structures +// + +/* +The **Slab** is a fixed-size division of a Superpage, configured at runtime to +contain fixed-size allocations. It uses two bitmaps to keep track of the +state of its bins: whether a bin is locally free or remotely free. + +It is a Slab allocator in its own right, hence the name. + +Each allocation is self-aligned, up to an alignment size of `HEAP_MAX_ALIGNMENT` +(64 bytes by default). The slab itself is aligned to its size, allowing +allocations within to find their owning slab in constant time. + +Remote threads, in an atomic wait-free manner, flip bits across `remote_free` +to signal that they have freed memory which does not belong to them. The owning +thread will then collect those bits when the slab is approaching fullness. + + +**Fields**: + +`index` is used to make self-referencing easier. It must be the first field in +the Slab, as `heap_slab_clear_data` depends on this. + +`bin_size` tracks the precise byte size of the allocations. + +`is_dirty` indicates that the slab has been used in the past and its memory may +not be entirely zeroed. + +`is_full` is true at some point after `free_bins` == 0 and false otherwise. +It is atomic and used for inter-thread communication. + +`has_remote_frees` is true at some point after a bit has been flipped on in +`remote_free`. It is atomic and used for inter-thread communication. + +`max_bins` is set at initialization with the number of bins allotted. + +`free_bins` counts the exact number of free bins known to the allocating +thread. This value does not yet include any remote free bins. + +`dirty_bins` tracks which bins have been used and must be zeroed before being +handed out again. Because we always allocate linearly, this is simply an +integer indicating the greatest index the slab has ever distributed. 
+ +`next_free_sector` points to which `uint` in `local_free` contains a free bit. +It is always the lowest index possible. + +`sectors` is the length of each bitmap. + +`local_free` tracks which bins are free. + +`remote_free` tracks which bins have been freed by other threads. +It is atomic. + +`data` points to the first bin and is used for calculating bin positions. +*/ +Heap_Slab :: struct { + index: int, // This field must be the first. + bin_size: int, + is_dirty: bool, + + max_bins: int, + free_bins: int, + dirty_bins: int, + next_free_sector: int, + + is_full: bool, // atomic + remote_free_bins_scheduled: int, // atomic + + sectors: int, + local_free: [^]uint, + remote_free: [^]uint, // data referenced is atomic + + // NOTE: `remote_free` and its contents should be fine with regards to + // alignment for the purposes of atomic access, as every field in a Slab is + // at least register-sized. + // + // Atomically accessing memory that is not aligned to the register size + // may cause an issue on some systems. + + data: uintptr, + + /* ... local_free's data ... */ + /* ... remote_free's data ... */ + /* ... allocation data ... */ +} + +/* +The **Superpage** is a single allocation from the operating system's virtual +memory subsystem, always with a fixed size at compile time, that is used to +store almost every allocation requested by the program in sub-allocators known +as Slabs. + +On almost every platform, this structure will be 2MiB by default. + +Depending on the operating system, addresses within the space occupied by the +superpage (and hence its allocations) may also have faster access times due to +leveraging properties of the Translation Lookaside Buffer. + +It is always aligned to its size, allowing any address allocated from its space +to look up the owning superpage in constant-time with bit masking. + +**Fields:** + +`huge_size` is precisely how many bytes have been allocated to the address at +which this structure resides, if it is a Huge allocation. It is zero for normal +superpages. + +`prev` points to the previous superpage in the doubly-linked list of all +superpages for a single thread's heap. + +`next` points to the next superpage in the same doubly-linked list. It is +accessed with atomics only when engaging with the orphanage, as that is the +only time it should change in a parallel situation. For all other cases, the +thread which owns the superpage is the only one to read this field. + +`remote_free_set` is true if there might be a slab with remote frees in this superpage. + +`owner` is the value of `current_thread_id` for the thread which owns this superpage.. + +`master_cache_block` points to the `local_heap_cache` for the thread which owns this +superpage. This field is used to help synchronize remote freeing. + +`free_slabs` is the count of slabs which are ready to use for new bin ranks. + +`next_free_slab_index` is the lowest index of a free slab. +A value of `HEAP_SLAB_COUNT` means there are no free slabs. + +`cache_block` is the space where data for the heap's cache is stored, if this +superpage has been claimed by the heap. It should otherwise be all zero. +*/ +Heap_Superpage :: struct { + huge_size: int, // This field must be the first. 
+ prev: ^Heap_Superpage, + next: ^Heap_Superpage, // atomic in orphanage, otherwise non-atomic + + remote_free_set: bool, // atomic + owner: int, // atomic + master_cache_block: ^Heap_Cache_Block, // atomic + + free_slabs: int, + next_free_slab_index: int, + + cache_block: Heap_Cache_Block, +} + +/* +`Heap_Cache_Block` is a structure that lives within each `Heap_Superpage` that has been +claimed by a thread's heap to store heap-relevant metadata for improving +allocator performance. This structure makes use of space that would have +otherwise been unused due to the particular alignment needs the allocator has. + +The number of superpages that each `Heap_Cache_Block` oversees is dictated by the +`HEAP_SUPERPAGE_CACHE_RATIO` constant. As a thread's heap accumulates more +superpages, the heap will need to claim additional superpages - at the rate of +that constant - to keep track of all the possible data. + +The arrays are configured in such a manner that they can never overflow, and +the heap will always have more than enough space to record the data needed. + + +The `HEAP_CACHE_SLAB_MAP_STRIDE` is of note, as it must contain the number of slabs +per superpage (31, by default) times the number of superpages overseen per +`Heap_Cache_Block`. This is largely where all the space is allocated, but it pays off +handsomely in allowing the allocator to find if a slab is available for any +particular bin size in constant time. + +In practice, this is an excessive amount of space allocated for a map of arrays +for this purpose, but it is entirely possible that the allocator could get into +a spot where this hash map could overflow if it used any less space. For +instance, if the allocator had as many superpages as possible for one +`Heap_Cache_Block`, filled all the slabs, then the program freed one bin from each +slab, the allocator would need every slot in the map. The cache has as much +space to prevent just that problem from arising. + +**Fields:** + +`in_use` will be true, if the parent `Heap_Superpage` is using the space occupied by +this struct. This indicates that the superpage should not be freed. + +`next_cache_block` is an atomic pointer to the next `Heap_Cache_Block`. Each +`Heap_Cache_Block` is linked together in this way to allow each heap to expand +its available space for tracking information. + + +_The next three fields are only used in the `Heap_Superpage` pointed to by `local_heap`._ + +`length` is how many `Heap_Cache_Block` structs are in use across the local +thread's heap. + +`owned_superpages` is how many superpages are in use by the local thread's +heap. + +`remote_free_count` is an estimate of how many superpages have slabs with +remote frees available to merge. + + +_The next three fields constitute the main data used in this struct, and each +of them are like partitions, spread across a linked list._ + +`slab_map` is a hash map of arrays, keyed by bin rank, used to quickly find a +slab with free bins for a particular bin size. It uses linear probing. + +`superpages_with_free_slabs` is an array of superpages with `free_slabs` +greater than zero, used for quickly allocating new slabs when none can be found +in `slab_map`. + +`superpages_with_remote_frees` is an atomic set containing references to +superpages that may have slabs with remotely free bins. Superpages are only +added if the slab was full at the time, and an invasive flag keeps the +algorithm from having to search the entire span for an existence check. 
+ +Keep in mind that even though a superpage is added to this set when a slab is +full and freed remotely, that state need not persist; the owning thread can +free a bin before the remote frees are merged, invalidating that part of the +heuristic. +*/ +Heap_Cache_Block :: struct { + in_use: bool, + next_cache_block: ^Heap_Cache_Block, // atomic + + // { only used in `local_heap` + length: int, + owned_superpages: int, + remote_free_count: int, // atomic + // } + + slab_map: [HEAP_CACHE_SLAB_MAP_STRIDE*HEAP_BIN_RANKS]^Heap_Slab, + superpages_with_free_slabs: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, + superpages_with_remote_frees: [HEAP_SUPERPAGE_CACHE_RATIO]^Heap_Superpage, // atomic +} + +// +// Superstructure Allocation +// + +/* +Allocate memory for a Superpage from the operating system and do any initialization work. +*/ +@(require_results) +heap_make_superpage :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = cast(^Heap_Superpage)allocate_virtual_memory_superpage() + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.owner = get_current_thread_id() + superpage.free_slabs = HEAP_SLAB_COUNT + + // Each Slab is aligned to its size, so that finding which slab an + // allocated pointer is assigned to is a constant operation by bit masking. + // + // However, this means we must waste one Slab per Superpage so that the + // Superpage data can live nearby inside the chunk of virtual memory. + base := uintptr(superpage) + HEAP_SLAB_SIZE + for i in 0..= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_On_Full_Orphanage) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Pushed_To_Orphanage) + } +} + +/* +Remove a superpage from a heap's cache of superpages. +*/ +heap_cache_unregister_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage.cache_block.in_use { + return + } + local_heap_cache.owned_superpages -= 1 + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Release) + heap_debug_cover(.Superpage_Unregistered) +} + +/* +Add a superpage to a heap's cache of superpages. + +This will take note if the superpage has free slabs, what are the contents of +its slabs, and it will merge any waiting remote free bins. +*/ +heap_cache_register_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + // Expand the heap cache's available space if needed. + local_heap_cache.owned_superpages += 1 + if local_heap_cache.owned_superpages / HEAP_SUPERPAGE_CACHE_RATIO > local_heap_cache.length { + tail := local_heap_cache + for /**/; tail.next_cache_block != nil; tail = tail.next_cache_block { } + heap_cache_expand(tail) + } + + // This must come after expansion to prevent a remote thread from running out of space. + // The cache is first expanded, _then_ other threads may know about it. + intrinsics.atomic_store_explicit(&superpage.master_cache_block, local_heap_cache, .Release) + + // Register the superpage for allocation. + if superpage.free_slabs > 0 { + // Register the superpage as having free slabs available. + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Registered_With_Free_Slabs) + } + + // Register slabs. 
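+ // Walk every slab header: continuation slabs of wide allocations are
+ // skipped, remote frees that accumulated while the superpage sat in the
+ // orphanage are merged, slabs that turn out to be completely empty are
+ // released, and partially used bin slabs are re-added to the slab map.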
+ if superpage.free_slabs < HEAP_SLAB_COUNT { + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + // When adopting a new Superpage, we take the opportunity to + // merge any remote frees. This is important because it's + // possible for another thread to remotely free memory while + // the thread which owned it is in limbo. + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + } + + // Free any empty Slabs and register the ones with free bins. + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + } else if slab.bin_size <= HEAP_MAX_BIN_SIZE && slab.free_bins > 0 { + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + } + + i += 1 + heap_debug_cover(.Superpage_Registered_With_Slab_In_Use) + } + } +} + +/* +Get a superpage, first by adopting any orphan, or allocating memory from the +operating system if one is not available. +*/ +@(require_results) +heap_get_superpage:: proc "contextless" () -> (superpage: ^Heap_Superpage) { + superpage = heap_pop_orphan() + if superpage == nil { + superpage = heap_make_superpage() + heap_debug_cover(.Superpage_Created_By_Empty_Orphanage) + } else { + heap_debug_cover(.Superpage_Adopted_From_Orphanage) + } + assert_contextless(superpage != nil, "The heap allocator failed to get a superpage.") + return +} + +/* +Make an allocation in the Huge category. +*/ +@(require_results) +heap_make_huge_allocation :: proc "contextless" (size: int) -> (ptr: rawptr) { + // NOTE: ThreadSanitizer may wrongly say that this is the source of a data + // race. This is because a virtual memory address has been allocated once, + // returned to the operating system, then given back to the process in a + // different thread. + // + // It is otherwise impossible for us to race on newly allocated memory. + size := size + if size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + size = SUPERPAGE_SIZE + heap_debug_cover(.Huge_Alloc_Size_Set_To_Superpage) + } else { + size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Huge_Alloc_Size_Adjusted) + } + assert_contextless(size >= SUPERPAGE_SIZE, "Calculated incorrect Huge allocation size.") + + // All free operations assume every pointer has a Superpage at the + // Superpage boundary of the pointer, and a Huge allocation is no + // different. + // + // The size of the allocation is written as an integer at the beginning, + // but all other fields are left zero-initialized. We then align forward to + // `HEAP_MAX_ALIGNMENT` (64 bytes, by default) and give that to the user. 
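+ // With the defaults on a 64-bit target this works out to superpage + 64:
+ // HEAP_HUGE_ALLOCATION_BOOK_KEEPING is 8 + 64 = 72 bytes, and rounding
+ // superpage + 72 down to the nearest 64-byte boundary yields superpage + 64,
+ // so `huge_size` sits in the first 8 bytes and the rest is padding.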
+ superpage := cast(^Heap_Superpage)allocate_virtual_memory_aligned(size, SUPERPAGE_SIZE) + assert_contextless(uintptr(superpage) & uintptr(SUPERPAGE_SIZE-1) == 0, "The operating system returned virtual memory which isn't aligned to a Superpage-sized boundary.") + + superpage.huge_size = size + + u := uintptr(superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Huge allocation is not aligned to HEAP_MAX_ALIGNMENT.") + assert_contextless(find_superpage_from_pointer(ptr) == superpage, "Huge allocation reverse lookup failed.") + + return +} + +/* +Make an allocation that is at least one entire Slab wide from the provided superpage. + +This will return false if the Superpage lacks enough contiguous Slabs to fit the size. +*/ +@(require_results) +heap_make_slab_sized_allocation :: proc "contextless" (superpage: ^Heap_Superpage, size: int) -> (ptr: rawptr, ok: bool) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "Invalid next_free_slab_index.") + contiguous := heap_slabs_needed_for_size(size) + + find_run: for start := superpage.next_free_slab_index; start < HEAP_SLAB_COUNT-contiguous+1; /**/ { + n := contiguous + + for i := start; i < HEAP_SLAB_COUNT; /**/ { + bin_size := heap_superpage_index_slab(superpage, i).bin_size + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + start = i + heap_slabs_needed_for_size(bin_size) + continue find_run + } else if bin_size != 0 { + start = i + 1 + continue find_run + } + n -= 1 + if n > 0 { + i += 1 + continue + } + // Setup the Slab header. + // This will be a single-sector Slab that may span several Slabs. + slab := heap_superpage_index_slab(superpage, start) + + // Setup slab. + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = size + slab.is_full = true + slab.max_bins = 1 + slab.dirty_bins = 1 + slab.sectors = 1 + slab.local_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab)) + slab.remote_free = cast([^]uint)(uintptr(slab) + size_of(Heap_Slab) + 1 * size_of(uint)) + data := uintptr(slab) + HEAP_SLAB_ALLOCATION_BOOK_KEEPING + ptr = rawptr(data - data & (HEAP_MAX_ALIGNMENT-1)) + slab.data = uintptr(ptr) + assert_contextless(uintptr(ptr) & (HEAP_MAX_ALIGNMENT-1) == 0, "Slab-wide allocation's data pointer is not correctly aligned.") + assert_contextless(int(uintptr(ptr) - uintptr(superpage)) + size < SUPERPAGE_SIZE, "Incorrectly calculated Slab-wide allocation exceeds Superpage end boundary.") + + // Wipe any non-zero data from slabs ahead of the header. + for x in start+1..=i { + next_slab := heap_superpage_index_slab(superpage, x) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + + // Update statistics. + superpage.free_slabs -= contiguous + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a superpage's free_slabs to go negative.") + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } + // NOTE: Start from zero again, because we may have skipped a non-contiguous block. + heap_update_next_free_slab_index(superpage, 0) + + return ptr, true + } + } + + return +} + +// +// Slabs +// + +/* +Do everything that is needed to ready a Slab for a specific bin size. + +This involves a handful of space calculations and writing the header. 
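+
+For example, with the default 64 KiB slab and a 4 KiB bin size on a typical
+64-bit target: the slab starts out with 16 candidate bins; the header, the two
+bitmaps, and the alignment padding all fit inside a single bin, so one bin is
+given up for book-keeping, leaving 15 usable bins tracked by a single sector.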
+*/ +@(require_results) +heap_slab_setup :: proc "contextless" (superpage: ^Heap_Superpage, rounded_size: int) -> (slab: ^Heap_Slab) { + assert_contextless(0 <= superpage.next_free_slab_index && superpage.next_free_slab_index < HEAP_SLAB_COUNT, "The heap allocator found a Superpage with an invalid next_free_slab_index.") + assert_contextless(superpage.free_slabs > 0, "The heap allocator tried to setup a Slab in an exhausted Superpage.") + + superpage.free_slabs -= 1 + assert_contextless(superpage.free_slabs >= 0, "The heap allocator caused a Superpage's free_slabs to go negative.") + + slab = heap_superpage_index_slab(superpage, superpage.next_free_slab_index) + if slab.is_dirty { + heap_slab_clear_data(slab) + } + slab.bin_size = rounded_size + + // The book-keeping structures compete for the same space as the data, + // so we have to go back and forth with the math a bit. + bins := HEAP_SLAB_SIZE / rounded_size + sectors := bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + // We'll waste `2 * HEAP_MAX_ALIGNMENT` bytes per slab to simplify the math + // behind getting the pointer to the first bin to align properly. + // Otherwise we'd have to go back and forth even more. + bookkeeping_bin_cost := max(1, int(size_of(Heap_Slab) + HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + 2 * HEAP_MAX_ALIGNMENT) / rounded_size) + bins -= bookkeeping_bin_cost + sectors = bins / INTEGER_BITS + sectors += 0 if bins % INTEGER_BITS == 0 else 1 + + slab.sectors = sectors + slab.free_bins = bins + slab.max_bins = bins + + base_alignment := uintptr(min(HEAP_MAX_ALIGNMENT, rounded_size)) + + slab_bitmap_base := uintptr(slab) + size_of(Heap_Slab) + total_byte_size_of_all_bitmaps := HEAP_SECTOR_TYPES * uintptr(sectors) * size_of(uint) + pointer_padding := (uintptr(base_alignment) - (slab_bitmap_base + total_byte_size_of_all_bitmaps)) & uintptr(base_alignment - 1) + + // These bitmaps are placed at the end of the struct, one after the other. + slab.local_free = cast([^]uint)(slab_bitmap_base) + slab.remote_free = cast([^]uint)(slab_bitmap_base + uintptr(sectors) * size_of(uint)) + // This pointer is specifically aligned. + slab.data = (slab_bitmap_base + total_byte_size_of_all_bitmaps + pointer_padding) + + assert_contextless(slab.data & (base_alignment-1) == 0, "Incorrect calculation for aligning Slab data pointer.") + assert_contextless(size_of(Heap_Slab) + int(total_byte_size_of_all_bitmaps) + bins * rounded_size < HEAP_SLAB_SIZE, "Slab internal allocation overlimit.") + assert_contextless(find_slab_from_pointer(rawptr(slab.data)) == slab, "Slab data pointer cannot be traced back to its Slab.") + + // Set all of the local free bits. + { + full_sectors := bins / INTEGER_BITS + partial_sector := bins % INTEGER_BITS + for i in 0.. 0 { + slab.local_free[sectors-1] = (1 << uint(bins % INTEGER_BITS)) - 1 + heap_debug_cover(.Slab_Adjusted_For_Partial_Sector) + } + } + + // Update the next free slab. + heap_update_next_free_slab_index(superpage, superpage.next_free_slab_index + 1) + + // Make the slab known to the heap. + heap_cache_add_slab(slab, heap_bin_size_to_rank(rounded_size)) + + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Removed_From_Open_Cache_By_Slab) + } + + return +} + +/* +Set everything after the index field to zero in a Slab. 
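+
+The `index` field is preserved because it is the first field of the Slab and
+records the slab's fixed position within its superpage, which never changes
+between reuses.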
+*/ +heap_slab_clear_data :: proc "contextless" (slab: ^Heap_Slab) { + intrinsics.mem_zero_volatile(rawptr(uintptr(slab) + size_of(int)), HEAP_SLAB_SIZE - size_of(int)) +} + +/* +Mark a Slab-wide allocation as no longer in use by the Superpage. +*/ +heap_free_wide_slab :: proc "contextless" (superpage: ^Heap_Superpage, slab: ^Heap_Slab) { + assert_contextless(slab.bin_size > HEAP_MAX_BIN_SIZE, "The heap allocator tried to wide-free a non-wide slab.") + // There is only one bit for a Slab-wide allocation, so this will be easy. + contiguous := heap_slabs_needed_for_size(slab.bin_size) + previously_full_superpage := superpage.free_slabs == 0 + superpage.free_slabs += contiguous + + for i in slab.index.. (bins_left: int) { + assert_contextless(slab.bin_size > 0, "The heap allocator tried to merge remote frees on an unused slab.") + + merged_bins := 0 + next_free_sector := slab.next_free_sector + + // Atomically merge in all of the bits set by other threads. + for i in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(bin_size) + } else { + i += 1 + } + } + + panic_contextless("The heap allocator was unable to find a free slab in a superpage with free_slabs > 0.") +} + +// +// The Heap Cache +// + +/* +Link a new superpage into the heap. +*/ +heap_link_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(superpage.prev == nil, "The heap allocator tried to link an already-linked superpage.") + superpage.prev = local_heap_tail + local_heap_tail.next = superpage + local_heap_tail = superpage + heap_debug_cover(.Superpage_Linked) +} + +/* +Unlink a superpage from the heap. + +There must always be at least one superpage in the heap after the first +non-Huge allocation, in order to track heap metadata. +*/ +heap_unlink_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if superpage == local_heap_tail { + assert_contextless(superpage.next == nil, "The heap allocator's tail superpage has a next link.") + assert_contextless(superpage.prev != nil, "The heap allocator's tail superpage has no previous link.") + local_heap_tail = superpage.prev + // We never unlink all superpages, so no need to check validity here. + superpage.prev.next = nil + heap_debug_cover(.Superpage_Unlinked_Tail) + return + } + if superpage.prev != nil { + superpage.prev.next = superpage.next + } + if superpage.next != nil { + superpage.next.prev = superpage.prev + } + heap_debug_cover(.Superpage_Unlinked_Non_Tail) +} + +/* +Mark a superpage from another thread as having had a bin remotely freed. +*/ +heap_remote_cache_add_remote_free_superpage :: proc "contextless" (superpage: ^Heap_Superpage) { + if intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + // Already set. + heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Set) + return + } + master := intrinsics.atomic_load_explicit(&superpage.master_cache_block, .Acquire) + if master == nil { + // This superpage is not owned by anyone. + // Its remote frees will be acknowledged in whole when it's adopted. 
+ heap_debug_cover(.Superpage_Add_Remote_Free_Guarded_With_Masterless) + return + } + defer heap_debug_cover(.Superpage_Added_Remote_Free) + + cache := master + for { + for i := 0; i < len(cache.superpages_with_remote_frees); i += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache.superpages_with_remote_frees[i], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "A remote thread found a duplicate of a superpage in a heap's superpages_with_remote_frees.") + if swapped { + intrinsics.atomic_add_explicit(&master.remote_free_count, 1, .Release) + return + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Acquire) + assert_contextless(next_cache_block != nil, "A remote thread failed to find free space for a new entry in another heap's superpages_with_remote_frees.") + cache = next_cache_block + } +} + +/* +Claim an additional superpage for the heap cache. +*/ +heap_cache_expand :: proc "contextless" (cache: ^Heap_Cache_Block) { + superpage := find_superpage_from_pointer(cache) + assert_contextless(superpage.next != nil, "The heap allocator tried to expand its cache but has run out of linked superpages.") + new_next_cache_block := &(superpage.next).cache_block + assert_contextless(new_next_cache_block.in_use == false, "The heap allocator tried to expand its cache, but the candidate superpage is already using its cache block.") + new_next_cache_block.in_use = true + intrinsics.atomic_store_explicit(&cache.next_cache_block, new_next_cache_block, .Release) + local_heap_cache.length += 1 + heap_debug_cover(.Heap_Expanded_Cache_Data) +} + +/* +Add a slab to the heap's cache, keyed to the bin size rank of `rank`. +*/ +heap_cache_add_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + assert_contextless(slab != nil) + cache := local_heap_cache + for { + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + assert_contextless(cache.slab_map[i] != slab, "The heap allocator found a duplicate entry in its slab map.") + if cache.slab_map[i] == nil { + cache.slab_map[i] = slab + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Get a slab from the heap's cache for an allocation of `rounded_size` bytes. + +If no slabs are already available for the bin rank, one will be created. +*/ +@(require_results) +heap_cache_get_slab :: proc "contextless" (rounded_size: int) -> (slab: ^Heap_Slab) { + rank := heap_bin_size_to_rank(rounded_size) + slab = local_heap_cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE] + if slab == nil { + superpage := local_heap_cache.superpages_with_free_slabs[0] + if superpage == nil { + superpage = heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + } + assert_contextless(superpage.free_slabs > 0) + slab = heap_slab_setup(superpage, rounded_size) + } + return +} + +/* +Remove a slab with the corresponding bin rank from the heap's cache. +*/ +heap_cache_remove_slab :: proc "contextless" (slab: ^Heap_Slab, rank: int) { + cache := local_heap_cache + assert_contextless(cache.in_use) + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank))) + for { + assert_contextless(cache != nil) + start := rank * HEAP_CACHE_SLAB_MAP_STRIDE + for i := start; i < start+HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + if cache.slab_map[i] == slab { + // Swap with the tail. 
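+ // The stride for each bin rank is kept densely packed: the entry being
+ // removed is overwritten with the last occupied entry of that stride
+ // (which may live in a later cache block), and that tail slot is cleared.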
+ source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < start+HEAP_CACHE_SLAB_MAP_STRIDE; j += 1 { + if target_cache.slab_map[j] == nil { + cache.slab_map[source_i] = target_cache.slab_map[j-1] + target_cache.slab_map[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the stride and we have + // run out of space to search. + cache.slab_map[source_i] = nil + assert_contextless(source_i % (HEAP_CACHE_SLAB_MAP_STRIDE-1) == 0, "The heap allocator tried to remove a non-terminal slab map entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.slab_map[start] == nil { + // The starting bucket in the next cache block is empty, + // so we terminate on the current stride's final bucket. + cache.slab_map[source_i] = target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] + target_cache.slab_map[start+HEAP_CACHE_SLAB_MAP_STRIDE-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = start + } + } + } + // Entry must be in the expanded cache blocks. + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Make an allocation using contiguous slabs as the backing. + +This procedure will check through the heap's cache for viable superpages to +support the allocation. +*/ +@(require_results) +heap_cache_get_contiguous_slabs :: proc "contextless" (size: int) -> (ptr: rawptr) { + contiguous := heap_slabs_needed_for_size(size) + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + // No superpages with free slabs left. + heap_debug_cover(.Alloc_Slab_Wide_Needed_New_Superpage) + for { + superpage := heap_get_superpage() + heap_link_superpage(superpage) + heap_cache_register_superpage(superpage) + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } else { + heap_debug_cover(.Alloc_Slab_Wide_Used_Available_Superpage) + superpage := cache.superpages_with_free_slabs[i] + if superpage.free_slabs >= contiguous { + alloc, ok := heap_make_slab_sized_allocation(superpage, size) + if ok { + return alloc + } + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Add a superpage with free slabs to the heap's cache. +*/ +heap_cache_add_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id(), "The heap allocator tried to cache a superpage that does not belong to it.") + cache := local_heap_cache + + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == nil { + cache.superpages_with_free_slabs[i] = superpage + return + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +/* +Remove a superpage from the heap's cache for superpages with free slabs. +*/ +heap_cache_remove_superpage_with_free_slabs :: proc "contextless" (superpage: ^Heap_Superpage) { + cache := local_heap_cache + for { + for i := 0; i < len(cache.superpages_with_free_slabs); i += 1 { + if cache.superpages_with_free_slabs[i] == superpage { + // Swap with the tail. 
+ source_i := i + target_cache := cache + i += 1 + for { + for j := i; j < len(cache.superpages_with_free_slabs); j += 1 { + if target_cache.superpages_with_free_slabs[j] == nil { + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[j-1] + target_cache.superpages_with_free_slabs[j-1] = nil + return + } + } + if target_cache.next_cache_block == nil { + // The entry is at the end of the list and we have run + // out of space to search. + cache.superpages_with_free_slabs[source_i] = nil + assert_contextless(source_i == len(cache.superpages_with_free_slabs), "The heap allocator tried to remove a non-terminal superpage with free slabs entry when it should have swapped it with the tail.") + return + } else if target_cache.next_cache_block.superpages_with_free_slabs[0] == nil { + // The next list section is empty, so we have to end here. + cache.superpages_with_free_slabs[source_i] = target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] + target_cache.superpages_with_free_slabs[len(cache.superpages_with_free_slabs)-1] = nil + return + } + target_cache = target_cache.next_cache_block + // Reset `i` after the first iteration. + i = 0 + } + } + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } +} + +// +// Superpage Orphanage +// + +// This construction is used to avoid the ABA problem. +// +// Virtually all systems we support should have a 64-bit CAS to make this work. +Tagged_Pointer :: bit_field u64 { + // Intel 5-level paging uses up to 56 bits of a pointer on x86-64. + // This should be enough to cover ARM64, too. + // + // We use an `i64` here to maintain sign extension on the upper bits for + // systems where this is relevant, hence the extra bit over 56. + pointer: i64 | 57, + // We only need so many bits that enough transactions don't happen so fast + // as to roll over the value to cause a situation where ABA can manifest. + version: u8 | 7, +} + +/* +Put a Superpage into the global orphanage. + +The caller is responsible for fetch-adding the count. +*/ +heap_push_orphan :: proc "contextless" (superpage: ^Heap_Superpage) { + assert_contextless(intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != 0, "The heap allocator tried to push an unowned superpage to the orphanage.") + + // The algorithm below is one of the well-known methods of resolving the + // ABA problem, known as a tagged pointer. The gist is that if another + // thread has changed the value, we'll be able to detect that by checking + // against the version bits. + old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + intrinsics.atomic_store_explicit(&superpage.master_cache_block, nil, .Seq_Cst) + // NOTE: This next instruction must not float above the previous one, as + // this superpage could host the `master_cache_block`. The order is important + // to keep other threads from trying to access it while we're clearing it. + if superpage.cache_block.in_use { + intrinsics.mem_zero_volatile(&superpage.cache_block, size_of(Heap_Cache_Block)) + heap_debug_cover(.Superpage_Cache_Block_Cleared) + } + superpage.prev = nil + intrinsics.atomic_store_explicit(&superpage.owner, 0, .Release) + for { + // NOTE: `next` is accessed atomically when pushing or popping from the + // orphanage, because this field must synchronize with other threads at + // this point. + // + // This has to do mainly with swinging the head's linking pointer. 
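
A self-contained sketch, separate from the patch, of why the 7-bit version defeats ABA: the orphanage head is compared and swapped as a single 64-bit word, so a stale snapshot whose pointer bits happen to match the current head again still fails the CAS, because the version bits have moved on. The type below only mirrors the Tagged_Pointer layout above.

package main

import "core:fmt"

// Mirrors the Tagged_Pointer layout above: 57 sign-extended pointer bits
// plus a small version counter, packed into one CAS-able 64-bit word.
Demo_Tagged_Pointer :: bit_field u64 {
	pointer: i64 | 57,
	version: u8  | 7,
}

main :: proc() {
	x: int
	p := uintptr(&x)

	head: Demo_Tagged_Pointer
	head.pointer = i64(p)
	head.version = 3

	// Snapshot the head as a plain word, the way the CAS loop does.
	snapshot := transmute(u64)head

	// Another thread pops and later pushes the very same superpage: the
	// pointer bits return to their old value, but the version has moved on.
	head.version += 2
	current := transmute(u64)head

	fmt.println(uintptr(head.pointer) == p) // true: same pointer bits
	fmt.println(current == snapshot)        // false: a CAS on the stale snapshot fails
}

As the comment above notes, this scheme only works where a 64-bit compare-and-swap is available, which covers the targets this allocator builds for.
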
+ // + // Beyond this point, the thread which owns the superpage will be the + // only one to read `next`, hence why it is not read atomically + // anywhere else. + intrinsics.atomic_store_explicit(&superpage.next, cast(^Heap_Superpage)uintptr(old_head.pointer), .Release) + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(superpage)) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } +} + +/* +Remove and return the first entry from the Superpage orphanage, which may be nil. +*/ +@(require_results) +heap_pop_orphan :: proc "contextless" () -> (superpage: ^Heap_Superpage) { + old_head := transmute(Tagged_Pointer)intrinsics.atomic_load_explicit(cast(^u64)&heap_orphanage, .Relaxed) + for { + superpage = cast(^Heap_Superpage)uintptr(old_head.pointer) + if superpage == nil { + return + } + new_head: Tagged_Pointer = --- + new_head.pointer = i64(uintptr(intrinsics.atomic_load_explicit(&superpage.next, .Acquire))) + new_head.version = old_head.version + 1 + + old_head_, swapped := intrinsics.atomic_compare_exchange_weak_explicit(cast(^u64)&heap_orphanage, transmute(u64)old_head, transmute(u64)new_head, .Acq_Rel, .Relaxed) + if swapped { + intrinsics.atomic_store_explicit(&superpage.next, nil, .Release) + intrinsics.atomic_store_explicit(&superpage.owner, get_current_thread_id(), .Release) + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Release) + break + } + old_head = transmute(Tagged_Pointer)old_head_ + } + return +} + +// +// Globals +// + +@(init, private) +setup_superpage_orphanage :: proc "contextless" () { + when !VIRTUAL_MEMORY_SUPPORTED { + return + } + + // Upon a thread's clean exit, this procedure will compact its heap and + // distribute the superpages into the orphanage, if it has space. + add_thread_local_cleaner(proc "odin" () { + for superpage := local_heap; superpage != nil; /**/ { + next_superpage := superpage.next + // The following logic is a specialized case of the same found in + // `compact_heap` that ignores the cache since there's no need to + // update it when the thread's heap is being broken down. + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + if slab.bin_size == 0 { + continue + } + } + + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + + if slab.free_bins > 0 { + // Synchronize with any thread that might be trying to + // free as we merge. 
+ intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + } + heap_debug_cover(.Orphaned_Superpage_Merged_Remote_Frees) + } + + if slab.free_bins == slab.max_bins { + if slab.bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + heap_free_slab(superpage, slab) + } + heap_debug_cover(.Orphaned_Superpage_Freed_Slab) + } + } + + if superpage.free_slabs == HEAP_SLAB_COUNT { + if intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Acq_Rel) >= HEAP_MAX_EMPTY_ORPHANED_SUPERPAGES { + intrinsics.atomic_sub_explicit(&heap_orphanage_count, 1, .Relaxed) + + free_virtual_memory(superpage, SUPERPAGE_SIZE) + heap_debug_cover(.Superpage_Freed_By_Exiting_Thread) + } else { + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + } else { + intrinsics.atomic_add_explicit(&heap_orphanage_count, 1, .Release) + heap_push_orphan(superpage) + heap_debug_cover(.Superpage_Orphaned_By_Exiting_Thread) + } + + superpage = next_superpage + } + }) +} + +// This is a lock-free, intrusively singly-linked list of Superpages that are +// not in any thread's heap. They are free for adoption by other threads +// needing memory. +heap_orphanage: Tagged_Pointer + +// This is an _estimate_ of the number of Superpages in the orphanage. It can +// never be entirely accurate due to the nature of the design. +heap_orphanage_count: int + +@(thread_local) local_heap: ^Heap_Superpage +@(thread_local) local_heap_tail: ^Heap_Superpage +@(thread_local) local_heap_cache: ^Heap_Cache_Block + +// +// API +// + +/* +Allocate an arbitrary amount of memory from the heap and optionally zero it. +*/ +@(require_results) +heap_alloc :: proc "contextless" (size: int, zero_memory: bool = true) -> (ptr: rawptr) { + assert_contextless(size >= 0, "The heap allocator was given a negative size.") + + // Handle Huge allocations. + if size >= HEAP_HUGE_ALLOCATION_THRESHOLD { + heap_debug_cover(.Alloc_Huge) + return heap_make_huge_allocation(size) + } + + // Initialize the heap if needed. + if local_heap == nil { + local_heap = heap_get_superpage() + local_heap_tail = local_heap + local_heap_cache = &local_heap.cache_block + local_heap_cache.in_use = true + heap_cache_register_superpage(local_heap) + heap_debug_cover(.Alloc_Heap_Initialized) + } + + // Take care of any remote frees. + remote_free_count := intrinsics.atomic_load_explicit(&local_heap_cache.remote_free_count, .Acquire) + if remote_free_count > 0 { + cache := local_heap_cache + removed := 0 + counter := remote_free_count + // Go through all superpages in the cache for superpages with remote + // frees to see if any still have frees needing merged. + merge_loop: for { + consume_loop: for i in 0.. 0 && slab.free_bins == 0 && intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) > 0 { + heap_merge_remote_frees(slab) + if slab.free_bins == 0 { + // No bins were freed at all, which is possible due + // to the parallel nature of this code. The freeing + // thread could have signalled its intent, but we + // merged before it had a chance to flip the + // necessary bit. + should_reschedule = true + break merge_block + } + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + + if bin_size > HEAP_MAX_BIN_SIZE { + heap_free_wide_slab(superpage, slab) + } else { + if slab.free_bins == slab.max_bins { + heap_free_slab(superpage, slab) + } else { + heap_cache_add_slab(slab, heap_bin_size_to_rank(bin_size)) + } + } + } + + if bin_size > HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. 
+ j += heap_slabs_needed_for_size(bin_size) + } else { + j += 1 + } + } + + if should_reschedule { + // This is the logic found in `heap_remote_cache_add_remote_free_superpage`, simplified. + if !intrinsics.atomic_exchange_explicit(&superpage.remote_free_set, true, .Acq_Rel) { + cache_reschedule := local_heap_cache + reschedule_loop: for { + for j := 0; j < len(cache_reschedule.superpages_with_remote_frees); j += 1 { + old, swapped := intrinsics.atomic_compare_exchange_strong_explicit(&cache_reschedule.superpages_with_remote_frees[j], nil, superpage, .Acq_Rel, .Relaxed) + assert_contextless(old != superpage, "The heap allocator found a duplicate of a superpage in its superpages_with_remote_frees while rescheduling.") + if swapped { + intrinsics.atomic_add_explicit(&local_heap_cache.remote_free_count, 1, .Release) + break reschedule_loop + } + } + next_cache_block := intrinsics.atomic_load_explicit(&cache_reschedule.next_cache_block, .Acquire) + assert_contextless(next_cache_block != nil, "The heap allocator failed to find free space for a new entry in its superpages_with_remote_frees cache.") + cache_reschedule = next_cache_block + } + } + } else { + removed += 1 + } + + counter -= 1 + if counter == 0 { + break merge_loop + } + } + if cache.next_cache_block == nil { + break merge_loop + } + assert_contextless(cache.next_cache_block != nil) + cache = cache.next_cache_block + } + intrinsics.atomic_sub_explicit(&local_heap_cache.remote_free_count, removed, .Release) + heap_debug_cover(.Alloc_Collected_Remote_Frees) + } + + // Handle slab-wide allocations. + if size > HEAP_MAX_BIN_SIZE { + heap_debug_cover(.Alloc_Slab_Wide) + return heap_cache_get_contiguous_slabs(size) + } + + // Get a suitable slab from the heap. + rounded_size := heap_round_to_bin_size(size) + slab := heap_cache_get_slab(rounded_size) + assert_contextless(slab.bin_size == rounded_size, "The heap allocator found a slab with the wrong bin size during allocation.") + + // Allocate a bin inside the slab. + sector := slab.next_free_sector + sector_bits := slab.local_free[sector] + assert_contextless(sector_bits != 0, "The heap allocator found a slab with a full next_free_sector.") + + // Select the lowest free bit. + index := uintptr(intrinsics.count_trailing_zeros(sector_bits)) + + // Convert the index to a pointer. + ptr = rawptr(slab.data + (uintptr(sector * SECTOR_BITS) + index) * uintptr(rounded_size)) + when !ODIN_DISABLE_ASSERT { + base_alignment := min(HEAP_MAX_ALIGNMENT, uintptr(rounded_size)) + assert_contextless(uintptr(ptr) & uintptr(base_alignment-1) == 0, "A pointer allocated by the heap is not well-aligned.") + } + + // Clear the free bit. + slab.local_free[sector] &~= (1 << index) + + // Zero the memory, if needed. + if zero_memory && index < uintptr(slab.dirty_bins) { + // Ensure that the memory zeroing is not optimized out by the compiler. + intrinsics.mem_zero_volatile(ptr, rounded_size) + // NOTE: A full memory fence should not be needed for any newly-zeroed + // allocation, as each thread controls its own heap, and for one thread + // to pass a memory address to another implies some secondary + // synchronization method, such as a mutex, which would be the way by + // which the threads come to agree on the state of main memory. + heap_debug_cover(.Alloc_Zeroed_Memory) + } + + // Update statistics. + slab.dirty_bins = int(max(slab.dirty_bins, 1 + sector * SECTOR_BITS + int(index))) + slab.free_bins -= 1 + + // Remove the slab from the cache if the slab's full. 
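
The bin accounting in the allocation path above is a plain free-bitmap: a set bit in a local_free sector means that bin is free, count_trailing_zeros picks the lowest free bin, and the flat bin number times the bin size recovers the address. Below is a toy version of both directions, separate from the patch, with made-up sector counts and helper names.

package main

import "base:intrinsics"
import "core:fmt"

// Made-up sizes for the sketch; a real slab derives these from its bin size.
SECTORS         :: 4
BITS_PER_SECTOR :: 64

// A set bit means the bin is free, as in a slab's local_free sectors.
Bitmap :: [SECTORS]u64

// Take the lowest free bin: find the first non-empty sector, pick its lowest
// set bit, clear it, and return the flat bin number.
take_lowest_free_bin :: proc(bm: ^Bitmap) -> (bin: int, ok: bool) {
	for sector in 0..<SECTORS {
		bits := bm[sector]
		if bits == 0 {
			continue
		}
		index := uint(intrinsics.count_trailing_zeros(bits))
		bm[sector] &~= 1 << index
		return sector * BITS_PER_SECTOR + int(index), true
	}
	return 0, false
}

// Free a bin: split the flat bin number back into sector and bit index,
// mirroring the arithmetic on the free path.
release_bin :: proc(bm: ^Bitmap, bin: int) {
	sector := bin / BITS_PER_SECTOR
	index := uint(bin) % BITS_PER_SECTOR
	bm[sector] |= 1 << index
}

main :: proc() {
	bm: Bitmap
	for i in 0..<SECTORS {
		bm[i] = ~u64(0) // everything starts free
	}
	a, _ := take_lowest_free_bin(&bm)
	b, _ := take_lowest_free_bin(&bm)
	release_bin(&bm, a)
	c, _ := take_lowest_free_bin(&bm)
	fmt.println(a, b, c) // 0 1 0
}
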
+ // Otherwise, update the next free sector if the sector's full. + if slab.free_bins == 0 { + slab.next_free_sector = slab.sectors + if intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Seq_Cst) > 0 { + heap_merge_remote_frees(slab) + if slab.free_bins == 0 { + // We have come before the other thread, and the bit we needed to find was not set. + // Treat it as if it is full anyway. + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } else { + // No cache adjustment happens here; we know it's already in the cache. + } + } else { + intrinsics.atomic_store_explicit(&slab.is_full, true, .Release) + heap_cache_remove_slab(slab, heap_bin_size_to_rank(rounded_size)) + } + } else { + if slab.local_free[sector] == 0 { + sector += 1 + for /**/; sector < slab.sectors; sector += 1 { + if slab.local_free[sector] != 0 { + break + } + } + slab.next_free_sector = sector + } + } + heap_debug_cover(.Alloc_Bin) + return +} + +/* +Free memory returned by `heap_alloc`. +*/ +heap_free :: proc "contextless" (ptr: rawptr) { + // Check for nil. + if ptr == nil { + return + } + + superpage := find_superpage_from_pointer(ptr) + + // Check if this is a huge allocation. + if superpage.huge_size > 0 { + // NOTE: If the allocator is passed a pointer that it does not own, + // which is a scenario that it cannot possibly detect (in any + // reasonably performant fashion), then the condition above may result + // in a segmentation violation. + // + // Regardless, the result of passing a pointer to this heap allocator + // which it did not return is undefined behavior. + free_virtual_memory(superpage, superpage.huge_size) + heap_debug_cover(.Freed_Huge_Allocation) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(ptr) + assert_contextless(slab.bin_size > 0, "The heap allocator tried to free a pointer belonging to an empty slab.") + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + heap_free_wide_slab(superpage, slab) + heap_debug_cover(.Freed_Wide_Slab) + } else { + // Atomically let the owner know there's a free slab. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + old := intrinsics.atomic_or_explicit(&slab.remote_free[0], 1, .Seq_Cst) + heap_remote_cache_add_remote_free_superpage(superpage) + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old == 1 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Wide_Slab) + } + return + } + + // Find which sector and bin this pointer refers to. + bin_number := int(uintptr(ptr) - slab.data) / slab.bin_size + sector := bin_number / INTEGER_BITS + index := uint(bin_number) % INTEGER_BITS + + assert_contextless(bin_number < slab.max_bins, "Calculated an incorrect bin number for slab.") + assert_contextless(sector < slab.sectors, "Calculated an incorrect sector for slab.") + + // See if we own the slab or not, then free the pointer. + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) == get_current_thread_id() { + when HEAP_PANIC_ON_DOUBLE_FREE { + if slab.local_free[sector] & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + // Mark the bin as free. 
+ slab.local_free[sector] |= 1 << index + if slab.free_bins == 0 { + intrinsics.atomic_store_explicit(&slab.is_full, false, .Release) + // Put this slab back in the available list. + heap_cache_add_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_debug_cover(.Freed_Bin_Reopened_Full_Slab) + } + slab.free_bins += 1 + assert_contextless(slab.free_bins <= slab.max_bins, "A slab of the heap allocator overflowed its free bins.") + // Free the slab if it's empty and was at one point in time completely + // full. This is a heuristic to prevent a single repeated new/free + // operation in an otherwise empty slab from bogging down the + // allocator. + if slab.free_bins == slab.max_bins && slab.dirty_bins == slab.max_bins { + heap_cache_remove_slab(slab, heap_bin_size_to_rank(slab.bin_size)) + heap_free_slab(superpage, slab) + heap_debug_cover(.Freed_Bin_Freed_Slab_Which_Was_Fully_Used) + } else { + slab.next_free_sector = min(slab.next_free_sector, sector) + heap_debug_cover(.Freed_Bin_Updated_Slab_Next_Free_Sector) + } + // Free the entire superpage if it's empty. + if superpage.free_slabs == HEAP_SLAB_COUNT && !superpage.cache_block.in_use { + heap_free_superpage(superpage) + heap_debug_cover(.Freed_Bin_Freed_Superpage) + } + } else { + // Atomically let the owner know there's a free bin. + + // NOTE: The order of operations here is important. + // + // 1. We must first check if the slab is full. If we wait until later, + // we risk causing a race. + is_full := intrinsics.atomic_load_explicit(&slab.is_full, .Acquire) + + // 2. We have to let the owner know that we intend to schedule a remote + // free. This way, it can keep the cached entry around if it isn't + // able to merge all of them due to timing differences on our part. + intrinsics.atomic_add_explicit(&slab.remote_free_bins_scheduled, 1, .Release) + + // (Technically, the compiler or processor is allowed to re-order the + // above two operations, and this is okay. However, the next one acts + // as a full barrier due to its Sequential Consistency ordering.) + + // 3. Finally, we flip the bit of the bin we want freed. This must be + // the final operation across all cores, because the owner could + // already be in the process of merging remote frees, and if our bin + // was the last one, it has the authorization to wipe the slab and + // possibly reallocate in the same block of memory. + // + // By deferring this to the end, we avoid any possibility of a race, + // even if multiple threads are in this section. + old := intrinsics.atomic_or_explicit(&slab.remote_free[sector], 1 << index, .Seq_Cst) + + // 4. Now we can safely check if the slab is full and add it to the + // owner's cache if needed. + if is_full { + heap_remote_cache_add_remote_free_superpage(superpage) + heap_debug_cover(.Remotely_Freed_Bin_Caused_Remote_Superpage_Caching) + } + + when HEAP_PANIC_ON_DOUBLE_FREE { + if old & (1 << index) != 0 { + panic_contextless("The heap allocator freed an already-free pointer.") + } + } + heap_debug_cover(.Remotely_Freed_Bin) + + // NOTE: It is possible for two threads to be here at the same time, + // thus violating any sense of order among the actual number of bins + // free and the reported number. + // + // T1 & T2 could flip bits, + // T2 could increment the free count, then + // T3 could merge the free bins. + // + // This scenario would result in an inconsistent count no matter which + // way the above procedure is carried out, hence its unreliability. + } +} + +/* +Resize memory returned by `heap_alloc`. 
+*/ +@(require_results) +heap_resize :: proc "contextless" (old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + Size_Category :: enum { + Unknown, + Bin, + Slab, + Huge, + } + + // We need to first determine if we're crossing size categories. + old_category: Size_Category + switch { + case old_size <= HEAP_MAX_BIN_SIZE: old_category = .Bin + case old_size < HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Slab + case old_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: old_category = .Huge + case: unreachable() + } + new_category: Size_Category + switch { + case new_size <= HEAP_MAX_BIN_SIZE: new_category = .Bin + case new_size < HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Slab + case new_size >= HEAP_HUGE_ALLOCATION_THRESHOLD: new_category = .Huge + case: unreachable() + } + assert_contextless(old_category != .Unknown) + assert_contextless(new_category != .Unknown) + + if new_category != old_category { + // A change in size category cannot be optimized. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Crossed_Size_Categories) + return + } + + // NOTE: Superpage owners are the only ones that can change the amount of + // memory that backs each pointer. Other threads have to allocate and copy. + + // Check if this is a huge allocation. + superpage := find_superpage_from_pointer(old_ptr) + if superpage.huge_size > 0 { + // This block follows the preamble in `heap_make_huge_allocation`. + new_real_size := new_size + if new_size < SUPERPAGE_SIZE - HEAP_HUGE_ALLOCATION_BOOK_KEEPING { + new_real_size = SUPERPAGE_SIZE + heap_debug_cover(.Resize_Huge_Size_Set_To_Superpage) + } else { + new_real_size += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + heap_debug_cover(.Resize_Huge_Size_Adjusted) + } + resized_superpage := cast(^Heap_Superpage)resize_virtual_memory(superpage, superpage.huge_size, new_real_size, SUPERPAGE_SIZE) + assert_contextless(uintptr(resized_superpage) & (SUPERPAGE_SIZE-1) == 0, "After resizing a huge allocation, the pointer was no longer aligned to a superpage boundary.") + resized_superpage.huge_size = new_real_size + u := uintptr(resized_superpage) + HEAP_HUGE_ALLOCATION_BOOK_KEEPING + new_ptr = rawptr(u - u & (HEAP_MAX_ALIGNMENT-1)) + + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(new_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Huge_Caused_Memory_Zeroing) + } + + heap_debug_cover(.Resize_Huge) + return + } + + // Find which slab this pointer belongs to. + slab := find_slab_from_pointer(old_ptr) + + // Check if this is a slab-wide allocation. + if slab.bin_size > HEAP_MAX_BIN_SIZE { + contiguous_old := heap_slabs_needed_for_size(slab.bin_size) + contiguous_new := heap_slabs_needed_for_size(new_size) + if contiguous_new == contiguous_old { + // We already have enough slabs to serve the request. + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + heap_debug_cover(.Resize_Wide_Slab_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Wide_Slab_Kept_Old_Pointer) + return old_ptr + } + + if slab.index + contiguous_new >= HEAP_SLAB_COUNT { + // Expanding this slab would go beyond the Superpage. + // We need more memory. 
+ new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return + } + + if intrinsics.atomic_load_explicit(&superpage.owner, .Acquire) != get_current_thread_id() { + // We are not the owner of this data, therefore none of the special + // optimized paths for wide slabs are available to us, as they all + // involve touching the Superpage. + // + // We must re-allocate. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_From_Remote_Thread) + return + } + + // Can we shrink the wide slab, or can we expand it in-place? + if contiguous_new < contiguous_old { + previously_full_superpage := superpage.free_slabs == 0 + for i := slab.index + contiguous_new; i < slab.index + contiguous_old; i += 1 { + // Mark the latter slabs as unused. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = i + next_slab.is_dirty = true + next_slab.bin_size = 0 + } + superpage.next_free_slab_index = min(superpage.next_free_slab_index, slab.index + contiguous_new) + superpage.free_slabs += contiguous_old - contiguous_new + if previously_full_superpage { + heap_cache_add_superpage_with_free_slabs(superpage) + heap_debug_cover(.Superpage_Added_To_Open_Cache_By_Resizing_Wide_Slab) + } + heap_debug_cover(.Resize_Wide_Slab_Shrunk_In_Place) + } else { + // NOTE: We've already guarded against going beyond `HEAP_SLAB_COUNT` in the section above. + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + if heap_superpage_index_slab(superpage, i).bin_size != 0 { + // Contiguous space is unavailable. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + heap_debug_cover(.Resize_Wide_Slab_Failed_To_Find_Contiguous_Expansion) + return + } + } + for i := slab.index + contiguous_old; i < slab.index + contiguous_new; i += 1 { + // Wipe the index bits, and if needed, the rest of the data. + next_slab := heap_superpage_index_slab(superpage, i) + next_slab.index = 0 + if next_slab.is_dirty { + heap_slab_clear_data(next_slab) + } + } + superpage.free_slabs += contiguous_old - contiguous_new + if superpage.free_slabs == 0 { + heap_cache_remove_superpage_with_free_slabs(superpage) + } else { + heap_update_next_free_slab_index(superpage, 0) + } + heap_debug_cover(.Resize_Wide_Slab_Expanded_In_Place) + } + + // The slab-wide allocation has been resized in-place. + slab.bin_size = new_size + + return old_ptr + } + + // See if a bin rank change is needed. + new_rounded_size := heap_round_to_bin_size(new_size) + if slab.bin_size == new_rounded_size { + if zero_memory && new_size > old_size { + intrinsics.mem_zero_volatile( + rawptr(uintptr(old_ptr) + uintptr(old_size)), + new_size - old_size, + ) + // It could be argued that a full memory fence is necessary here, + // because one thread may resize an address known to other threads, + // but as is the case with zeroing during allocation, we treat this + // as if the state change is not independent of the allocator. + // + // That is to say, if one thread resizes an address in-place, it's + // expected that other threads will need to be notified of this by + // the program, as with any other synchronization. 
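
A usage sketch of the resize paths above, separate from the patch: it calls the runtime procedures directly, which ordinary code would instead reach through the default allocator, and the exact bin boundaries depend on constants not shown in this diff, but any two sizes that round to the same power-of-two bin should take the in-place path.

package main

import "base:runtime"
import "core:fmt"

main :: proc() {
	// 24 and 30 bytes round to the same power-of-two bin, so the resize
	// should take the in-place path above and hand back the same pointer.
	p := runtime.heap_alloc(24)
	q := runtime.heap_resize(p, 24, 30)
	fmt.println(p == q) // expected: true

	// Growing into a different bin rank (or past HEAP_MAX_BIN_SIZE or the
	// huge threshold) falls back to allocate, copy, and free.
	r := runtime.heap_resize(q, 30, 4096)
	runtime.heap_free(r)
}
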
+ heap_debug_cover(.Resize_Caused_Memory_Zeroing) + } + heap_debug_cover(.Resize_Kept_Old_Pointer) + return old_ptr + } + + // Allocate and copy, as a last resort. + new_ptr = heap_alloc(new_size, zero_memory) + intrinsics.mem_copy_non_overlapping(new_ptr, old_ptr, min(old_size, new_size)) + heap_free(old_ptr) + return +} diff --git a/base/runtime/heap_allocator_info.odin b/base/runtime/heap_allocator_info.odin new file mode 100644 index 00000000000..9249845b486 --- /dev/null +++ b/base/runtime/heap_allocator_info.odin @@ -0,0 +1,159 @@ +#+build !js +#+build !orca +#+build !wasi +package runtime + +import "base:intrinsics" + +/* +Heap_Info provides metrics on a single thread's heap memory usage. +*/ +Heap_Info :: struct { + total_memory_allocated_from_system: int `fmt:"M"`, + total_memory_used_for_book_keeping: int `fmt:"M"`, + total_memory_in_use: int `fmt:"M"`, + total_memory_free: int `fmt:"M"`, + total_memory_dirty: int `fmt:"M"`, + total_memory_remotely_free: int `fmt:"M"`, + + total_superpages: int, + total_superpages_dedicated_to_heap_cache: int, + total_huge_allocations: int, + total_slabs: int, + total_slabs_in_use: int, + + total_dirty_bins: int, + total_free_bins: int, + total_bins_in_use: int, + total_remote_free_bins: int, + + heap_slab_map_entries: int, + heap_superpages_with_free_slabs: int, + heap_slabs_with_remote_frees: int, +} + +/* +Get information about the current thread's heap. + +This will do additional sanity checking on the heap if assertions are enabled. +*/ +@(require_results) +get_local_heap_info :: proc "contextless" () -> (info: Heap_Info) { + if local_heap_cache != nil { + cache := local_heap_cache + slab_map_terminated: [HEAP_BIN_RANKS]bool + + for { + for rank := 0; rank < HEAP_BIN_RANKS; rank += 1 { + for i := 0; i < HEAP_CACHE_SLAB_MAP_STRIDE; i += 1 { + slab := cache.slab_map[rank * HEAP_CACHE_SLAB_MAP_STRIDE + i] + if slab_map_terminated[rank] { + assert_contextless(slab == nil, "The heap allocator has a gap in its slab map.") + } else if slab == nil { + slab_map_terminated[rank] = true + } else { + info.heap_slab_map_entries += 1 + assert_contextless(slab.bin_size != 0, "The heap allocator has an empty slab in its slab map.") + assert_contextless(slab.bin_size == 1 << (HEAP_MIN_BIN_SHIFT + uint(rank)), "The heap allocator has a slab in the wrong sub-array of the slab map.") + } + } + } + for superpage in cache.superpages_with_free_slabs { + if superpage != nil { + info.heap_superpages_with_free_slabs += 1 + } + } + for i in 0.. 
0 { + info.total_huge_allocations += 1 + info.total_memory_allocated_from_system += superpage.huge_size + info.total_memory_in_use += superpage.huge_size - HEAP_HUGE_ALLOCATION_BOOK_KEEPING + info.total_memory_used_for_book_keeping += HEAP_HUGE_ALLOCATION_BOOK_KEEPING + } else { + if superpage.cache_block.in_use { + info.total_superpages_dedicated_to_heap_cache += 1 + } + info.total_memory_allocated_from_system += SUPERPAGE_SIZE + for i := 0; i < HEAP_SLAB_COUNT; /**/ { + slab := heap_superpage_index_slab(superpage, i) + + if slab.bin_size != 0 { + info.total_slabs_in_use += 1 + info.total_memory_in_use += slab.bin_size * (slab.max_bins - slab.free_bins) + info.total_memory_free += slab.bin_size * slab.free_bins + info.total_memory_dirty += slab.bin_size * slab.dirty_bins + info.total_bins_in_use += slab.max_bins - slab.free_bins + info.total_free_bins += slab.free_bins + info.total_dirty_bins += slab.dirty_bins + assert_contextless(slab.dirty_bins >= slab.max_bins - slab.free_bins, "A slab of the heap allocator has a number of dirty bins which is not equivalent to the number of its total bins minus the number of free bins.") + // Account for the bitmaps used by the Slab. + info.total_memory_used_for_book_keeping += int(slab.data - uintptr(slab)) + // Account for the space not used by the bins or the bitmaps. + n := int(slab.data - uintptr(slab) + uintptr(slab.max_bins * slab.bin_size)) + if slab.bin_size > HEAP_MAX_BIN_SIZE { + info.total_memory_used_for_book_keeping += heap_slabs_needed_for_size(slab.bin_size) * HEAP_SLAB_SIZE - n + } else { + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE - n + } + remote_free_bins := 0 + for j in 0.. HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + // Every superpage has to sacrifice one Slab's worth of space so + // that they're all aligned. + info.total_memory_used_for_book_keeping += HEAP_SLAB_SIZE + info.total_slabs += HEAP_SLAB_COUNT + } + superpage = superpage.next + } + + assert_contextless(info.total_memory_allocated_from_system == info.total_memory_used_for_book_keeping + info.total_memory_in_use + info.total_memory_free, "The heap allocator's metrics for total memory in use, free, and used for book-keeping do not add up to the total memory allocated from the operating system.") + + if local_heap_cache != nil { + assert_contextless(info.total_superpages == local_heap_cache.owned_superpages) + } + return +} diff --git a/base/runtime/thread_management.odin b/base/runtime/threading.odin similarity index 68% rename from base/runtime/thread_management.odin rename to base/runtime/threading.odin index cabd4691cee..778d7e90061 100644 --- a/base/runtime/thread_management.odin +++ b/base/runtime/threading.odin @@ -5,6 +5,20 @@ Thread_Local_Cleaner :: #type proc "odin" () @(private="file") thread_local_cleaners: [8]Thread_Local_Cleaner +// The thread ID is cached here to save time from making syscalls with every +// call. +@(thread_local) local_thread_id: int + +// Return the current thread ID. +@(require_results) +get_current_thread_id :: proc "contextless" () -> int { + if local_thread_id == 0 { + local_thread_id = _get_current_thread_id() + assert_contextless(local_thread_id != 0, "_get_current_thread_id returned 0.") + } + return local_thread_id +} + // Add a procedure that will be run at the end of a thread for the purpose of // deallocating state marked as `thread_local`. 
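
A short usage sketch for the heap_allocator_info.odin additions above, separate from the patch and assuming these procedures stay exported from base:runtime as written: get_local_heap_info only describes the calling thread's heap, so superpages owned by other threads do not show up in the totals.

package main

import "base:runtime"
import "core:fmt"

main :: proc() {
	// Touch the heap so there is something to report.
	p := runtime.heap_alloc(1024)
	defer runtime.heap_free(p)

	// Only the calling thread's heap is inspected.
	info := runtime.get_local_heap_info()
	fmt.println("superpages:        ", info.total_superpages)
	fmt.println("bins in use:       ", info.total_bins_in_use)
	fmt.println("memory in use (B): ", info.total_memory_in_use)
}
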
// diff --git a/base/runtime/threading_darwin.odin b/base/runtime/threading_darwin.odin new file mode 100644 index 00000000000..8dd76f15528 --- /dev/null +++ b/base/runtime/threading_darwin.odin @@ -0,0 +1,17 @@ +#+private +package runtime + +foreign import lib "system:System.framework" + +foreign lib { + pthread_threadid_np :: proc "c" (rawptr, ^u64) -> i32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + tid: u64 + result := pthread_threadid_np(nil, &tid) + if result != 0 { + panic_contextless("Failed to get current thread ID.") + } + return int(tid) +} diff --git a/base/runtime/threading_freebsd.odin b/base/runtime/threading_freebsd.odin new file mode 100644 index 00000000000..053e636401d --- /dev/null +++ b/base/runtime/threading_freebsd.odin @@ -0,0 +1,19 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_thr_self :: uintptr(432) + +_get_current_thread_id :: proc "contextless" () -> int { + when size_of(rawptr) == 4 { + id: i32 + } else { + id: i64 + } + _, ok := intrinsics.syscall_bsd(SYS_thr_self, uintptr(&id)) + if !ok { + panic_contextless("Failed to get current thread ID.") + } + return int(id) +} diff --git a/base/runtime/threading_linux.odin b/base/runtime/threading_linux.odin new file mode 100644 index 00000000000..3fc7aa993ae --- /dev/null +++ b/base/runtime/threading_linux.odin @@ -0,0 +1,22 @@ +#+private +package runtime + +import "base:intrinsics" + +when ODIN_ARCH == .amd64 { + SYS_gettid :: uintptr(186) +} else when ODIN_ARCH == .arm32 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .arm64 { + SYS_gettid :: uintptr(178) +} else when ODIN_ARCH == .i386 { + SYS_gettid :: uintptr(224) +} else when ODIN_ARCH == .riscv64 { + SYS_gettid :: uintptr(178) +} else { + #panic("Syscall numbers related to threading are missing for this Linux architecture.") +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(intrinsics.syscall(SYS_gettid)) +} diff --git a/base/runtime/threading_netbsd.odin b/base/runtime/threading_netbsd.odin new file mode 100644 index 00000000000..0ba6fd0fd3e --- /dev/null +++ b/base/runtime/threading_netbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS__lwp_self :: uintptr(311) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS__lwp_self) + return int(result) +} diff --git a/base/runtime/threading_openbsd.odin b/base/runtime/threading_openbsd.odin new file mode 100644 index 00000000000..8bfb3de9b24 --- /dev/null +++ b/base/runtime/threading_openbsd.odin @@ -0,0 +1,11 @@ +#+private +package runtime + +import "base:intrinsics" + +SYS_getthrid :: uintptr(299) + +_get_current_thread_id :: proc "contextless" () -> int { + result, _ := intrinsics.syscall_bsd(SYS_getthrid) + return int(result) +} diff --git a/base/runtime/threading_other.odin b/base/runtime/threading_other.odin new file mode 100644 index 00000000000..4bad1a24240 --- /dev/null +++ b/base/runtime/threading_other.odin @@ -0,0 +1,12 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +_get_current_thread_id :: proc "contextless" () -> int { + unimplemented_contextless("This platform does not support multithreading.") +} diff --git a/base/runtime/threading_windows.odin b/base/runtime/threading_windows.odin new file mode 100644 index 00000000000..9671350614d --- /dev/null +++ b/base/runtime/threading_windows.odin @@ -0,0 +1,13 @@ +#+private +package runtime + +foreign 
import Kernel32 "system:Kernel32.lib" + +@(default_calling_convention="system") +foreign Kernel32 { + GetCurrentThreadId :: proc() -> u32 --- +} + +_get_current_thread_id :: proc "contextless" () -> int { + return int(GetCurrentThreadId()) +} diff --git a/base/runtime/virtual_memory.odin b/base/runtime/virtual_memory.odin new file mode 100644 index 00000000000..e8f6763506a --- /dev/null +++ b/base/runtime/virtual_memory.odin @@ -0,0 +1,95 @@ +package runtime + +ODIN_VIRTUAL_MEMORY_SUPPORTED :: VIRTUAL_MEMORY_SUPPORTED + +// Virtually all MMUs supported by Odin should have a 4KiB page size. +PAGE_SIZE :: 4 * Kilobyte + +when ODIN_ARCH == .arm32 { + SUPERPAGE_SIZE :: 1 * Megabyte +} else { + // All other architectures should have support for 2MiB pages. + // i386 supports it in PAE mode. + // amd64, arm64, and riscv64 support it by default. + SUPERPAGE_SIZE :: 2 * Megabyte +} + +#assert(SUPERPAGE_SIZE & (SUPERPAGE_SIZE-1) == 0, "SUPERPAGE_SIZE must be a power of two.") + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to point to data that is at least `size` +bytes large but may be larger, due to rounding `size` to the page size of the +system. +*/ +@(require_results) +allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + return _allocate_virtual_memory(size) +} + +/* +Allocate a superpage of virtual memory from the operating system. + +This is a contiguous block of memory larger than what is normally distributed +by the operating system, sometimes with special performance properties related +to the Translation Lookaside Buffer. + +The address will be a multiple of the `SUPERPAGE_SIZE` constant, and the memory +pointed to will be at least as long as that very same constant. + +The name derives from the superpage concept on the *BSD operating systems, +where it is known as huge pages on Linux and large pages on Windows. +*/ +@(require_results) +allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + return _allocate_virtual_memory_superpage() +} + +/* +Allocate virtual memory from the operating system. + +The address returned is guaranteed to be a multiple of `alignment` and point to +data that is at least `size` bytes large but may be larger, due to rounding +`size` to the page size of the system. + +`alignment` must be a power of two. +*/ +@(require_results) +allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + assert_contextless(is_power_of_two(alignment)) + return _allocate_virtual_memory_aligned(size, alignment) +} + +/* +Free virtual memory allocated by any of the `allocate_*` procs. +*/ +free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + _free_virtual_memory(ptr, size) +} + +/* +Resize virtual memory allocated by `allocate_virtual_memory`. + +**Caveats:** + +- `new_size` must not be zero. +- If `old_size` and `new_size` are the same, nothing happens. +- The resulting behavior is undefined if `old_size` is incorrect. +- If the address is changed, `alignment` will be ensured. +- `alignment` should be the same value used when the memory was allocated. +- Resizing memory returned by `allocate_virtual_memory_superpage` is not + well-defined. The memory may be resized, but it may no longer be backed by a + superpage. +*/ +@(require_results) +resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int = 0) -> rawptr { + // * This is due to a restriction of mremap on Linux. 
+ assert_contextless(new_size != 0, "Cannot resize virtual memory address to zero.") + // * The statement about undefined behavior of incorrect `old_size` is due to + // how VirtualFree works on Windows. + if old_size == new_size { + return ptr + } + return _resize_virtual_memory(ptr, old_size, new_size, alignment) +} diff --git a/base/runtime/virtual_memory_darwin.odin b/base/runtime/virtual_memory_darwin.odin new file mode 100644 index 00000000000..77b46dc826a --- /dev/null +++ b/base/runtime/virtual_memory_darwin.odin @@ -0,0 +1,112 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +foreign import lib "system:System.framework" + +foreign lib { + mach_task_self_: u32 + mach_vm_allocate :: proc(target: u32, address: ^u64, size: u64, flags: i32) -> i32 --- + mach_vm_deallocate :: proc(target: u32, address: u64, size: u64) -> i32 --- + mach_vm_protect :: proc(target_task: u32, address: u64, size: u64, set_maximum: b32, new_protection: i32) -> i32 --- + mach_vm_map :: proc( + target_task: u32, + address: ^u64, + size: u64, + mask: u64, + flags: i32, + object: i32, + offset: uintptr, + copy: b32, + cur_protection, + max_protection: i32, + inheritance: u32, + ) -> i32 --- + mach_vm_remap :: proc( + target_task: u32, + target_address: ^u64, + size: u64, + mask: u64, + flags: i32, + src_task: u32, + src_address: u64, + copy: b32, + cur_protection, + max_protection: ^i32, + inheritance: u32, + ) -> i32 --- +} + +// The following features are specific to Darwin only. +VM_FLAGS_ANYWHERE :: 0x0001 + +SUPERPAGE_SIZE_ANY :: 1 +SUPERPAGE_SIZE_2MB :: 2 + +VM_FLAGS_SUPERPAGE_SHIFT :: 16 +VM_FLAGS_SUPERPAGE_SIZE_ANY :: SUPERPAGE_SIZE_ANY << VM_FLAGS_SUPERPAGE_SHIFT +VM_FLAGS_SUPERPAGE_SIZE_2MB :: SUPERPAGE_SIZE_2MB << VM_FLAGS_SUPERPAGE_SHIFT + +MEMORY_OBJECT_NULL :: 0 +VM_PROT_READ :: 0x01 +VM_PROT_WRITE :: 0x02 +VM_INHERIT_COPY :: 1 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + address: u64 + result := mach_vm_map(mach_task_self_, &address, u64(size), 0, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + address: u64 + flags: i32 = VM_FLAGS_ANYWHERE + when ODIN_ARCH == .amd64 { + // NOTE(Feoramund): As far as we are aware, Darwin only supports + // explicit superpage allocation on AMD64 with a 2MiB parameter. + when SUPERPAGE_SIZE == 2 * Megabyte { + flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB + } else { + #panic("An unsupported superpage size has been configured for AMD64 Darwin; only 2MB is supported.") + } + } + alignment_mask: u64 = SUPERPAGE_SIZE - 1 // Assumes a power of two size, ensured by an assertion in `virtual_memory.odin`. 
+ result := mach_vm_map(mach_task_self_, &address, SUPERPAGE_SIZE, alignment_mask, flags, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + assert_contextless(address % SUPERPAGE_SIZE == 0) + return rawptr(uintptr(address)) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + address: u64 + alignment_mask: u64 = u64(alignment) - 1 + result := mach_vm_map(mach_task_self_, &address, u64(size), alignment_mask, VM_FLAGS_ANYWHERE, MEMORY_OBJECT_NULL, 0, false, VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_COPY) + if result != 0 { + return nil + } + return rawptr(uintptr(address)) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + mach_vm_deallocate(mach_task_self_, u64(uintptr(ptr)), u64(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_freebsd.odin b/base/runtime/virtual_memory_freebsd.odin new file mode 100644 index 00000000000..76acfeebdce --- /dev/null +++ b/base/runtime/virtual_memory_freebsd.odin @@ -0,0 +1,135 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(477) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to FreeBSD only. +/* + * Request specific alignment (n == log2 of the desired alignment). + * + * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does + * not enforce a specific alignment. + */ +// #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) +MAP_ALIGNMENT_SHIFT :: 24 +MAP_ALIGNED_SUPER :: 1 << MAP_ALIGNMENT_SHIFT + +SUPERPAGE_MAP_FLAGS :: (intrinsics.constant_log2(SUPERPAGE_SIZE) << MAP_ALIGNMENT_SHIFT) | MAP_ALIGNED_SUPER + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|SUPERPAGE_MAP_FLAGS, ~uintptr(0), 0) + if !ok { + // It may be the case that FreeBSD couldn't fulfill our alignment + // request, but it could still give us some memory. + return _allocate_virtual_memory_manually_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // This procedure uses the `MAP_ALIGNED` API provided by FreeBSD and falls + // back to manually aligned addresses, if that fails. 
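
The MAP_ALIGNED(n) encoding referenced above packs log2 of the requested alignment into the high flag bits. A tiny arithmetic check, separate from the patch and using the same count_trailing_zeros construction as the code below:

package main

import "base:intrinsics"
import "core:fmt"

MAP_ALIGNMENT_SHIFT :: 24

// MAP_ALIGNED(n) == n << MAP_ALIGNMENT_SHIFT, where n is log2 of the
// requested alignment.
map_aligned :: proc(alignment: uintptr) -> uintptr {
	return intrinsics.count_trailing_zeros(alignment) << MAP_ALIGNMENT_SHIFT
}

main :: proc() {
	two_mib := uintptr(2 * 1024 * 1024)
	// log2(2 MiB) == 21, so the flag value is 21 << 24.
	fmt.println(map_aligned(two_mib) == 21 << MAP_ALIGNMENT_SHIFT) // true
}
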
+ map_aligned_n: uintptr + if alignment >= PAGE_SIZE { + map_aligned_n = intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + } + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + _allocate_virtual_memory_manually_aligned(size, alignment) + } + return rawptr(result) +} + +_allocate_virtual_memory_manually_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // FreeBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
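
The manual-alignment fallback above is pure address arithmetic: over-allocate by `alignment`, round the returned base up to the next aligned address, then hand back the whole pages in front (and, when the base was already aligned, the unused tail). Below is a worked sketch of just the front-trimming arithmetic with a made-up base address and our own helper name, separate from the patch; no mapping is involved.

package main

import "core:fmt"

PAGE_SIZE :: 4096

// Given a page-aligned base from an over-sized mapping, compute the aligned
// address handed to the caller and how many leading bytes (whole pages only)
// can be returned to the system.
trim_for_alignment :: proc(base, alignment: uintptr) -> (aligned, unmap_front: uintptr) {
	modulo := base & (alignment - 1)
	if modulo == 0 {
		return base, 0
	}
	delta := alignment - modulo
	aligned = base + delta
	unmap_front = delta / PAGE_SIZE * PAGE_SIZE
	return
}

main :: proc() {
	// A 2 MiB alignment request serviced by a mapping that landed on an
	// arbitrary page boundary (the address is made up).
	base := uintptr(0x1234_7000)
	aligned, unmap_front := trim_for_alignment(base, 2 * 1024 * 1024)
	fmt.printf("aligned=%x unmap_front=%x\n", aligned, unmap_front)
	// aligned=12400000 unmap_front=b9000
}
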
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_linux.odin b/base/runtime/virtual_memory_linux.odin new file mode 100644 index 00000000000..94eea7a1373 --- /dev/null +++ b/base/runtime/virtual_memory_linux.odin @@ -0,0 +1,156 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +when ODIN_ARCH == .amd64 { + SYS_mmap :: uintptr(9) + SYS_munmap :: uintptr(11) + SYS_mremap :: uintptr(25) +} else when ODIN_ARCH == .arm32 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .arm64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else when ODIN_ARCH == .i386 { + SYS_mmap :: uintptr(90) + SYS_munmap :: uintptr(91) + SYS_mremap :: uintptr(163) +} else when ODIN_ARCH == .riscv64 { + SYS_mmap :: uintptr(222) + SYS_munmap :: uintptr(215) + SYS_mremap :: uintptr(216) +} else { + #panic("Syscall numbers related to virtual memory are missing for this Linux architecture.") +} + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x02 +MAP_ANONYMOUS :: 0x20 + +MREMAP_MAYMOVE :: 0x01 + +ENOMEM :: ~uintptr(11) + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // This depends on Transparent HugePage Support being enabled. + result := intrinsics.syscall(SYS_mmap, 0, SUPERPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + if uintptr(result) % SUPERPAGE_SIZE != 0 { + // If THP support is not enabled, we may receive an address aligned to a + // page boundary instead, in which case, we must manually align a new + // address. + _free_virtual_memory(rawptr(result), SUPERPAGE_SIZE) + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) + } + return rawptr(result) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. + result := intrinsics.syscall(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(result) < 0 { + return nil + } + return rawptr(result) + } + // We must over-allocate then adjust the address. + mmap_result := intrinsics.syscall(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if int(mmap_result) < 0 { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. 
+ delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for mremap and munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), MREMAP_MAYMOVE) + if int(result) < 0 { + return nil + } + return rawptr(result) + } else { + // First, let's try to mremap without MREMAP_MAYMOVE. We might get + // lucky and the operating system could expand (or shrink, as the case + // may be) the pages in place, which means we don't have to allocate a + // whole new chunk of memory. + mremap_result := intrinsics.syscall(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if mremap_result != ENOMEM { + // We got lucky. + return rawptr(mremap_result) + } + + // mremap failed to resize the memory in place, which means we must + // allocate an entirely new aligned chunk of memory, copy the old data, + // and free the old pointer before returning the new one. + // + // This is costly but unavoidable with the API available to us. + result := _allocate_virtual_memory_aligned(new_size, alignment) + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + _free_virtual_memory(ptr, old_size) + return result + } +} diff --git a/base/runtime/virtual_memory_netbsd.odin b/base/runtime/virtual_memory_netbsd.odin new file mode 100644 index 00000000000..46c475d9a6f --- /dev/null +++ b/base/runtime/virtual_memory_netbsd.odin @@ -0,0 +1,72 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(197) +SYS_mremap :: uintptr(411) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +// The following features are specific to NetBSD only. +/* + * Alignment (expressed in log2). Must be >= log2(PAGE_SIZE) and + * < # bits in a pointer (32 or 64). 
+ */ +// #define MAP_ALIGNED(n) ((int)((unsigned int)(n) << MAP_ALIGNMENT_SHIFT)) +MAP_ALIGNMENT_SHIFT :: 24 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if NetBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + // NOTE: Unlike FreeBSD, the NetBSD man pages do not indicate that + // `MAP_ALIGNED` can cause this to fail, so we don't try a second time with + // manual alignment. + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|map_aligned_n, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + if alignment == 0 { + // The user does not care about alignment, which is the simpler case. + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), 0) + if !ok { + return nil + } + return rawptr(result) + } else { + map_aligned_n := intrinsics.count_trailing_zeros(uintptr(alignment)) << MAP_ALIGNMENT_SHIFT + result, ok := intrinsics.syscall_bsd(SYS_mremap, uintptr(ptr), uintptr(old_size), uintptr(new_size), map_aligned_n) + if !ok { + return nil + } + return rawptr(result) + } +} diff --git a/base/runtime/virtual_memory_openbsd.odin b/base/runtime/virtual_memory_openbsd.odin new file mode 100644 index 00000000000..6a37d4ae670 --- /dev/null +++ b/base/runtime/virtual_memory_openbsd.odin @@ -0,0 +1,104 @@ +#+private +package runtime + +import "base:intrinsics" + +VIRTUAL_MEMORY_SUPPORTED :: true + +SYS_munmap :: uintptr(73) +SYS_mmap :: uintptr(49) + +PROT_READ :: 0x01 +PROT_WRITE :: 0x02 + +MAP_PRIVATE :: 0x0002 +MAP_ANONYMOUS :: 0x1000 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // NOTE(Feoramund): I am uncertain if OpenBSD has direct support for + // superpages, so we just use the aligned allocate procedure here. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + if alignment <= PAGE_SIZE { + // This is the simplest case. + // + // By virtue of binary arithmetic, any address aligned to a power of + // two is necessarily aligned to all lesser powers of two, and because + // mmap returns page-aligned addresses, we don't have to do anything + // extra here. 
+ result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + return cast(rawptr)result + } + // We must over-allocate then adjust the address. + mmap_result, ok := intrinsics.syscall_bsd(SYS_mmap, 0, uintptr(size + alignment), PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, ~uintptr(0), 0) + if !ok { + return nil + } + assert_contextless(mmap_result % PAGE_SIZE == 0) + modulo := mmap_result & uintptr(alignment-1) + if modulo != 0 { + // The address is misaligned, so we must return an adjusted address + // and free the pages we don't need. + delta := uintptr(alignment) - modulo + adjusted_result := mmap_result + delta + + // Sanity-checking: + // - The adjusted address is still page-aligned, so it is a valid argument for munmap. + // - The adjusted address is aligned to the user's needs. + assert_contextless(adjusted_result % PAGE_SIZE == 0) + assert_contextless(adjusted_result % uintptr(alignment) == 0) + + // Round the delta to a multiple of the page size. + delta = delta / PAGE_SIZE * PAGE_SIZE + if delta > 0 { + // Unmap the pages we don't need. + intrinsics.syscall_bsd(SYS_munmap, mmap_result, delta) + } + + return rawptr(adjusted_result) + } else if size + alignment > PAGE_SIZE { + // The address is coincidentally aligned as desired, but we have space + // that will never be seen by the user, so we must free the backing + // pages for it. + start := size / PAGE_SIZE * PAGE_SIZE + if size % PAGE_SIZE != 0 { + start += PAGE_SIZE + } + length := size + alignment - start + if length > 0 { + intrinsics.syscall_bsd(SYS_munmap, mmap_result + uintptr(start), uintptr(length)) + } + } + return rawptr(mmap_result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(size)) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // OpenBSD does not have a mremap syscall. + // All we can do is mmap a new address, copy the data, and munmap the old. 
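+	//
+	// This is costly but unavoidable with the API available to us.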
+ result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + intrinsics.syscall_bsd(SYS_munmap, uintptr(ptr), uintptr(old_size)) + return result +} diff --git a/base/runtime/virtual_memory_other.odin b/base/runtime/virtual_memory_other.odin new file mode 100644 index 00000000000..2c686051050 --- /dev/null +++ b/base/runtime/virtual_memory_other.odin @@ -0,0 +1,30 @@ +#+private +#+build !darwin +#+build !freebsd +#+build !linux +#+build !netbsd +#+build !openbsd +#+build !windows +package runtime + +VIRTUAL_MEMORY_SUPPORTED :: false + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + unimplemented_contextless("Virtual memory is not supported on this platform.") +} diff --git a/base/runtime/virtual_memory_windows.odin b/base/runtime/virtual_memory_windows.odin new file mode 100644 index 00000000000..cd39a3170b0 --- /dev/null +++ b/base/runtime/virtual_memory_windows.odin @@ -0,0 +1,149 @@ +#+private +package runtime + +import "base:intrinsics" + +foreign import Kernel32 "system:Kernel32.lib" +foreign import OneCore "system:OneCore.lib" + +VIRTUAL_MEMORY_SUPPORTED :: true + +// NOTE: Windows makes a distinction between page size and allocation +// granularity. Addresses returned from the allocation procedures are rounded +// to allocation granularity boundaries, at a minimum, not page sizes. 
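To make the note above concrete: the two values can be read back through the `GetSystemInfo` binding declared just below, and on typical x86-64 Windows systems `dwPageSize` is 4096 while `dwAllocationGranularity` is 65536. A minimal sketch, not part of this patch; the helper name is hypothetical and it assumes the `SYSTEM_INFO`/`GetSystemInfo` declarations in this file:

```odin
// Illustrative helper only; relies on the SYSTEM_INFO/GetSystemInfo bindings below.
query_alloc_granularity :: proc "contextless" () -> (page_size, granularity: u32) {
	sys_info: SYSTEM_INFO
	GetSystemInfo(&sys_info)
	// Typically 4096 and 65536: a plain VirtualAlloc result is already aligned
	// to the 64 KiB granularity, so only larger alignments need VirtualAlloc2.
	return sys_info.dwPageSize, sys_info.dwAllocationGranularity
}
```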
+ +@(default_calling_convention="system") +foreign Kernel32 { + GetLastError :: proc() -> u32 --- + GetSystemInfo :: proc(lpSystemInfo: ^SYSTEM_INFO) --- +} + +@(default_calling_convention="system") +foreign OneCore { + VirtualAlloc :: proc(lpAddress: rawptr, dwSize: uint, flAllocationType: u32, flProtect: u32) -> rawptr --- + VirtualAlloc2 :: proc( + Process: rawptr, + BaseAddress: rawptr, + Size: uint, + AllocationType: u32, + PageProtection: u32, + ExtendedParameters: [^]MEM_EXTENDED_PARAMETER, + ParameterCount: u32) -> rawptr --- + VirtualFree :: proc(lpAddress: rawptr, dwSize: uint, dwFreeType: u32) -> b32 --- +} + +SYSTEM_INFO :: struct { + using DUMMYUNIONNAME: struct #raw_union { + dwOemId: u32, + using DUMMYSTRUCTNAME:struct { + wProcessorArchitecture: u16, + wReserved: u16, + }, + }, + dwPageSize: u32, + lpMinimumApplicationAddress: rawptr, + lpMaximumApplicationAddress: rawptr, + dwActiveProcessorMask: uint, + dwNumberOfProcessors: u32, + dwProcessorType: u32, + dwAllocationGranularity: u32, + wProcessorLevel: u16, + wProcessorRevision: u16, +} + +MemExtendedParameterAddressRequirements :: 0x01 +MEM_EXTENDED_PARAMETER_TYPE_BITS :: 8 + +MEM_ADDRESS_REQUIREMENTS :: struct { + LowestStartingAddress: rawptr, + HighestEndingAddress: rawptr, + Alignment: uint, +} + +MEM_EXTENDED_PARAMETER :: struct { + using DUMMYSTRUCTNAME: bit_field u64 { + Type: u64 | MEM_EXTENDED_PARAMETER_TYPE_BITS, + Reserved: u64 | 64 - MEM_EXTENDED_PARAMETER_TYPE_BITS, + }, + using DUMMYUNIONNAME: struct #raw_union { + ULong64: u64, + Pointer: rawptr, + Size: uint, + Handle: rawptr, + ULong: u32, + }, +} + + +MEM_COMMIT :: 0x00001000 +MEM_RESERVE :: 0x00002000 +MEM_RELEASE :: 0x00008000 + +PAGE_READWRITE :: 0x04 + +_allocate_virtual_memory :: proc "contextless" (size: int) -> rawptr { + // `Size` must be a multiple of the page size. + rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc(nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE) + if result == nil { + return nil + } + return rawptr(result) +} + +_allocate_virtual_memory_superpage :: proc "contextless" () -> rawptr { + // TODO: Windows has support for Large Pages, but its usage requires privilege escalation. + return _allocate_virtual_memory_aligned(SUPERPAGE_SIZE, SUPERPAGE_SIZE) +} + +_allocate_virtual_memory_aligned :: proc "contextless" (size: int, alignment: int) -> rawptr { + sys_info: SYSTEM_INFO + GetSystemInfo(&sys_info) + if alignment <= int(sys_info.dwAllocationGranularity) { + // The alignment is less than or equal to the allocation granularity, + // which means it will automatically be aligned and any request for + // alignment less than the allocation granularity will result in + // ERROR_INVALID_PARAMETER. + return _allocate_virtual_memory(size) + } + addr_req := MEM_ADDRESS_REQUIREMENTS{ + LowestStartingAddress = nil, + HighestEndingAddress = nil, + Alignment = uint(alignment), + } + param := MEM_EXTENDED_PARAMETER{ + Type = MemExtendedParameterAddressRequirements, + Pointer = &addr_req, + } + // `Size` must be a multiple of the page size. 
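+	// For example, with a 4 KiB page, a 5000-byte request rounds up to 8192 bytes.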
+ rounded_size := size + if rounded_size % PAGE_SIZE != 0 { + rounded_size = (size / PAGE_SIZE + 1) * PAGE_SIZE + } + result := VirtualAlloc2(nil, nil, uint(rounded_size), MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE, ¶m, 1) + if result == nil { + return nil + } + return rawptr(result) +} + +_free_virtual_memory :: proc "contextless" (ptr: rawptr, size: int) { + VirtualFree(ptr, 0, MEM_RELEASE) +} + +_resize_virtual_memory :: proc "contextless" (ptr: rawptr, old_size: int, new_size: int, alignment: int) -> rawptr { + // There is no system support for resizing addresses returned by VirtualAlloc. + // All we can do is request a new address, copy the data, and free the old. + result: rawptr = --- + if alignment == 0 { + result = _allocate_virtual_memory(new_size) + } else { + result = _allocate_virtual_memory_aligned(new_size, alignment) + } + intrinsics.mem_copy_non_overlapping(result, ptr, min(new_size, old_size)) + VirtualFree(ptr, 0, MEM_RELEASE) + return result +} diff --git a/base/runtime/wasm_allocator.odin b/base/runtime/wasm_allocator.odin index 574f3dd06ef..c4799fcbe1c 100644 --- a/base/runtime/wasm_allocator.odin +++ b/base/runtime/wasm_allocator.odin @@ -1,3 +1,4 @@ +#+build !orca #+build wasm32, wasm64p32 package runtime @@ -869,3 +870,22 @@ aligned_realloc :: proc(a: ^WASM_Allocator, ptr: rawptr, alignment, size: uint, return newptr } + +heap_allocator :: default_wasm_allocator +heap_allocator_proc :: wasm_allocator_proc + +@(require_results) +heap_alloc :: proc(size: int, zero_memory: bool = true) -> (ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Alloc if zero_memory else .Alloc_Non_Zeroed, size, 0, nil, 0) + return raw_data(bytes) +} + +@(require_results) +heap_resize :: proc(old_ptr: rawptr, old_size: int, new_size: int, zero_memory: bool = true) -> (new_ptr: rawptr) { + bytes, _ := wasm_allocator_proc(&global_default_wasm_allocator_data, .Resize if zero_memory else .Resize_Non_Zeroed, new_size, 0, old_ptr, old_size) + return raw_data(bytes) +} + +heap_free :: proc(ptr: rawptr) { + wasm_allocator_proc(&global_default_wasm_allocator_data, .Free, 0, 0, ptr, 0) +} diff --git a/core/os/os2/env_linux.odin b/core/os/os2/env_linux.odin index e0ef0601058..132a5464e75 100644 --- a/core/os/os2/env_linux.odin +++ b/core/os/os2/env_linux.odin @@ -76,7 +76,7 @@ _set_env :: proc(key, v_new: string) -> bool { // wasn't in the environment in the first place. k_addr, v_addr := _kv_addr_from_val(v_curr, key) if len(v_new) > len(v_curr) { - k_addr = ([^]u8)(runtime.heap_resize(k_addr, kv_size)) + k_addr = ([^]u8)(runtime.heap_resize(k_addr, len(v_curr), kv_size)) if k_addr == nil { return false } diff --git a/tests/heap_allocator/libc/heap_allocator.odin b/tests/heap_allocator/libc/heap_allocator.odin new file mode 100644 index 00000000000..8710c59f0f3 --- /dev/null +++ b/tests/heap_allocator/libc/heap_allocator.odin @@ -0,0 +1,127 @@ +package tests_heap_allocator_libc + +import "base:intrinsics" +import "base:runtime" +import "core:mem" + +// This package contains the old libc malloc-based allocator, for comparison. 
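Because this package only exists as a baseline, the consuming test package can switch between it and the new default allocator by swapping `context.allocator`. A minimal sketch of that idea from the test package's point of view; `pick_allocator` and the boolean switch are illustrative and not the actual option handling in test_bench.odin:

```odin
package tests_heap_allocator

import "base:runtime"
import libc_allocator "libc"

// Illustrative only: the real harness parses command-line options; here the
// choice is reduced to a single boolean.
pick_allocator :: proc(use_libc_baseline: bool) -> runtime.Allocator {
	if use_libc_baseline {
		return libc_allocator.libc_allocator() // the old malloc-based allocator in this package
	}
	return runtime.heap_allocator() // the new default heap allocator (feoramalloc)
}

// Usage at the top of a benchmark procedure:
//     context.allocator = pick_allocator(use_libc_baseline)
```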
+ +Allocator :: runtime.Allocator +Allocator_Mode :: runtime.Allocator_Mode +Allocator_Mode_Set :: runtime.Allocator_Mode_Set +Allocator_Error :: runtime.Allocator_Error + +libc_allocator :: proc() -> Allocator { + return Allocator{ + procedure = libc_allocator_proc, + data = nil, + } +} + +libc_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, + size, alignment: int, + old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) { + // + // NOTE(tetra, 2020-01-14): The heap doesn't respect alignment. + // Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert + // padding. We also store the original pointer returned by heap_alloc right before + // the pointer we return to the user. + // + + aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) { + // Not(flysand): We need to reserve enough space for alignment, which + // includes the user data itself, the space to store the pointer to + // allocation start, as well as the padding required to align both + // the user data and the pointer. + a := max(alignment, align_of(rawptr)) + space := a-1 + size_of(rawptr) + size + allocated_mem: rawptr + + force_copy := old_ptr != nil && alignment > align_of(rawptr) + + if old_ptr != nil && !force_copy { + original_old_ptr := ([^]rawptr)(old_ptr)[-1] + allocated_mem = heap_resize(original_old_ptr, space) + } else { + allocated_mem = heap_alloc(space, zero_memory) + } + aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):]) + + ptr := uintptr(aligned_mem) + aligned_ptr := (ptr + uintptr(a)-1) & ~(uintptr(a)-1) + if allocated_mem == nil { + aligned_free(old_ptr) + aligned_free(allocated_mem) + return nil, .Out_Of_Memory + } + + aligned_mem = rawptr(aligned_ptr) + ([^]rawptr)(aligned_mem)[-1] = allocated_mem + + if force_copy { + runtime.mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) + aligned_free(old_ptr) + } + + return mem.byte_slice(aligned_mem, size), nil + } + + aligned_free :: proc(p: rawptr) { + if p != nil { + heap_free(([^]rawptr)(p)[-1]) + } + } + + aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) { + if p == nil { + return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory) + } + + new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return + + // NOTE: heap_resize does not zero the new memory, so we do it + if zero_memory && new_size > old_size { + new_region := raw_data(new_memory[old_size:]) + intrinsics.mem_zero(new_region, new_size - old_size) + } + return + } + + switch mode { + case .Alloc, .Alloc_Non_Zeroed: + return aligned_alloc(size, alignment, nil, 0, mode == .Alloc) + + case .Free: + aligned_free(old_memory) + + case .Free_All: + return nil, .Mode_Not_Implemented + + case .Resize, .Resize_Non_Zeroed: + return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize) + + case .Query_Features: + set := (^Allocator_Mode_Set)(old_memory) + if set != nil { + set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features} + } + return nil, nil + + case .Query_Info: + return nil, .Mode_Not_Implemented + } + + return nil, nil +} + +heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { + return _heap_alloc(size, zero_memory) +} + +heap_resize :: proc "contextless" (ptr: rawptr, new_size: int) -> rawptr { + return 
_heap_resize(ptr, new_size) +} + +heap_free :: proc "contextless" (ptr: rawptr) { + _heap_free(ptr) +} diff --git a/base/runtime/heap_allocator_orca.odin b/tests/heap_allocator/libc/heap_allocator_orca.odin similarity index 95% rename from base/runtime/heap_allocator_orca.odin rename to tests/heap_allocator/libc/heap_allocator_orca.odin index be032a20720..8fd13253db6 100644 --- a/base/runtime/heap_allocator_orca.odin +++ b/tests/heap_allocator/libc/heap_allocator_orca.odin @@ -1,6 +1,6 @@ #+build orca #+private -package runtime +package tests_heap_allocator_libc foreign { @(link_name="malloc") _orca_malloc :: proc "c" (size: int) -> rawptr --- diff --git a/base/runtime/heap_allocator_other.odin b/tests/heap_allocator/libc/heap_allocator_other.odin similarity index 94% rename from base/runtime/heap_allocator_other.odin rename to tests/heap_allocator/libc/heap_allocator_other.odin index 507dbf3181e..5118aa61221 100644 --- a/base/runtime/heap_allocator_other.odin +++ b/tests/heap_allocator/libc/heap_allocator_other.odin @@ -1,6 +1,6 @@ #+build js, wasi, freestanding, essence #+private -package runtime +package tests_heap_allocator_libc _heap_alloc :: proc "contextless" (size: int, zero_memory := true) -> rawptr { context = default_context() diff --git a/base/runtime/heap_allocator_unix.odin b/tests/heap_allocator/libc/heap_allocator_unix.odin similarity index 96% rename from base/runtime/heap_allocator_unix.odin rename to tests/heap_allocator/libc/heap_allocator_unix.odin index d4590d2dde8..49fb820bdc0 100644 --- a/base/runtime/heap_allocator_unix.odin +++ b/tests/heap_allocator/libc/heap_allocator_unix.odin @@ -1,6 +1,6 @@ #+build linux, darwin, freebsd, openbsd, netbsd, haiku #+private -package runtime +package tests_heap_allocator_libc when ODIN_OS == .Darwin { foreign import libc "system:System.framework" diff --git a/base/runtime/heap_allocator_windows.odin b/tests/heap_allocator/libc/heap_allocator_windows.odin similarity index 97% rename from base/runtime/heap_allocator_windows.odin rename to tests/heap_allocator/libc/heap_allocator_windows.odin index e07df755924..d5635d612d6 100644 --- a/base/runtime/heap_allocator_windows.odin +++ b/tests/heap_allocator/libc/heap_allocator_windows.odin @@ -1,4 +1,4 @@ -package runtime +package tests_heap_allocator_libc foreign import kernel32 "system:Kernel32.lib" diff --git a/tests/heap_allocator/test_bench.odin b/tests/heap_allocator/test_bench.odin new file mode 100644 index 00000000000..649c61b56e2 --- /dev/null +++ b/tests/heap_allocator/test_bench.odin @@ -0,0 +1,1493 @@ +// In order to test the heap allocator in a deterministic manner, we must run +// this program without the help of the test runner, because the runner itself +// makes use of heap allocation. +package tests_heap_allocator + +import "base:intrinsics" +import "base:runtime" +import "core:flags" +import "core:fmt" +import "core:log" +import "core:math/rand" +import "core:os" +import "core:sync" +import "core:thread" +import "core:time" +import "core:mem" + +import libc_allocator "libc" + +ODIN_DEBUG_HEAP :: runtime.ODIN_DEBUG_HEAP + +INTEGER_BITS :: 8 * size_of(int) +SECTOR_BITS :: 8 * size_of(uint) + +// The tests are specific to feoramalloc, but the benchmarks are general-purpose. 
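The utility procedures in the section below implement a deterministic fill-and-verify pattern: buffers are written from a seeded PRNG and later re-checked against the same seed, so corruption is detected without keeping a reference copy of the data. A minimal usage sketch, relying on the utilities and imports defined in this file; `check_roundtrip` itself is hypothetical:

```odin
// Uses the verify_zeroed/randomize_bytes/verify_integrity utilities defined
// below and this file's base:intrinsics import; check_roundtrip is illustrative only.
check_roundtrip :: proc(bytes: []byte) {
	seed := u64(intrinsics.read_cycle_counter())
	verify_zeroed(bytes)          // freshly .Alloc'd memory must come back zeroed
	randomize_bytes(bytes, seed)  // fill with a reproducible pseudo-random stream
	verify_integrity(bytes, seed) // regenerate the same stream and compare byte-by-byte
}
```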
+ +// +// Utility +// + +expect :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) { + if !condition { + @(cold) + internal :: proc "contextless" (message: string, loc: runtime.Source_Code_Location) { + runtime.print_string("\n* Expectation failed: ") + runtime.print_string(message) + runtime.print_string(" @ ") + runtime.print_caller_location(loc) + runtime.print_string("\n\n") + when ODIN_DEBUG { + intrinsics.debug_trap() + } else { + intrinsics.trap() + } + } + internal(message, loc) + } +} + +verify_zeroed_slice :: proc(bytes: []byte, loc := #caller_location) { + for b in bytes { + expect(b == 0, loc = loc) + } +} + +verify_zeroed_ptr :: proc(ptr: [^]byte, size: int, loc := #caller_location) { + for i := 0; i < size; i += 1 { + expect(ptr[i] == 0, loc = loc) + } +} + +verify_zeroed :: proc { + verify_zeroed_slice, + verify_zeroed_ptr, +} + +verify_integrity_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + buf: [1]byte + rand.reset(seed) + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + expect(bytes[i+j] == buf[j], loc = loc) + } + } +} + +verify_integrity_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + verify_integrity_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +verify_integrity :: proc { + verify_integrity_slice, + verify_integrity_ptr, +} + + +randomize_bytes_slice :: proc(bytes: []byte, seed: u64, loc := #caller_location) { + rand.reset(seed) + buf: [1]byte + for i := 0; i < len(bytes); i += len(buf) { + expect(rand.read(buf[:]) == len(buf), loc = loc) + length := min(len(buf), len(bytes) - i) + for j := 0; j < length; j += 1 { + bytes[i+j] = buf[j] + } + } +} + +randomize_bytes_ptr :: proc(ptr: [^]byte, size: int, seed: u64, loc := #caller_location) { + randomize_bytes_slice(transmute([]byte)runtime.Raw_Slice{ + data = ptr, + len = size, + }, seed, loc) +} + +randomize_bytes :: proc { + randomize_bytes_slice, + randomize_bytes_ptr, +} + +dump_slabs :: proc() { + intrinsics.atomic_thread_fence(.Seq_Cst) + superpage := runtime.local_heap + for { + if superpage == nil { + return + } + log.infof("+++ Superpage at %p", superpage) + log.infof(" - Free slabs: %i", superpage.free_slabs) + log.infof(" - Next free slab index: %i", superpage.next_free_slab_index) + for i := 0; i < runtime.HEAP_SLAB_COUNT; /**/ { + slab := runtime.heap_superpage_index_slab(superpage, i) + if slab.bin_size == 0 { + // log.infof("Slab %i unused.", slab.index) + expect(i == slab.index) + } else { + log.infof("%#v", slab) + if slab.free_bins == 0 { + expect(intrinsics.atomic_load(&slab.is_full)) + } + } + if slab.bin_size > runtime.HEAP_MAX_BIN_SIZE { + // Skip contiguous slabs. + i += runtime.heap_slabs_needed_for_size(slab.bin_size) + } else { + i += 1 + } + } + log.info("") + superpage = superpage.next + } +} + +validate_cache :: proc() { + cache := runtime.local_heap_cache + slab_map_terminated: [runtime.HEAP_BIN_RANKS]bool + superpages_with_free_slabs_terminated: bool + for { + // Validate the slab map. + for rank in 0.. 
0) + } + } + + next_cache := intrinsics.atomic_load_explicit(&cache.next_cache_block, .Acquire) + if next_cache == nil { + break + } + cache = next_cache + } +} + +// +// Allocation API Testing +// + +Size_Strategy :: enum { + Adding, + Multiplying, + Randomizing, +} + +Free_Strategy :: enum { + Never, + At_The_End, // free at end of allocs + Interleaved, // free X after Y allocs +} + +Free_Direction :: enum { + Forward, // like a queue + Backward, // like a stack + Randomly, +} + +test_alloc_write_free :: proc( + object_count: int, + + starting_size: int, + final_size: int, + + size_strategy: Size_Strategy, + size_operand: int, + + allocs_per_free_operation: int, + free_operations_at_once: int, + free_strategy: Free_Strategy, + free_direction: Free_Direction, +) { + Allocation :: struct { + data: []byte, + seed: u64, + } + pointers := make([]Allocation, object_count, context.temp_allocator) + + allocator := context.allocator + size := starting_size + start_index := 0 + end_index := 0 + + allocs := 0 + + log.infof("AWF: %i objects. Size: [%i..=%i] %v by %i each allocation. %i freed every %i, %v and %v.", object_count, starting_size, final_size, size_strategy, size_operand, free_operations_at_once, allocs_per_free_operation, free_strategy, free_direction) + + for o in 1..=u64(object_count) { + seed := u64(intrinsics.read_cycle_counter()) * o + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + expect(alloc_err == nil) + pointers[end_index] = Allocation{ + data = bytes, + seed = seed, + } + end_index += 1 + allocs += 1 + + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + if size < final_size { + switch size_strategy { + case .Adding: size += size_operand + case .Multiplying: size *= size_operand + case .Randomizing: size = starting_size + rand.int_max(final_size - starting_size) + } + if final_size > starting_size { + size = min(size, final_size) + } else { + size = max(size, final_size) + } + } + + if allocs % allocs_per_free_operation != 0 { + continue + } + + switch free_strategy { + case .Never, .At_The_End: + break + case .Interleaved: + for _ in 0.. 0 && size % (runtime.HEAP_MAX_BIN_SIZE/8) == 0 { + log.infof("... %i ...", size) + } + alignment := min(size, runtime.HEAP_MAX_ALIGNMENT) + + // Allocate and free twice to make sure that the memory is truly zeroed. + // + // This works on the assumption that the allocator will return the same + // pointer if we allocate, free, then allocate again with the same + // characteristics. + // + // libc malloc does not guarantee this behavior, but feoramalloc does + // in non-parallel scenarios. 
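+		//
+		// If a recycled bin were handed back without being zeroed again, the
+		// second pass's verify_zeroed below would observe the random bytes
+		// written during the first pass and trap.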
+ old_ptr: rawptr + for i in 0..<2 { + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0) + if i == 0 { + old_ptr = raw_data(bytes) + } else { + if old_ptr != raw_data(bytes) { + different_pointers += 1 + } + } + expect(alloc_err == nil) + verify_zeroed(bytes) + randomize_bytes(bytes, u64(intrinsics.read_cycle_counter())) + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(bytes), 0) + expect(free_err == nil) + } + } + if different_pointers > 0 { + log.warnf("There were %i cases in which the allocator didn't return the same pointer after allocating, freeing, then allocating again with the same size.", different_pointers) + } + log.info("Done.") +} + +test_single_alloc_and_resize :: proc(start, target: int) { + log.infof("Testing allocation of %i bytes, resizing to %i, then resizing back.", start, target) + allocator := context.allocator + base_seed := u64(intrinsics.read_cycle_counter()) + + alignment := min(start, runtime.HEAP_MAX_ALIGNMENT) + seed := base_seed * (1+u64(start)) + + bytes, alloc_err := allocator.procedure(allocator.data, .Alloc, start, alignment, nil, 0) + expect(alloc_err == nil) + expect(len(bytes) == start) + verify_zeroed(bytes) + randomize_bytes(bytes, seed) + + resized_bytes_1, resize_1_err := allocator.procedure(allocator.data, .Resize, target, alignment, raw_data(bytes), start) + expect(resize_1_err == nil) + expect(len(resized_bytes_1) == target) + verify_integrity(resized_bytes_1[:min(start, target)], seed) + if target > start { + verify_zeroed(resized_bytes_1[start:]) + } + + resized_bytes_2, resize_2_err := allocator.procedure(allocator.data, .Resize, start, alignment, raw_data(resized_bytes_1), target) + expect(resize_2_err == nil) + expect(len(resized_bytes_2) == start) + verify_integrity(resized_bytes_2[:min(start, target)], seed) + if start > target { + verify_zeroed(resized_bytes_2[target:]) + } + + _, free_err := allocator.procedure(allocator.data, .Free, 0, 0, raw_data(resized_bytes_2), 0) + expect(free_err == nil) +} + +/* +This test helped find an issue with the orphanage. +*/ +test_parallel_pointer_passing :: proc(thread_count: int) { + Data :: struct { + thread: ^thread.Thread, + ptr: ^^int, + sema: sync.Sema, + friend: ^sync.Sema, + wg: ^sync.Wait_Group, + } + + task :: proc(t: ^thread.Thread) { + data := cast(^Data)t.data + sync.wait(&data.sema) + expect(data.ptr != nil) + expect(data.ptr^ != nil) + expect(data.ptr^^ != 0) + free(data.ptr^) + data.ptr^ = new(int) + expect(data.ptr^^ == 0) + data.ptr^^ = int(intrinsics.read_cycle_counter()) + if data.friend != nil { + sync.post(data.friend) + } + sync.wait_group_done(data.wg) + } + + data := new(int) + data^ = int(intrinsics.read_cycle_counter()) + tasks := make([]Data, thread_count, context.temp_allocator) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + for i in 0.. 0) + expect(slab.bin_size > 0) + expect(intrinsics.atomic_load_explicit(&slab.remote_free_bins_scheduled, .Acquire) == 0) + + list := make([]^int, slab.max_bins) + list[0] = o + + for i in 0..= intrinsics.atomic_load_explicit(data.all_pointers_len, .Acquire) { + // Spinlock. 
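+			// i.e. wait until the producer has published the pointer for this
+			// ticket; cpu_relax is a pause hint while busy-waiting.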
+ intrinsics.cpu_relax() + } + intrinsics.atomic_thread_fence(.Seq_Cst) + + ptr := data.all_pointers[ticket] + + expect(ptr != nil) + i_ptr := cast(^i64)ptr + expect(i_ptr^ == 0) + val := intrinsics.read_cycle_counter() + i_ptr^ = val + expect(i_ptr^ == val) + free(ptr) + } + + sync.wait_group_done(data.wg) + } + + start_time: time.Time + + barrier: sync.Barrier + sync.barrier_init(&barrier, thread_count) + + wg: sync.Wait_Group + sync.wait_group_add(&wg, thread_count) + + // non-atomic + all_pointers := make([]rawptr, thread_count*allocs_per_thread) + defer delete(all_pointers) + // atomic + all_pointers_len := 0 + all_pointers_ticket := 0 + + consumers := make([]Consumer_Data, thread_count) + defer delete(consumers) + + for i in 0.. Slab + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_SLAB_SIZE) + test_single_alloc_and_resize(runtime.HEAP_SLAB_SIZE, runtime.HEAP_MAX_BIN_SIZE) + + // Bin <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE) + + // Slab <-> Huge + test_single_alloc_and_resize(runtime.HEAP_MAX_BIN_SIZE + 1, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD, runtime.HEAP_MAX_BIN_SIZE + 1) + + // Inter-huge tests. + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 2, runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1) + test_single_alloc_and_resize(runtime.HEAP_HUGE_ALLOCATION_THRESHOLD + 1, runtime.SUPERPAGE_SIZE) + + // Larger-than-superpage tests. + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE, runtime.SUPERPAGE_SIZE * 3) + test_single_alloc_and_resize(runtime.SUPERPAGE_SIZE * 3, runtime.SUPERPAGE_SIZE) + + // Brute-force tests. 
+ test_individual_allocation_and_free(runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + test_continuous_allocation_of_size_n(16, runtime.HEAP_MAX_BIN_SIZE if opt.long else 1024) + + test_alloc_write_free( + object_count = 400, + starting_size = 16, final_size = 16, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 4096, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 32768, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 4, + free_strategy = .Interleaved, + free_direction = .Randomly, + ) + + test_alloc_write_free( + object_count = 100, + starting_size = 2, final_size = 8096, + size_strategy = .Adding, size_operand = 2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .Interleaved, + free_direction = .Backward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 8096, final_size = 2, + size_strategy = .Adding, size_operand = -2, + allocs_per_free_operation = 16, + free_operations_at_once = 15, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 10, + starting_size = 65535, final_size = 65535, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 300000, + starting_size = 8, final_size = 8, + size_strategy = .Adding, size_operand = 0, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = runtime.HEAP_SLAB_COUNT*2, + starting_size = runtime.HEAP_SLAB_SIZE/2, final_size = runtime.HEAP_SLAB_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + test_alloc_write_free( + object_count = 2, + starting_size = runtime.SUPERPAGE_SIZE/2, final_size = runtime.SUPERPAGE_SIZE*4, + size_strategy = .Multiplying, size_operand = 2, + allocs_per_free_operation = 1, + free_operations_at_once = 1, + free_strategy = .At_The_End, + free_direction = .Forward, + ) + + // This is a lengthy test and won't tell us much more than any other test will. 
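+		// It is therefore only run when opt.long is set.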
+ if opt.long { + test_single_alloc_and_resize_incremental(0, runtime.HEAP_SLAB_SIZE) + } + + runtime.compact_heap() + + test_serial_bin_sanity() + + runtime.compact_heap() + } + + if opt.serial_benchmarks { + log.info("--- Single-threaded benchmarks ---") + + log.info("* Freeing forwards ...") + bench_alloc_n_then_free_n(1_000_000, int) + bench_alloc_n_then_free_n(1_000_000, Struct_16) + bench_alloc_n_then_free_n(1_000_000, Struct_32) + bench_alloc_n_then_free_n(1_000_000, Struct_64) + bench_alloc_n_then_free_n(1_000_000, Struct_512) + bench_alloc_n_then_free_n(10_000, [8192]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing backwards ...") + bench_alloc_n_then_free_n_backwards(1_000_000, int) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_16) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_32) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_64) + bench_alloc_n_then_free_n_backwards(1_000_000, Struct_512) + bench_alloc_n_then_free_n_backwards(10_000, [8192]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_backwards(1000, [runtime.HEAP_SLAB_SIZE*4]u8) + bench_alloc_n_then_free_n_backwards(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Freeing randomly ...") + bench_alloc_n_then_free_n_randomly(1_000_000, int) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_16) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_32) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_64) + bench_alloc_n_then_free_n_randomly(1_000_000, Struct_512) + bench_alloc_n_then_free_n_randomly(10_000, [8192]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE/4]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE-runtime.HEAP_SLAB_ALLOCATION_BOOK_KEEPING]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE]u8) + bench_alloc_n_then_free_n_randomly(10_000, [runtime.HEAP_SLAB_SIZE*2]u8) + bench_alloc_n_then_free_n_randomly(10, [runtime.SUPERPAGE_SIZE]u8) + + log.info("* Allocating and freeing repeatedly ...") + bench_alloc_1_then_free_1_repeatedly(100_000, int) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_16) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_32) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_64) + bench_alloc_1_then_free_1_repeatedly(100_000, Struct_512) + bench_alloc_1_then_free_1_repeatedly(10_000, [8192]u8) + } + + if opt.parallel_benchmarks { + log.info("--- Multi-threaded benchmarks ---") + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_16) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_16) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_32) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_64) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_64) + + bench_1_producer_n_consumer_for_m_alloc(1, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(2, 10_000, Struct_512) + bench_1_producer_n_consumer_for_m_alloc(4, 10_000, Struct_512) + + bench_1_producer_n_consumer_for_m_alloc(1, 1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(2, 
1_000, [8192]u8) + bench_1_producer_n_consumer_for_m_alloc(4, 1_000, [8192]u8) + + bench_1_producer_n_consumer_for_m_alloc(4, 10, [runtime.HEAP_SLAB_SIZE]u8) + + when .Thread not_in ODIN_SANITIZER_FLAGS { + // NOTE: TSan doesn't work well with excessive thread counts, + // in my experience. + bench_1_producer_n_consumer_for_m_alloc(32, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(64, 1_000, Struct_32) + bench_1_producer_n_consumer_for_m_alloc(128, 1_000, Struct_32) + } + } + + } + + log.info("Tests complete.") + + if opt.compact { + runtime.compact_heap() + log.info("The main thread's heap has been compacted.") + } + + if opt.coverage { + when runtime.ODIN_DEBUG_HEAP { + log.info("--- Code coverage report ---") + for key in runtime.Heap_Code_Coverage_Type { + value := intrinsics.atomic_load_explicit(&runtime.heap_global_code_coverage[key], .Acquire) + log.infof("%s%v: %v", ">>> " if value == 0 else "", key, value) + } + + log.infof("Full code coverage: %v", runtime._check_heap_code_coverage()) + } else { + log.error("ODIN_DEBUG_HEAP is not enabled, thus coverage cannot be calculated.") + } + } + + if opt.info { + heap_info := runtime.get_local_heap_info() + log.infof("%#v", heap_info) + } + + // if .Dump_Slabs in params { + // dump_slabs() + // } + + if opt.trap { + intrinsics.debug_trap() + } +}