
Commit dbc0ed2

Unify stable and unstable sort implementations in same core module
This moves the stable sort implementation to the `core::slice::sort` module. By virtue of being in `core` it can't access `Vec`. The two `Vec`s used by merge sort, `buf` and `runs`, are modelled as custom types that implement the very limited `Vec` interface that is required, with the help of caller-provided allocation and free functions. This is done to allow future re-use of functions and logic between the stable and unstable sorts, such as `insert_head`.
1 parent 736c675 commit dbc0ed2
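
To make the design described in the commit message concrete: `core` cannot allocate, so the sort receives allocation and free functions from its caller and wraps the returned raw memory in small owner types that expose only what the algorithm needs (a pointer, a capacity, deallocation on drop). The sketch below is hypothetical, with made-up names, and is not the actual `core::slice::sort` code; it only shows the general shape of such a minimal `Vec` replacement:

```rust
// Hypothetical sketch of a minimal, Vec-like scratch buffer driven by caller-provided
// allocation/free callbacks. Names and layout are illustrative, not the real core types.
struct ElemBuf<T, DeallocF: Fn(*mut T, usize)> {
    ptr: *mut T,
    capacity: usize,
    dealloc_fn: DeallocF,
}

impl<T, DeallocF: Fn(*mut T, usize)> ElemBuf<T, DeallocF> {
    fn new(len: usize, alloc_fn: impl Fn(usize) -> *mut T, dealloc_fn: DeallocF) -> Self {
        // The caller guarantees `alloc_fn(len)` yields storage for `len` elements of `T`.
        Self { ptr: alloc_fn(len), capacity: len, dealloc_fn }
    }

    fn as_mut_ptr(&mut self) -> *mut T {
        self.ptr
    }
}

impl<T, DeallocF: Fn(*mut T, usize)> Drop for ElemBuf<T, DeallocF> {
    fn drop(&mut self) {
        // Return the raw allocation through the same callback pair that produced it
        // (in liballoc these are the `elem_alloc_fn`/`elem_dealloc_fn` closures below).
        (self.dealloc_fn)(self.ptr, self.capacity);
    }
}
```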

3 files changed: +540 -310 lines changed

library/alloc/src/slice.rs: +39 -309
@@ -19,10 +19,12 @@ use core::cmp::Ordering::{self, Less};
 use core::mem::{self, SizedTypeProperties};
 #[cfg(not(no_global_oom_handling))]
 use core::ptr;
+#[cfg(not(no_global_oom_handling))]
+use core::slice::sort;
 
 use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
-use crate::alloc::Global;
+use crate::alloc::{self, Global};
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::ToOwned;
 use crate::boxed::Box;
@@ -203,7 +205,7 @@ impl<T> [T] {
     where
         T: Ord,
     {
-        merge_sort(self, T::lt);
+        stable_sort(self, T::lt);
     }
 
     /// Sorts the slice with a comparator function.
@@ -259,7 +261,7 @@ impl<T> [T] {
     where
         F: FnMut(&T, &T) -> Ordering,
     {
-        merge_sort(self, |a, b| compare(a, b) == Less);
+        stable_sort(self, |a, b| compare(a, b) == Less);
     }
 
     /// Sorts the slice with a key extraction function.
@@ -302,7 +304,7 @@ impl<T> [T] {
         F: FnMut(&T) -> K,
         K: Ord,
     {
-        merge_sort(self, |a, b| f(a).lt(&f(b)));
+        stable_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
     /// Sorts the slice with a key extraction function.
@@ -809,324 +811,52 @@ impl<T: Clone> ToOwned for [T] {
 // Sorting
 ////////////////////////////////////////////////////////////////////////////////
 
-/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
-///
-/// This is the integral subroutine of insertion sort.
-#[cfg(not(no_global_oom_handling))]
-fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    if v.len() >= 2 && is_less(&v[1], &v[0]) {
-        unsafe {
-            // There are three ways to implement insertion here:
-            //
-            // 1. Swap adjacent elements until the first one gets to its final destination.
-            //    However, this way we copy data around more than is necessary. If elements are big
-            //    structures (costly to copy), this method will be slow.
-            //
-            // 2. Iterate until the right place for the first element is found. Then shift the
-            //    elements succeeding it to make room for it and finally place it into the
-            //    remaining hole. This is a good method.
-            //
-            // 3. Copy the first element into a temporary variable. Iterate until the right place
-            //    for it is found. As we go along, copy every traversed element into the slot
-            //    preceding it. Finally, copy data from the temporary variable into the remaining
-            //    hole. This method is very good. Benchmarks demonstrated slightly better
-            //    performance than with the 2nd method.
-            //
-            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
-            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
-
-            // Intermediate state of the insertion process is always tracked by `hole`, which
-            // serves two purposes:
-            // 1. Protects integrity of `v` from panics in `is_less`.
-            // 2. Fills the remaining hole in `v` in the end.
-            //
-            // Panic safety:
-            //
-            // If `is_less` panics at any point during the process, `hole` will get dropped and
-            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
-            // initially held exactly once.
-            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
-            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
-
-            for i in 2..v.len() {
-                if !is_less(&v[i], &*tmp) {
-                    break;
-                }
-                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
-                hole.dest = &mut v[i];
-            }
-            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
-        }
-    }
-
-    // When dropped, copies from `src` into `dest`.
-    struct InsertionHole<T> {
-        src: *const T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for InsertionHole<T> {
-        fn drop(&mut self) {
-            unsafe {
-                ptr::copy_nonoverlapping(self.src, self.dest, 1);
-            }
-        }
-    }
-}
-
-/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
-/// stores the result into `v[..]`.
-///
-/// # Safety
-///
-/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
-/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-#[cfg(not(no_global_oom_handling))]
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    let len = v.len();
-    let v = v.as_mut_ptr();
-    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
-
-    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
-    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
-    // copying the lesser (or greater) one into `v`.
-    //
-    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
-    // consumed first, then we must copy whatever is left of the shorter run into the remaining
-    // hole in `v`.
-    //
-    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
-    // 1. Protects integrity of `v` from panics in `is_less`.
-    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
-    //
-    // Panic safety:
-    //
-    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
-    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
-    // object it initially held exactly once.
-    let mut hole;
-
-    if mid <= len - mid {
-        // The left run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v, buf, mid);
-            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
-        }
-
-        // Initially, these pointers point to the beginnings of their arrays.
-        let left = &mut hole.start;
-        let mut right = v_mid;
-        let out = &mut hole.dest;
-
-        while *left < hole.end && right < v_end {
-            // Consume the lesser side.
-            // If equal, prefer the left run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right, &**left) {
-                    get_and_increment(&mut right)
-                } else {
-                    get_and_increment(left)
-                };
-                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
-            }
-        }
-    } else {
-        // The right run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
-            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
-        }
-
-        // Initially, these pointers point past the ends of their arrays.
-        let left = &mut hole.dest;
-        let right = &mut hole.end;
-        let mut out = v_end;
-
-        while v < *left && buf < *right {
-            // Consume the greater side.
-            // If equal, prefer the right run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
-                    decrement_and_get(left)
-                } else {
-                    decrement_and_get(right)
-                };
-                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
-            }
-        }
-    }
-    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
-    // it will now be copied into the hole in `v`.
-
-    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
-        let old = *ptr;
-        *ptr = unsafe { ptr.add(1) };
-        old
-    }
-
-    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
-        *ptr = unsafe { ptr.sub(1) };
-        *ptr
-    }
-
-    // When dropped, copies the range `start..end` into `dest..`.
-    struct MergeHole<T> {
-        start: *mut T,
-        end: *mut T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for MergeHole<T> {
-        fn drop(&mut self) {
-            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
-            unsafe {
-                let len = self.end.sub_ptr(self.start);
-                ptr::copy_nonoverlapping(self.start, self.dest, len);
-            }
-        }
-    }
-}
-
-/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
-/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
-///
-/// The algorithm identifies strictly descending and non-descending subsequences, which are called
-/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
-/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
-/// satisfied:
-///
-/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
-/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
-///
-/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+#[inline]
 #[cfg(not(no_global_oom_handling))]
-fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // Slices of up to this length get sorted using insertion sort.
-    const MAX_INSERTION: usize = 20;
-    // Very short runs are extended using insertion sort to span at least this many elements.
-    const MIN_RUN: usize = 10;
-
-    // Sorting has no meaningful behavior on zero-sized types.
     if T::IS_ZST {
+        // Sorting has no meaningful behavior on zero-sized types. Do nothing.
         return;
     }
 
-    let len = v.len();
-
-    // Short arrays get sorted in-place via insertion sort to avoid allocations.
-    if len <= MAX_INSERTION {
-        if len >= 2 {
-            for i in (0..len - 1).rev() {
-                insert_head(&mut v[i..], &mut is_less);
-            }
-        }
-        return;
-    }
-
-    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
-    // shallow copies of the contents of `v` without risking the dtors running on copies if
-    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
-    // which will always have length at most `len / 2`.
-    let mut buf = Vec::with_capacity(len / 2);
+    let elem_alloc_fn = |len: usize| -> *mut T {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
+        // elements.
+        unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
+    };
 
-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut runs = vec![];
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
-        }
-
-        // Insert some more elements into the run if it's too short. Insertion sort is faster than
-        // merge sort on short sequences, so this significantly improves performance.
-        while start > 0 && end - start < MIN_RUN {
-            start -= 1;
-            insert_head(&mut v[start..end], &mut is_less);
+    let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
         }
+    };
 
-        // Push this run onto the stack.
-        runs.push(Run { start, len: end - start });
-        end = start;
-
-        // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(&runs) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            unsafe {
-                merge(
-                    &mut v[left.start..right.start + right.len],
-                    left.len,
-                    buf.as_mut_ptr(),
-                    &mut is_less,
-                );
-            }
-            runs[r] = Run { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
+    let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
+        // obscene length or 0.
+        unsafe {
+            alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
+                as *mut sort::TimSortRun
         }
-    }
-
-    // Finally, exactly one run must remain in the stack.
-    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+    };
 
-    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
-    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
-    // algorithm should continue building a new run instead, `None` is returned.
-    //
-    // TimSort is infamous for its buggy implementations, as described here:
-    // http://envisage-project.eu/timsort-specification-and-verification/
-    //
-    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
-    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
-    // hold for *all* runs in the stack.
-    //
-    // This function correctly checks invariants for the top four runs. Additionally, if the top
-    // run starts at index 0, it will always demand a merge operation until the stack is fully
-    // collapsed, in order to complete the sort.
-    #[inline]
-    fn collapse(runs: &[Run]) -> Option<usize> {
-        let n = runs.len();
-        if n >= 2
-            && (runs[n - 1].start == 0
-                || runs[n - 2].len <= runs[n - 1].len
-                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
-                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
-        {
-            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
-        } else {
-            None
+    let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
+        // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(
+                buf_ptr as *mut u8,
+                alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
+            );
         }
-    }
+    };
 
-    #[derive(Clone, Copy)]
-    struct Run {
-        start: usize,
-        len: usize,
-    }
+    sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
 }
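
For context on how the new closure-based interface fits together: the final call above passes the comparison plus four allocation callbacks into `core::slice::sort::merge_sort`. The signature sketched below is only an assumption about its rough shape, inferred from this call site (the authoritative declaration lives in `library/core/src/slice/sort.rs`), and the `TimSortRun` stub merely illustrates that a run is described by a start index and a length:

```rust
// Assumed shape only, inferred from the call in `stable_sort` above; see
// library/core/src/slice/sort.rs for the real declaration.
pub struct TimSortRun {
    // Fields assumed: a run is a `start` index and a `len` within `v`.
    pub start: usize,
    pub len: usize,
}

pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
    v: &mut [T],
    is_less: &mut CmpF,
    elem_alloc_fn: ElemAllocF,
    elem_dealloc_fn: ElemDeallocF,
    run_alloc_fn: RunAllocF,
    run_dealloc_fn: RunDeallocF,
) where
    CmpF: FnMut(&T, &T) -> bool,
    ElemAllocF: Fn(usize) -> *mut T,
    ElemDeallocF: Fn(*mut T, usize),
    RunAllocF: Fn(usize) -> *mut TimSortRun,
    RunDeallocF: Fn(*mut TimSortRun, usize),
{
    // Body omitted: the merge-sort logic itself now lives in core::slice::sort.
}
```

Keeping the allocation strategy on the `alloc` side and the algorithm on the `core` side is what lets the stable and unstable sorts share helpers such as `insert_head` in the future.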
