1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
//! This module contains the entry points for `slice::sort`.
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
use crate::slice::sort::shared::smallsort::{
    insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};
pub(crate) mod drift;
pub(crate) mod merge;
pub(crate) mod quicksort;
/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
///
/// Upholds all safety properties outlined here:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md>
#[inline(always)]
pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // Arrays of zero-sized types are always all-equal, and thus sorted.
    if T::IS_ZST {
        return;
    }
    // Instrumenting the standard library showed that 90+% of the calls to sort
    // by rustc are either of size 0 or 1.
    let len = v.len();
    if intrinsics::likely(len < 2) {
        return;
    }
    // More advanced sorting methods than insertion sort are faster if called in
    // a hot loop for small inputs, but for general-purpose code the small
    // binary size of insertion sort is more important. The instruction cache in
    // modern processors is very valuable, and for a single sort call in general
    // purpose code any gains from an advanced method are cancelled by i-cache
    // misses during the sort, and thrashing the i-cache for surrounding code.
    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
        insertion_sort_shift_left(v, 1, is_less);
        return;
    }
    driftsort_main::<T, F, BufT>(v, is_less);
}
/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
    // be sorted using stable quicksort, which allows better performance on
    // random and low-cardinality distributions. However, we still want to
    // reduce our memory usage to n / 2 for large inputs. We do this by scaling
    // our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
    // small inputs and n / 2 for large inputs, without a sudden drop off. We
    // also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the
    // small-sort always needs this much memory.
    const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB
    let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::<T>();
    let len = v.len();
    let alloc_len =
        cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN);
    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
    let mut stack_buf = AlignedStorage::<T, 4096>::new();
    let stack_scratch = stack_buf.as_uninit_slice_mut();
    let mut heap_buf;
    let scratch = if stack_scratch.len() >= alloc_len {
        stack_scratch
    } else {
        heap_buf = BufT::with_capacity(alloc_len);
        heap_buf.as_uninit_slice_mut()
    };
    // For small inputs using quicksort is not yet beneficial, and a single
    // small-sort or two small-sorts plus a single merge outperforms it, so use
    // eager mode.
    let eager_sort = len <= T::small_sort_threshold() * 2;
    crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less);
}
#[doc(hidden)]
/// Abstracts owned memory buffer, so that sort code can live in core where no allocation is
/// possible. This trait can then be implemented in a place that has access to allocation.
pub trait BufGuard<T> {
    /// Creates new buffer that holds at least `capacity` memory.
    fn with_capacity(capacity: usize) -> Self;
    /// Returns mutable access to uninitialized memory owned by the buffer.
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}
#[repr(C)]
struct AlignedStorage<T, const N: usize> {
    _align: [T; 0],
    storage: [MaybeUninit<u8>; N],
}
impl<T, const N: usize> AlignedStorage<T, N> {
    fn new() -> Self {
        Self { _align: [], storage: [const { MaybeUninit::uninit() }; N] }
    }
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        let len = N / mem::size_of::<T>();
        // SAFETY: `_align` ensures we are correctly aligned.
        unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) }
    }
}