Skip to content

Commit cab4bff

Browse files
committed
Auto merge of #40601 - stjepang:sort-unstable, r=alexcrichton
Implement feature sort_unstable

Tracking issue for the feature: #40585

This is essentially an integration of [pdqsort](https://github.com/stjepang/pdqsort) into libcore.

There are plenty of unsafe blocks to review. The heart of pdqsort is `fn partition_in_blocks`, which is probably the most challenging function to understand. It requires some patience, but let me know if you find it too difficult - comments could always be improved.

#### Changes

* Added `sort_unstable` feature.
* Tweaked insertion sort constants for stable sort. Sorting integers is now up to 5% slower, but sorting big elements is much faster (in particular, `sort_large_big_random` is 35% faster). The old constants were highly optimized for sorting integers, so overall the configuration is more balanced now. A minor regression in case of integers is forgivable as we recently had performance improvements (#39538) that completely make up for it.
* Removed some uninteresting sort benchmarks.
* Added a new sort benchmark for string sorting.
#### Benchmarks

The following table compares stable and unstable sorting:

```
name                                 stable ns/iter        unstable ns/iter      diff ns/iter  diff %
slice::sort_large_ascending          7,240 (11049 MB/s)    7,380 (10840 MB/s)    140           1.93%
slice::sort_large_big_random         1,454,138 (880 MB/s)  910,269 (1406 MB/s)   -543,869      -37.40%
slice::sort_large_descending         13,450 (5947 MB/s)    10,895 (7342 MB/s)    -2,555        -19.00%
slice::sort_large_mostly_ascending   204,041 (392 MB/s)    88,639 (902 MB/s)     -115,402      -56.56%
slice::sort_large_mostly_descending  217,109 (368 MB/s)    99,009 (808 MB/s)     -118,100      -54.40%
slice::sort_large_random             477,257 (167 MB/s)    346,028 (231 MB/s)    -131,229      -27.50%
slice::sort_large_random_expensive   21,670,537 (3 MB/s)   22,710,238 (3 MB/s)   1,039,701     4.80%
slice::sort_large_strings            6,284,499 (38 MB/s)   6,410,896 (37 MB/s)   126,397       2.01%
slice::sort_medium_random            3,515 (227 MB/s)      3,327 (240 MB/s)      -188          -5.35%
slice::sort_small_ascending          42 (1904 MB/s)        41 (1951 MB/s)        -1            -2.38%
slice::sort_small_big_random         503 (2544 MB/s)       514 (2490 MB/s)       11            2.19%
slice::sort_small_descending         72 (1111 MB/s)        69 (1159 MB/s)        -3            -4.17%
slice::sort_small_random             369 (216 MB/s)        367 (217 MB/s)        -2            -0.54%
```

Interesting cases:

* Expensive comparison function and string sorting - it's a really close race, but timsort performs a slightly smaller number of comparisons. This is a natural difference of bottom-up merging versus top-down partitioning.
* `large_descending` - unstable sort is faster, but both sorts should have equivalent performance. Both just check whether the slice is descending and if so, they reverse it. I blame LLVM for the discrepancy.

r? @alexcrichton
2 parents 58c701f + a718051 commit cab4bff

File tree

10 files changed

+1069
-113
lines changed

10 files changed

+1069
-113
lines changed

src/libcollections/benches/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#![deny(warnings)]
1212

1313
#![feature(rand)]
14+
#![feature(sort_unstable)]
1415
#![feature(test)]
1516

1617
extern crate test;

src/libcollections/benches/slice.rs

+61-49
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ fn random_inserts(b: &mut Bencher) {
169169
}
170170
})
171171
}
172+
172173
#[bench]
173174
fn random_removes(b: &mut Bencher) {
174175
let mut rng = thread_rng();
@@ -216,65 +217,76 @@ fn gen_mostly_descending(len: usize) -> Vec<u64> {
216217
v
217218
}
218219

219-
fn gen_big_random(len: usize) -> Vec<[u64; 16]> {
220+
fn gen_strings(len: usize) -> Vec<String> {
220221
let mut rng = thread_rng();
221-
rng.gen_iter().map(|x| [x; 16]).take(len).collect()
222-
}
223-
224-
fn gen_big_ascending(len: usize) -> Vec<[u64; 16]> {
225-
(0..len as u64).map(|x| [x; 16]).take(len).collect()
222+
let mut v = vec![];
223+
for _ in 0..len {
224+
let n = rng.gen::<usize>() % 20 + 1;
225+
v.push(rng.gen_ascii_chars().take(n).collect());
226+
}
227+
v
226228
}
227229

228-
fn gen_big_descending(len: usize) -> Vec<[u64; 16]> {
229-
(0..len as u64).rev().map(|x| [x; 16]).take(len).collect()
230+
fn gen_big_random(len: usize) -> Vec<[u64; 16]> {
231+
let mut rng = thread_rng();
232+
rng.gen_iter().map(|x| [x; 16]).take(len).collect()
230233
}
231234

232-
macro_rules! sort_bench {
233-
($name:ident, $gen:expr, $len:expr) => {
235+
macro_rules! sort {
236+
($f:ident, $name:ident, $gen:expr, $len:expr) => {
234237
#[bench]
235238
fn $name(b: &mut Bencher) {
236-
b.iter(|| $gen($len).sort());
239+
b.iter(|| $gen($len).$f());
237240
b.bytes = $len * mem::size_of_val(&$gen(1)[0]) as u64;
238241
}
239242
}
240243
}
241244

242-
sort_bench!(sort_small_random, gen_random, 10);
243-
sort_bench!(sort_small_ascending, gen_ascending, 10);
244-
sort_bench!(sort_small_descending, gen_descending, 10);
245-
246-
sort_bench!(sort_small_big_random, gen_big_random, 10);
247-
sort_bench!(sort_small_big_ascending, gen_big_ascending, 10);
248-
sort_bench!(sort_small_big_descending, gen_big_descending, 10);
249-
250-
sort_bench!(sort_medium_random, gen_random, 100);
251-
sort_bench!(sort_medium_ascending, gen_ascending, 100);
252-
sort_bench!(sort_medium_descending, gen_descending, 100);
253-
254-
sort_bench!(sort_large_random, gen_random, 10000);
255-
sort_bench!(sort_large_ascending, gen_ascending, 10000);
256-
sort_bench!(sort_large_descending, gen_descending, 10000);
257-
sort_bench!(sort_large_mostly_ascending, gen_mostly_ascending, 10000);
258-
sort_bench!(sort_large_mostly_descending, gen_mostly_descending, 10000);
259-
260-
sort_bench!(sort_large_big_random, gen_big_random, 10000);
261-
sort_bench!(sort_large_big_ascending, gen_big_ascending, 10000);
262-
sort_bench!(sort_large_big_descending, gen_big_descending, 10000);
245+
macro_rules! sort_expensive {
246+
($f:ident, $name:ident, $gen:expr, $len:expr) => {
247+
#[bench]
248+
fn $name(b: &mut Bencher) {
249+
b.iter(|| {
250+
let mut v = $gen($len);
251+
let mut count = 0;
252+
v.$f(|a: &u64, b: &u64| {
253+
count += 1;
254+
if count % 1_000_000_000 == 0 {
255+
panic!("should not happen");
256+
}
257+
(*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
258+
});
259+
black_box(count);
260+
});
261+
b.bytes = $len as u64 * mem::size_of::<u64>() as u64;
262+
}
263+
}
264+
}
263265

264-
#[bench]
265-
fn sort_large_random_expensive(b: &mut Bencher) {
266-
let len = 10000;
267-
b.iter(|| {
268-
let mut v = gen_random(len);
269-
let mut count = 0;
270-
v.sort_by(|a: &u64, b: &u64| {
271-
count += 1;
272-
if count % 1_000_000_000 == 0 {
273-
panic!("should not happen");
274-
}
275-
(*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
276-
});
277-
black_box(count);
278-
});
279-
b.bytes = len as u64 * mem::size_of::<u64>() as u64;
280-
}
266+
sort!(sort, sort_small_ascending, gen_ascending, 10);
267+
sort!(sort, sort_small_descending, gen_descending, 10);
268+
sort!(sort, sort_small_random, gen_random, 10);
269+
sort!(sort, sort_small_big_random, gen_big_random, 10);
270+
sort!(sort, sort_medium_random, gen_random, 100);
271+
sort!(sort, sort_large_ascending, gen_ascending, 10000);
272+
sort!(sort, sort_large_descending, gen_descending, 10000);
273+
sort!(sort, sort_large_mostly_ascending, gen_mostly_ascending, 10000);
274+
sort!(sort, sort_large_mostly_descending, gen_mostly_descending, 10000);
275+
sort!(sort, sort_large_random, gen_random, 10000);
276+
sort!(sort, sort_large_big_random, gen_big_random, 10000);
277+
sort!(sort, sort_large_strings, gen_strings, 10000);
278+
sort_expensive!(sort_by, sort_large_random_expensive, gen_random, 10000);
279+
280+
sort!(sort_unstable, sort_unstable_small_ascending, gen_ascending, 10);
281+
sort!(sort_unstable, sort_unstable_small_descending, gen_descending, 10);
282+
sort!(sort_unstable, sort_unstable_small_random, gen_random, 10);
283+
sort!(sort_unstable, sort_unstable_small_big_random, gen_big_random, 10);
284+
sort!(sort_unstable, sort_unstable_medium_random, gen_random, 100);
285+
sort!(sort_unstable, sort_unstable_large_ascending, gen_ascending, 10000);
286+
sort!(sort_unstable, sort_unstable_large_descending, gen_descending, 10000);
287+
sort!(sort_unstable, sort_unstable_large_mostly_ascending, gen_mostly_ascending, 10000);
288+
sort!(sort_unstable, sort_unstable_large_mostly_descending, gen_mostly_descending, 10000);
289+
sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
290+
sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
291+
sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
292+
sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);

src/libcollections/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#![feature(shared)]
5353
#![feature(slice_get_slice)]
5454
#![feature(slice_patterns)]
55+
#![cfg_attr(not(test), feature(sort_unstable))]
5556
#![feature(specialization)]
5657
#![feature(staged_api)]
5758
#![feature(str_internals)]

src/libcollections/slice.rs

+134-28
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,39 @@ impl<T> [T] {
10921092
merge_sort(self, |a, b| a.lt(b));
10931093
}
10941094

1095+
/// Sorts the slice using `compare` to compare elements.
1096+
///
1097+
/// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
1098+
///
1099+
/// # Current implementation
1100+
///
1101+
/// The current algorithm is an adaptive, iterative merge sort inspired by
1102+
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
1103+
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
1104+
/// two or more sorted sequences concatenated one after another.
1105+
///
1106+
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
1107+
/// non-allocating insertion sort is used instead.
1108+
///
1109+
/// # Examples
1110+
///
1111+
/// ```
1112+
/// let mut v = [5, 4, 1, 3, 2];
1113+
/// v.sort_by(|a, b| a.cmp(b));
1114+
/// assert!(v == [1, 2, 3, 4, 5]);
1115+
///
1116+
/// // reverse sorting
1117+
/// v.sort_by(|a, b| b.cmp(a));
1118+
/// assert!(v == [5, 4, 3, 2, 1]);
1119+
/// ```
1120+
#[stable(feature = "rust1", since = "1.0.0")]
1121+
#[inline]
1122+
pub fn sort_by<F>(&mut self, mut compare: F)
1123+
where F: FnMut(&T, &T) -> Ordering
1124+
{
1125+
merge_sort(self, |a, b| compare(a, b) == Less);
1126+
}
1127+
10951128
/// Sorts the slice using `f` to extract a key to compare elements by.
10961129
///
10971130
/// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
@@ -1122,37 +1155,118 @@ impl<T> [T] {
11221155
merge_sort(self, |a, b| f(a).lt(&f(b)));
11231156
}
11241157

1125-
/// Sorts the slice using `compare` to compare elements.
1158+
/// Sorts the slice, but may not preserve the order of equal elements.
11261159
///
1127-
/// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
1160+
/// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
1161+
/// and `O(n log n)` worst-case.
11281162
///
11291163
/// # Current implementation
11301164
///
1131-
/// The current algorithm is an adaptive, iterative merge sort inspired by
1132-
/// [timsort](https://en.wikipedia.org/wiki/Timsort).
1133-
/// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
1134-
/// two or more sorted sequences concatenated one after another.
1165+
/// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
1166+
/// which is a quicksort variant designed to be very fast on certain kinds of patterns,
1167+
/// sometimes achieving linear time. It is randomized but deterministic, and falls back to
1168+
/// heapsort on degenerate inputs.
11351169
///
1136-
/// Also, it allocates temporary storage half the size of `self`, but for short slices a
1137-
/// non-allocating insertion sort is used instead.
1170+
/// It is generally faster than stable sorting, except in a few special cases, e.g. when the
1171+
/// slice consists of several concatenated sorted sequences.
11381172
///
11391173
/// # Examples
11401174
///
11411175
/// ```
1176+
/// #![feature(sort_unstable)]
1177+
///
1178+
/// let mut v = [-5, 4, 1, -3, 2];
1179+
///
1180+
/// v.sort_unstable();
1181+
/// assert!(v == [-5, -3, 1, 2, 4]);
1182+
/// ```
1183+
///
1184+
/// [pdqsort]: https://github.com/orlp/pdqsort
1185+
// FIXME #40585: Mention `sort_unstable` in the documentation for `sort`.
1186+
#[unstable(feature = "sort_unstable", issue = "40585")]
1187+
#[inline]
1188+
pub fn sort_unstable(&mut self)
1189+
where T: Ord
1190+
{
1191+
core_slice::SliceExt::sort_unstable(self);
1192+
}
1193+
1194+
/// Sorts the slice using `compare` to compare elements, but may not preserve the order of
1195+
/// equal elements.
1196+
///
1197+
/// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
1198+
/// and `O(n log n)` worst-case.
1199+
///
1200+
/// # Current implementation
1201+
///
1202+
/// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
1203+
/// which is a quicksort variant designed to be very fast on certain kinds of patterns,
1204+
/// sometimes achieving linear time. It is randomized but deterministic, and falls back to
1205+
/// heapsort on degenerate inputs.
1206+
///
1207+
/// It is generally faster than stable sorting, except in a few special cases, e.g. when the
1208+
/// slice consists of several concatenated sorted sequences.
1209+
///
1210+
/// # Examples
1211+
///
1212+
/// ```
1213+
/// #![feature(sort_unstable)]
1214+
///
11421215
/// let mut v = [5, 4, 1, 3, 2];
1143-
/// v.sort_by(|a, b| a.cmp(b));
1216+
/// v.sort_unstable_by(|a, b| a.cmp(b));
11441217
/// assert!(v == [1, 2, 3, 4, 5]);
11451218
///
11461219
/// // reverse sorting
1147-
/// v.sort_by(|a, b| b.cmp(a));
1220+
/// v.sort_unstable_by(|a, b| b.cmp(a));
11481221
/// assert!(v == [5, 4, 3, 2, 1]);
11491222
/// ```
1150-
#[stable(feature = "rust1", since = "1.0.0")]
1223+
///
1224+
/// [pdqsort]: https://github.com/orlp/pdqsort
1225+
// FIXME #40585: Mention `sort_unstable_by` in the documentation for `sort_by`.
1226+
#[unstable(feature = "sort_unstable", issue = "40585")]
11511227
#[inline]
1152-
pub fn sort_by<F>(&mut self, mut compare: F)
1228+
pub fn sort_unstable_by<F>(&mut self, compare: F)
11531229
where F: FnMut(&T, &T) -> Ordering
11541230
{
1155-
merge_sort(self, |a, b| compare(a, b) == Less);
1231+
core_slice::SliceExt::sort_unstable_by(self, compare);
1232+
}
1233+
1234+
/// Sorts the slice using `f` to extract a key to compare elements by, but may not preserve the
1235+
/// order of equal elements.
1236+
///
1237+
/// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
1238+
/// and `O(n log n)` worst-case.
1239+
///
1240+
/// # Current implementation
1241+
///
1242+
/// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
1243+
/// which is a quicksort variant designed to be very fast on certain kinds of patterns,
1244+
/// sometimes achieving linear time. It is randomized but deterministic, and falls back to
1245+
/// heapsort on degenerate inputs.
1246+
///
1247+
/// It is generally faster than stable sorting, except in a few special cases, e.g. when the
1248+
/// slice consists of several concatenated sorted sequences.
1249+
///
1250+
/// # Examples
1251+
///
1252+
/// ```
1253+
/// #![feature(sort_unstable)]
1254+
///
1255+
/// let mut v = [-5i32, 4, 1, -3, 2];
1256+
///
1257+
/// v.sort_unstable_by_key(|k| k.abs());
1258+
/// assert!(v == [1, 2, -3, 4, -5]);
1259+
/// ```
1260+
///
1261+
/// [pdqsort]: https://github.com/orlp/pdqsort
1262+
// FIXME #40585: Mention `sort_unstable_by_key` in the documentation for `sort_by_key`.
1263+
#[unstable(feature = "sort_unstable", issue = "40585")]
1264+
#[inline]
1265+
pub fn sort_unstable_by_key<B, F>(&mut self, f: F)
1266+
where F: FnMut(&T) -> B,
1267+
B: Ord
1268+
{
1269+
core_slice::SliceExt::sort_unstable_by_key(self, f);
11561270
}
11571271

11581272
/// Copies the elements from `src` into `self`.
@@ -1553,28 +1667,20 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
15531667
fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
15541668
where F: FnMut(&T, &T) -> bool
15551669
{
1670+
// Slices of up to this length get sorted using insertion sort.
1671+
const MAX_INSERTION: usize = 20;
1672+
// Very short runs are extended using insertion sort to span at least this many elements.
1673+
const MIN_RUN: usize = 10;
1674+
15561675
// Sorting has no meaningful behavior on zero-sized types.
15571676
if size_of::<T>() == 0 {
15581677
return;
15591678
}
15601679

1561-
// FIXME #12092: These numbers are platform-specific and need more extensive testing/tuning.
1562-
//
1563-
// If `v` has length up to `max_insertion`, simply switch to insertion sort because it is going
1564-
// to perform better than merge sort. For bigger types `T`, the threshold is smaller.
1565-
//
1566-
// Short runs are extended using insertion sort to span at least `min_run` elements, in order
1567-
// to improve performance.
1568-
let (max_insertion, min_run) = if size_of::<T>() <= 2 * mem::size_of::<usize>() {
1569-
(64, 32)
1570-
} else {
1571-
(32, 16)
1572-
};
1573-
15741680
let len = v.len();
15751681

15761682
// Short arrays get sorted in-place via insertion sort to avoid allocations.
1577-
if len <= max_insertion {
1683+
if len <= MAX_INSERTION {
15781684
if len >= 2 {
15791685
for i in (0..len-1).rev() {
15801686
insert_head(&mut v[i..], &mut is_less);
@@ -1618,7 +1724,7 @@ fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
16181724

16191725
// Insert some more elements into the run if it's too short. Insertion sort is faster than
16201726
// merge sort on short sequences, so this significantly improves performance.
1621-
while start > 0 && end - start < min_run {
1727+
while start > 0 && end - start < MIN_RUN {
16221728
start -= 1;
16231729
insert_head(&mut v[start..end], &mut is_less);
16241730
}

src/libcollectionstest/slice.rs

+4-10
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,10 @@ fn test_sort() {
399399
}
400400
}
401401

402-
// shouldn't panic
403-
let mut v: [i32; 0] = [];
404-
v.sort();
402+
// Should not panic.
403+
[0i32; 0].sort();
404+
[(); 10].sort();
405+
[(); 100].sort();
405406

406407
let mut v = [0xDEADBEEFu64];
407408
v.sort();
@@ -441,13 +442,6 @@ fn test_sort_stability() {
441442
}
442443
}
443444

444-
#[test]
445-
fn test_sort_zero_sized_type() {
446-
// Should not panic.
447-
[(); 10].sort();
448-
[(); 100].sort();
449-
}
450-
451445
#[test]
452446
fn test_concat() {
453447
let v: [Vec<i32>; 0] = [];

0 commit comments

Comments
 (0)