diff --git a/README.md b/README.md index 294e503..39d7980 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,12 @@ The `SortingAlgorithms` package provides three sorting algorithms that can be us - [HeapSort] – an unstable, general purpose, in-place, O(n log n) comparison sort that works by heapifying an array and repeatedly taking the maximal element from the heap. - [TimSort] – a stable, general purpose, hybrid, O(n log n) comparison sort that adapts to different common patterns of partially ordered input data. - [CombSort] – an unstable, general purpose, in-place, O(n log n) comparison sort with O(n^2) pathological cases that can attain good efficiency through SIMD instructions and instruction level parallelism on modern hardware. +- [PagedMergeSort] – a stable, general purpose, O(n log n) time and O(sqrt n) space comparison sort. [HeapSort]: https://en.wikipedia.org/wiki/Heapsort [TimSort]: https://en.wikipedia.org/wiki/Timsort [CombSort]: https://en.wikipedia.org/wiki/Comb_sort +[PagedMergeSort]: https://link.springer.com/chapter/10.1007/BFb0016253 ## Usage diff --git a/docs/pagedMerge_130_130.gif b/docs/pagedMerge_130_130.gif new file mode 100644 index 0000000..6d8d310 Binary files /dev/null and b/docs/pagedMerge_130_130.gif differ diff --git a/src/SortingAlgorithms.jl b/src/SortingAlgorithms.jl index 6b08cd1..f6b67e1 100644 --- a/src/SortingAlgorithms.jl +++ b/src/SortingAlgorithms.jl @@ -9,12 +9,13 @@ using Base.Order import Base.Sort: sort! import DataStructures: heapify!, percolate_down! -export HeapSort, TimSort, RadixSort, CombSort +export HeapSort, TimSort, RadixSort, CombSort, PagedMergeSort struct HeapSortAlg <: Algorithm end struct TimSortAlg <: Algorithm end struct RadixSortAlg <: Algorithm end struct CombSortAlg <: Algorithm end +struct PagedMergeSortAlg <: Algorithm end function maybe_optimize(x::Algorithm) isdefined(Base.Sort, :InitialOptimizations) ? Base.Sort.InitialOptimizations(x) : x @@ -51,6 +52,34 @@ Characteristics: """ const CombSort = maybe_optimize(CombSortAlg()) +""" + PagedMergeSort + +Indicates that a sorting function should use the paged merge sort +algorithm. Paged merge sort uses is a merge sort, that uses different +merge routines to achieve stable sorting with a scratch space of size O(√n). +The merge routine for merging large subarrays merges +pages of size O(√n) almost in place, before reordering them using a page table. +At deeper recursion levels, where the scratch space is big enough, +normal merging is used, where one input is copied into the scratch space. +When the scratch space is large enough to hold the complete subarray, +the input is merged interleaved from both sides, which increases performance +for random data. + +Characteristics: + - *stable*: does preserve the ordering of elements which + compare equal (e.g. "a" and "A" in a sort of letters which + ignores case). + - *`O(√n)`* auxilary memory usage. + - *`O(n log n)`* garuanteed runtime. + +## References + - Dvořák, S., Ďurian, B. (1986). Towards an efficient merging. In: Gruska, J., Rovan, B., Wiedermann, + J. (eds) Mathematical Foundations of Computer Science 1986. MFCS 1986. Lecture Notes in Computer Science, vol 233. + Springer, Berlin, Heidelberg. https://doi.org/10.1007/BFb0016253 + - https://max-arbuzov.blogspot.com/2021/10/merge-sort-with-osqrtn-auxiliary-memory.html +""" +const PagedMergeSort = maybe_optimize(PagedMergeSortAlg()) ## Heap sort @@ -652,4 +681,252 @@ else end end +### +# PagedMergeSort +### + +# unsafe version of copyto! +# as workaround for https://github.com/JuliaLang/julia/issues/50900 +function _unsafe_copyto!(dest, doffs, src, soffs, n) + @inbounds for i in 0:n-1 + dest[doffs + i] = src[soffs + i] + end + dest +end + +function _unsafe_copyto!(dest::Array, doffs, src::Array, soffs, n) + unsafe_copyto!(dest, doffs, src, soffs, n) +end + +# merge v[lo:m] and v[m+1:hi] ([A;B]) using scratch[1:1+hi-lo] +# This is faster than merge! but requires twice as much auxiliary memory. +function twoended_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where T + @assert lo ≤ m ≤ hi + @assert abs((m-lo) - (hi-(m+1))) ≤ 1 "twoended_merge! only supports balanced merges" + len = 1 + hi - lo + # input array indices + a_lo = lo + a_hi = m + b_lo = m + 1 + b_hi = hi + # output array indices + k_lo = 1 + k_hi = len + @inbounds begin + # two ended merge + while k_lo <= len ÷ 2 + if lt(o, v[b_lo], v[a_lo]) + scratch[k_lo] = v[b_lo] + b_lo += 1 + else + scratch[k_lo] = v[a_lo] + a_lo += 1 + end + k_lo +=1 + if !lt(o, v[b_hi], v[a_hi]) + scratch[k_hi] = v[b_hi] + b_hi -= 1 + else + scratch[k_hi] = v[a_hi] + a_hi -= 1 + end + k_hi -=1 + end + # if the input length is odd, + # one item remains + if a_lo <= a_hi + scratch[k_lo] = v[a_lo] + elseif b_lo <= b_hi + scratch[k_lo] = v[b_lo] + end + # copy back from t to v + offset = lo-1 + for i = 1:len + v[offset+i] = scratch[i] + end + end +end + +# core merging loop used throughout PagedMergeSort +Base.@propagate_inbounds function merge!(f::Function, + target::AbstractVector{T}, source_a::AbstractVector{T}, source_b::AbstractVector{T}, + o::Ordering, a::Integer, b::Integer, k::Integer) where T + @inbounds while f(a,b,k) + if lt(o, source_b[b], source_a[a]) + target[k] = source_b[b] + b += 1 + else + target[k] = source_a[a] + a += 1 + end + k += 1 + end + a,b,k +end + +# merge v[lo:m] and v[m+1:hi] using scratch[1:1+m-lo] +# based on Base.Sort MergeSort +function merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where {T} + _unsafe_copyto!(scratch, 1, v, lo, m - lo + 1) + f(_, b, k) = k < b <= hi + a, b, k = merge!(f, v, scratch, v, o, 1, m + 1, lo) + _unsafe_copyto!(v, k, scratch, a, b - k) +end + +struct Pages + current::Int # current page being merged into + currentNumber::Int # number of current page (=index in pageLocations) + nextA::Int # next possible page in A + nextB::Int # next possible page in B +end + +next_page_A(pages::Pages) = Pages(pages.nextA, pages.currentNumber + 1, pages.nextA + 1, pages.nextB) +next_page_B(pages::Pages) = Pages(pages.nextB, pages.currentNumber + 1, pages.nextA, pages.nextB + 1) + +Base.@propagate_inbounds function next_page!(pageLocations, pages, pagesize, lo, a) + if a > pages.nextA * pagesize + lo + pages = next_page_A(pages) + else + pages = next_page_B(pages) + end + pageLocations[pages.currentNumber] = pages.current + pages +end + +Base.@propagate_inbounds function permute_pages!(f, v, pageLocations, page_offset, pagesize, page) + while f(page) + plc = pageLocations[page-3] # plc has data belonging to page + pageLocations[page-3] = page + _unsafe_copyto!(v, page_offset(page) + 1, v, page_offset(plc) + 1, pagesize) + page = plc + end + page +end + +# merge v[lo:m] (A) and v[m+1:hi] (B) using scratch[] in O(sqrt(n)) space +function paged_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}, pageLocations::AbstractVector{<:Integer}) where {T} + @assert lo < m < hi + lenA = 1 + m - lo + lenB = hi - m + + # this function only supports merges with length(A) <= length(B), + # which is guaranteed by pagedmergesort! + @assert lenA <= lenB + + # regular merge if scratch is big enough + lenA <= length(scratch) && return merge!(v, lo, m, hi, o, scratch) + + len = lenA + lenB + pagesize = isqrt(len) + nPages = len ÷ pagesize # a partial page at the end does not count + @assert length(scratch) >= 3pagesize + @assert length(pageLocations) >= nPages - 3 + + @inline page_offset(page) = (page - 1) * pagesize + lo - 1 + + @inbounds begin + ################## + # merge + ################## + # merge the first 3 pages into scratch + a, b, _ = merge!((_, _, k) -> k <= 3pagesize, scratch, v, v, o, lo, m + 1, 1) + # initialize variables for merging into pages + pages = Pages(-17, 0, 1, (m - lo) ÷ pagesize + 2) # first argument is unused + # more efficient loop while more than pagesize elements of A and B are remaining + while_condition1(offset) = (_, _, k) -> k <= offset + pagesize + while a < m - pagesize && b < hi - pagesize + pages = next_page!(pageLocations, pages, pagesize, lo, a) + offset = page_offset(pages.current) + a, b, _ = merge!(while_condition1(offset), v, v, v, o, a, b, offset + 1) + end + # merge until either A or B is empty or the last page is reached + k, offset = nothing, nothing + while_condition2(offset) = (a, b, k) -> k <= offset + pagesize && a <= m && b <= hi + while a <= m && b <= hi && pages.currentNumber + 3 < nPages + pages = next_page!(pageLocations, pages, pagesize, lo, a) + offset = page_offset(pages.current) + a, b, k = merge!(while_condition2(offset), v, v, v, o, a, b, offset + 1) + end + # if the last page is reached, merge the remaining elements into the final partial page + if pages.currentNumber + 3 == nPages && a <= m && b <= hi + a, b, k = merge!((a, b, _) -> a <= m && b <= hi, v, v, v, o, a, b, nPages * pagesize + lo) + _unsafe_copyto!(v, k, v, a <= m ? a : b, hi - k + 1) + else + use_a = a <= m + # copy the incomplete page + partial_page_size = offset + pagesize - k + 1 + _unsafe_copyto!(v, k, v, use_a ? a : b, partial_page_size) + use_a && (a += partial_page_size) + use_a || (b += partial_page_size) + # copy the remaining full pages + while use_a ? a <= m - pagesize + 1 : b <= hi - pagesize + 1 + pages = next_page!(pageLocations, pages, pagesize, lo, a) + offset = page_offset(pages.current) + _unsafe_copyto!(v, offset + 1, v, use_a ? a : b, pagesize) + use_a && (a += pagesize) + use_a || (b += pagesize) + end + # copy the final partial page only if sourcing from A. + # If sourcing from B, it is already in place. + use_a && _unsafe_copyto!(v, hi - m + a, v, a, m - a + 1) + end + + ################## + # rearrange pages + ################## + # copy pages belonging to the 3 permutation chains ending with a page in the scratch space + nextA, nextB = pages.nextA, pages.nextB + + for _ in 1:3 + page = (nextB > nPages ? (nextA += 1) : (nextB += 1)) - 1 + page = permute_pages!(>(3), v, pageLocations, page_offset, pagesize, page) + _unsafe_copyto!(v, page_offset(page) + 1, scratch, (page - 1) * pagesize + 1, pagesize) + end + + # copy remaining permutation cycles + for donePageIndex = 5:nPages + # linear scan through pageLocations to make sure no cycle is missed + page = pageLocations[donePageIndex-3] + page == donePageIndex && continue + + # copy the data belonging to donePageIndex into scratch + _unsafe_copyto!(scratch, 1, v, page_offset(page) + 1, pagesize) + + # follow the cycle starting with the newly freed page + permute_pages!(!=(donePageIndex), v, pageLocations, page_offset, pagesize, page) + _unsafe_copyto!(v, page_offset(donePageIndex) + 1, scratch, 1, pagesize) + end + end +end + +# midpoint was added to Base.sort in version 1.4 and later moved to Base +# -> redefine for compatibility with earlier versions +midpoint(lo::Integer, hi::Integer) = lo + ((hi - lo) >>> 0x01) + +function pagedmergesort!(v::AbstractVector{T}, lo::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}, pageLocations) where {T} + len = hi + 1 - lo + if len <= Base.SMALL_THRESHOLD + return Base.Sort.sort!(v, lo, hi, Base.Sort.InsertionSortAlg(), o) + end + m = midpoint(lo, hi - 1) # hi-1: ensure midpoint is rounded down. OK, because lo < hi is satisfied here + pagedmergesort!(v, lo, m, o, scratch, pageLocations) + pagedmergesort!(v, m + 1, hi, o, scratch, pageLocations) + if len <= length(scratch) + twoended_merge!(v, lo, m, hi, o, scratch) + else + paged_merge!(v, lo, m, hi, o, scratch, pageLocations) + end + return v +end + +function sort!(v::AbstractVector, lo::Integer, hi::Integer, ::PagedMergeSortAlg, o::Ordering) + lo >= hi && return v + n = hi + 1 - lo + pagesize = isqrt(n) + scratch = Vector{eltype(v)}(undef, 3pagesize) + nPages = n ÷ pagesize + pageLocations = Vector{Int}(undef, max(0, nPages - 3)) + pagedmergesort!(v, lo, hi, o, scratch, pageLocations) + return v +end end # module diff --git a/test/runtests.jl b/test/runtests.jl index f8d9970..d5b7935 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,10 +3,13 @@ using Test using StatsBase using Random +stable_algorithms = [TimSort, RadixSort, PagedMergeSort] +unstable_algorithms = [HeapSort, CombSort] + a = rand(1:10000, 1000) am = [rand() < .9 ? i : missing for i in a] -for alg in [TimSort, HeapSort, RadixSort, CombSort, SortingAlgorithms.TimSortAlg()] +for alg in [stable_algorithms; unstable_algorithms; SortingAlgorithms.TimSortAlg()] b = sort(a, alg=alg) @test issorted(b) ix = sortperm(a, alg=alg) @@ -94,8 +97,7 @@ for n in [0:10..., 100, 101, 1000, 1001] invpermute!(c, pi) @test c == v - # stable algorithms - for alg in [TimSort, RadixSort] + for alg in stable_algorithms p = sortperm(v, alg=alg, order=ord) @test p == pi s = copy(v) @@ -105,8 +107,7 @@ for n in [0:10..., 100, 101, 1000, 1001] @test s == v end - # unstable algorithms - for alg in [HeapSort, CombSort] + for alg in unstable_algorithms p = sortperm(v, alg=alg, order=ord) @test isperm(p) @test v[p] == si @@ -120,7 +121,7 @@ for n in [0:10..., 100, 101, 1000, 1001] v = randn_with_nans(n,0.1) for ord in [Base.Order.Forward, Base.Order.Reverse], - alg in [TimSort, HeapSort, RadixSort, CombSort] + alg in [stable_algorithms; unstable_algorithms] # test float sorting with NaNs s = sort(v, alg=alg, order=ord) @test issorted(s, order=ord) @@ -138,3 +139,22 @@ for n in [0:10..., 100, 101, 1000, 1001] @test reinterpret(UInt64,vp) == reinterpret(UInt64,s) end end + +for T in (Float64, Int, UInt8) + for alg in stable_algorithms + for ord in [Base.Order.By(identity), Base.Order.By(_ -> 0), Base.Order.By(Base.Fix2(÷, 100))] + for n in vcat(0:31, 40:11:100, 110:51:1000) + v = rand(T, n) + # use MergeSort to guarantee stable sorting in Julia 1.0 + @test sort(v, alg=alg, order=ord) == sort(v, alg=MergeSort, order=ord) + end + end + end +end + +# PagedMergeSort with small input without InitialOptimizations +# (https://github.com/JuliaCollections/SortingAlgorithms.jl/pull/71#discussion_r1292774352) +if isdefined(Base.Sort, :InitialOptimizations) + v = [0,1] + @test sort(v, alg=SortingAlgorithms.PagedMergeSortAlg()) == sort(v, alg=MergeSort) +end