From b7178eec189258ef5b1a6c455551087d450c97da Mon Sep 17 00:00:00 2001 From: Thomas ten Cate Date: Mon, 20 Jan 2025 21:16:54 +0100 Subject: [PATCH] =?UTF-8?q?Speed=20up=20creating=20and=20extending=20packe?= =?UTF-8?q?d=20arrays=20from=20iterators=20up=20to=2063=C3=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses the iterator size hint to pre-allocate, which leads to 63× speedup in the best case. If the hint is pessimistic, it reads into a buffer to avoid repeated push() calls, which is still 12× as fast as the previous implementation. --- .../src/builtin/collections/extend_buffer.rs | 122 +++++++++++++++ godot-core/src/builtin/collections/mod.rs | 1 + .../src/builtin/collections/packed_array.rs | 139 ++++++++++++++++-- itest/rust/src/benchmarks/mod.rs | 22 ++- .../containers/packed_array_test.rs | 47 +++++- 5 files changed, 313 insertions(+), 18 deletions(-) create mode 100644 godot-core/src/builtin/collections/extend_buffer.rs diff --git a/godot-core/src/builtin/collections/extend_buffer.rs b/godot-core/src/builtin/collections/extend_buffer.rs new file mode 100644 index 000000000..5fa79891d --- /dev/null +++ b/godot-core/src/builtin/collections/extend_buffer.rs @@ -0,0 +1,122 @@ +/* + * Copyright (c) godot-rust; Bromeon and contributors. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +use std::mem::MaybeUninit; +use std::ptr; + +/// A fixed-size buffer that does not do any allocations, and can hold up to `N` elements of type `T`. +/// +/// This is used to implement `Packed*Array::extend()` in an efficient way, because it forms a middle ground between +/// repeated `push()` calls (slow) and first collecting the entire `Iterator` into a `Vec` (faster, but takes more memory). +/// +/// Note that `N` must not be 0 for the buffer to be useful. This is checked at compile time. +pub struct ExtendBuffer { + buf: [MaybeUninit; N], + len: usize, +} + +impl Default for ExtendBuffer { + fn default() -> Self { + Self { + buf: [const { MaybeUninit::uninit() }; N], + len: 0, + } + } +} + +impl ExtendBuffer { + /// Appends the given value to the buffer. + /// + /// # Panics + /// If the buffer is full. + pub fn push(&mut self, value: T) { + self.buf[self.len].write(value); + self.len += 1; + } + + /// Returns `true` iff the buffer is full. + pub fn is_full(&self) -> bool { + self.len == N + } + + /// Returns a slice of all initialized elements in the buffer, and sets the length of the buffer back to 0. + /// + /// It is the caller's responsibility to ensure that all elements in the returned slice get dropped! + pub fn drain_as_mut_slice(&mut self) -> &mut [T] { + // Prevent panic in self.buf[0] below. + if N == 0 { + return &mut []; + } + debug_assert!(self.len <= N); + + let len = self.len; + self.len = 0; + + // MaybeUninit::slice_assume_init_ref could be used here instead, but it's experimental. + // + // SAFETY: + // - The pointer is non-null, valid and aligned. + // - `len` elements are always initialized. + // - The memory is not accessed through any other pointer, because we hold a `&mut` reference to `self`. + // - `len * mem::size_of::()` is no larger than `isize::MAX`, otherwise the `buf` slice could not have existed either. + unsafe { std::slice::from_raw_parts_mut(self.buf[0].as_mut_ptr(), len) } + } +} + +impl Drop for ExtendBuffer { + fn drop(&mut self) { + // Prevent panic in self.buf[0] below. + if N == 0 { + return; + } + debug_assert!(self.len <= N); + + // SAFETY: `slice_from_raw_parts_mut` by itself is not unsafe, but to make the resulting slice safe to use: + // - `self.buf[0]` is a valid pointer, exactly `self.len` elements are initialized. + // - The pointer is not aliased since we have an exclusive `&mut self`. + let slice = ptr::slice_from_raw_parts_mut(self.buf[0].as_mut_ptr(), self.len); + + // SAFETY: the value is valid because the `slice_from_raw_parts_mut` requirements are met, + // and there is no other way to access the value. + unsafe { + ptr::drop_in_place(slice); + } + } +} + +#[test] +fn test_extend_buffer_drop() { + // We use an `Rc` to test the buffer's `drop` behavior. + use std::rc::Rc; + + let mut buf = ExtendBuffer::, 1>::default(); + let value = Rc::new(42); + buf.push(Rc::clone(&value)); + + // The buffer contains one strong reference, this function contains another. + assert_eq!(Rc::strong_count(&value), 2); + + let slice = buf.drain_as_mut_slice(); + + // The strong reference has been returned in the slice, but not dropped. + assert_eq!(Rc::strong_count(&value), 2); + + // SAFETY: + // - The slice returned by `drain_as_mut_slice` is valid, and therefore so is its first element. + // - There is no way to access parts of `slice[0]` while `drop_in_place` is executing. + unsafe { + ptr::drop_in_place(&mut slice[0]); + } + + // The reference held by the slice has now been dropped. + assert_eq!(Rc::strong_count(&value), 1); + + drop(buf); + + // The buffer has not dropped another reference. + assert_eq!(Rc::strong_count(&value), 1); +} diff --git a/godot-core/src/builtin/collections/mod.rs b/godot-core/src/builtin/collections/mod.rs index 2a60ac49a..a98bcb879 100644 --- a/godot-core/src/builtin/collections/mod.rs +++ b/godot-core/src/builtin/collections/mod.rs @@ -7,6 +7,7 @@ mod array; mod dictionary; +mod extend_buffer; mod packed_array; // Re-export in godot::builtin. diff --git a/godot-core/src/builtin/collections/packed_array.rs b/godot-core/src/builtin/collections/packed_array.rs index 87a2338bf..a52a34cd8 100644 --- a/godot-core/src/builtin/collections/packed_array.rs +++ b/godot-core/src/builtin/collections/packed_array.rs @@ -11,8 +11,10 @@ use godot_ffi as sys; +use crate::builtin::collections::extend_buffer::ExtendBuffer; use crate::builtin::*; use crate::meta::{AsArg, ToGodot}; +use std::mem::size_of; use std::{fmt, ops, ptr}; use sys::types::*; use sys::{ffi_methods, interface_fn, GodotFfi}; @@ -380,17 +382,18 @@ macro_rules! impl_packed_array { array } - /// Drops all elements in `self` and replaces them with data from an array of values. + /// Drops all elements in `self` starting from `dst` and replaces them with data from an array of values. + /// `dst` must be a valid index, even if `len` is zero. /// /// # Safety /// - /// * Pointer must be valid slice of data with `len` size. - /// * Pointer must not point to `self` data. - /// * Length must be equal to `self.len()`. + /// * `src` must be valid slice of data with `len` size. + /// * `src` must not point to `self` data. + /// * `len` must be equal to `self.len() - dst`. /// * Source data must not be dropped later. - unsafe fn move_from_slice(&mut self, src: *const $Element, len: usize) { - let ptr = self.ptr_mut(0); - debug_assert_eq!(len, self.len(), "length precondition violated"); + unsafe fn move_from_slice(&mut self, src: *const $Element, dst: usize, len: usize) { + let ptr = self.ptr_mut(dst); + debug_assert_eq!(len, self.len() - dst, "length precondition violated"); // Drops all elements in place. Drop impl must not panic. ptr::drop_in_place(ptr::slice_from_raw_parts_mut(ptr, len)); // Copy is okay since all elements are dropped. @@ -457,7 +460,7 @@ macro_rules! impl_packed_array { // SAFETY: The packed array contains exactly N elements and the source array will be forgotten. unsafe { - packed_array.move_from_slice(arr.as_ptr(), N); + packed_array.move_from_slice(arr.as_ptr(), 0, N); } packed_array } @@ -476,13 +479,21 @@ macro_rules! impl_packed_array { // The vector is forcibly set to empty, so its contents are forgotten. unsafe { vec.set_len(0); - array.move_from_slice(vec.as_ptr(), len); + array.move_from_slice(vec.as_ptr(), 0, len); } array } } #[doc = concat!("Creates a `", stringify!($PackedArray), "` from an iterator.")] + /// + /// # Performance note + /// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns + /// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding + /// them all efficiently as a batch. + /// + /// # Panics + /// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol). impl FromIterator<$Element> for $PackedArray { fn from_iter>(iter: I) -> Self { let mut array = $PackedArray::default(); @@ -491,16 +502,103 @@ macro_rules! impl_packed_array { } } - #[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator")] + #[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator.")] + /// + /// # Performance note + /// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns + /// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding + /// them all efficiently as a batch. + /// + /// # Panics + /// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol). impl Extend<$Element> for $PackedArray { fn extend>(&mut self, iter: I) { - // Unfortunately the GDExtension API does not offer the equivalent of `Vec::reserve`. - // Otherwise we could use it to pre-allocate based on `iter.size_hint()`. + // This function is complicated, but with good reason. The problem is that we don't know the length of + // the `Iterator` ahead of time; all we get is its `size_hint()`. + // + // There are at least two categories of iterators that are common in the wild, for which we'd want good performance: + // + // 1. The length is known: `size_hint()` returns the exact size, e.g. just iterating over a `Vec` or `BTreeSet`. + // 2. The length is unknown: `size_hint()` returns 0, e.g. `Filter`, `FlatMap`, `FromFn`. + // + // A number of implementations are possible, which were benchmarked for 1000 elements of type `i32`: + // + // - Simply call `push()` in a loop: + // 6.1 µs whether or not the length is known. + // - First `collect()` the `Iterator` into a `Vec`, call `self.resize()` to make room, then move out of the `Vec`: + // 0.78 µs if the length is known, 1.62 µs if the length is unknown. + // It also requires additional temporary memory to hold all elements. + // - The strategy implemented below: + // 0.097 µs if the length is known, 0.49 µs if the length is unknown. // - // A faster implementation using `resize()` and direct pointer writes might still be - // possible. - for item in iter.into_iter() { - self.push(meta::ParamType::owned_to_arg(item)); + // The implementation of `Vec` in the standard library deals with this by repeatedly `reserve()`ing + // whatever `size_hint()` returned, but we don't want to do that because the Godot API call to + // `self.resize()` is relatively slow. + + let mut iter = iter.into_iter(); + // Cache the length to avoid repeated Godot API calls. + let mut len = self.len(); + + // Fast part. + // + // Use `Iterator::size_hint()` to pre-allocate the minimum number of elements in the iterator, then + // write directly to the resulting slice. We can do this because `size_hint()` is required by the + // `Iterator` contract to return correct bounds. Note that any bugs in it must not result in UB. + let (size_hint_min, _size_hint_max) = iter.size_hint(); + if size_hint_min > 0 { + let capacity = len + size_hint_min; + self.resize(capacity); + for out_ref in &mut self.as_mut_slice()[len..] { + *out_ref = iter.next().expect("iterator returned fewer than size_hint().0 elements"); + } + len = capacity; + } + + // Slower part. + // + // While the iterator is still not finished, gather elements into a fixed-size buffer, then add them all + // at once. + // + // Why not call `self.resize()` with fixed-size increments, like 32 elements at a time? Well, we might + // end up over-allocating, and then need to trim the array length back at the end. Because Godot + // allocates memory in steps of powers of two, this might end up with an array backing storage that is + // twice as large as it needs to be. By first gathering elements into a buffer, we can tell Godot to + // allocate exactly as much as we need, and no more. + // + // Note that we can't get by with simple memcpys, because `PackedStringArray` contains `GString`, which + // does not implement `Copy`. + // + // Buffer size: 2 kB is enough for the performance win, without needlessly blowing up the stack size. + // (A cursory check shows that most/all platforms use a stack size of at least 1 MB.) + const BUFFER_SIZE_BYTES: usize = 2048; + const BUFFER_CAPACITY: usize = const_max( + 1, + BUFFER_SIZE_BYTES / size_of::<$Element>(), + ); + let mut buf = ExtendBuffer::<_, BUFFER_CAPACITY>::default(); + while let Some(item) = iter.next() { + buf.push(item); + while !buf.is_full() { + if let Some(item) = iter.next() { + buf.push(item); + } else { + break; + } + } + + let buf_slice = buf.drain_as_mut_slice(); + let capacity = len + buf_slice.len(); + + // Assumption: resize does not panic. Otherwise we would leak memory here. + self.resize(capacity); + + // SAFETY: Dropping the first `buf_slice.len()` items is safe, because those are exactly the ones we initialized. + // Writing output is safe because we just allocated `buf_slice.len()` new elements after index `len`. + unsafe { + self.move_from_slice(buf_slice.as_ptr(), len, buf_slice.len()); + } + + len = capacity; } } } @@ -1071,3 +1169,12 @@ fn populated_or_err(array: PackedByteArray) -> Result { Ok(array) } } + +/// Helper because `usize::max()` is not const. +const fn const_max(a: usize, b: usize) -> usize { + if a > b { + a + } else { + b + } +} diff --git a/itest/rust/src/benchmarks/mod.rs b/itest/rust/src/benchmarks/mod.rs index 4ca5217d5..f73185633 100644 --- a/itest/rust/src/benchmarks/mod.rs +++ b/itest/rust/src/benchmarks/mod.rs @@ -10,7 +10,7 @@ use std::hint::black_box; use godot::builtin::inner::InnerRect2i; -use godot::builtin::{GString, Rect2i, StringName, Vector2i}; +use godot::builtin::{GString, PackedInt32Array, Rect2i, StringName, Vector2i}; use godot::classes::{Node3D, Os, RefCounted}; use godot::obj::{Gd, InstanceId, NewAlloc, NewGd}; use godot::register::GodotClass; @@ -93,6 +93,26 @@ fn utilities_ffi_call() -> f64 { godot::global::pow(base, exponent) } +#[bench(repeat = 25)] +fn packed_array_from_iter_known_size() -> PackedInt32Array { + // Create an iterator whose `size_hint()` returns `(len, Some(len))`. + PackedInt32Array::from_iter(0..100) +} + +#[bench(repeat = 25)] +fn packed_array_from_iter_unknown_size() -> PackedInt32Array { + // Create an iterator whose `size_hint()` returns `(0, None)`. + let mut item = 0; + PackedInt32Array::from_iter(std::iter::from_fn(|| { + item += 1; + if item <= 100 { + Some(item) + } else { + None + } + })) +} + // ---------------------------------------------------------------------------------------------------------------------------------------------- // Helpers for benchmarks above diff --git a/itest/rust/src/builtin_tests/containers/packed_array_test.rs b/itest/rust/src/builtin_tests/containers/packed_array_test.rs index 2c14f191e..0b9a03c09 100644 --- a/itest/rust/src/builtin_tests/containers/packed_array_test.rs +++ b/itest/rust/src/builtin_tests/containers/packed_array_test.rs @@ -276,8 +276,53 @@ fn packed_array_insert() { assert_eq!(array.to_vec(), vec![3, 1, 2, 4]); } +fn test_extend(make_iter: F) +where + F: Fn(i32) -> I, + I: Iterator, +{ + // The logic in `extend()` is not trivial, so we test it for a wide range of sizes: powers of two, also plus and minus one. + // This includes zero. We go up to 2^12, which is 4096, because the internal buffer is currently 2048 bytes (512 `i32`s) + // and we want to be future-proof in case it's ever enlarged. + let lengths = (0..12i32) + .flat_map(|i| { + let b = 1 << i; + [b - 1, b, b + 1] + }) + .collect::>(); + + for &len_a in &lengths { + for &len_b in &lengths { + let iter = make_iter(len_b); + let mut array = PackedInt32Array::from_iter(0..len_a); + array.extend(iter); + let expected = (0..len_a).chain(0..len_b).collect::>(); + assert_eq!(array.to_vec(), expected, "len_a = {len_a}, len_b = {len_b}",); + } + } +} + +#[itest] +fn packed_array_extend_known_size() { + // Create an iterator whose `size_hint()` returns `(len, Some(len))`. + test_extend(|len| 0..len); +} + +#[itest] +fn packed_array_extend_unknown_size() { + // Create an iterator whose `size_hint()` returns `(0, None)`. + test_extend(|len| { + let mut item = 0; + std::iter::from_fn(move || { + let result = if item < len { Some(item) } else { None }; + item += 1; + result + }) + }); +} + #[itest] -fn packed_array_extend() { +fn packed_array_extend_array() { let mut array = PackedByteArray::from(&[1, 2]); let other = PackedByteArray::from(&[3, 4]); array.extend_array(&other);