Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up creating and extending packed arrays from iterators up to 63× #1023

Merged
merged 1 commit into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions godot-core/src/builtin/collections/extend_buffer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Copyright (c) godot-rust; Bromeon and contributors.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

use std::mem::MaybeUninit;
use std::ptr;

/// A fixed-size buffer that does not do any allocations, and can hold up to `N` elements of type `T`.
///
/// This is used to implement `Packed*Array::extend()` in an efficient way, because it forms a middle ground between
/// repeated `push()` calls (slow) and first collecting the entire `Iterator` into a `Vec` (faster, but takes more memory).
///
/// Note that `N` must not be 0 for the buffer to be useful. This is checked at compile time.
pub struct ExtendBuffer<T, const N: usize> {
buf: [MaybeUninit<T>; N],
len: usize,
}

impl<T, const N: usize> Default for ExtendBuffer<T, N> {
fn default() -> Self {
Self {
buf: [const { MaybeUninit::uninit() }; N],
len: 0,
}
}
}

impl<T, const N: usize> ExtendBuffer<T, N> {
/// Appends the given value to the buffer.
///
/// # Panics
/// If the buffer is full.
pub fn push(&mut self, value: T) {
self.buf[self.len].write(value);
self.len += 1;
}

/// Returns `true` iff the buffer is full.
pub fn is_full(&self) -> bool {
self.len == N
}

/// Returns a slice of all initialized elements in the buffer, and sets the length of the buffer back to 0.
///
/// It is the caller's responsibility to ensure that all elements in the returned slice get dropped!
pub fn drain_as_mut_slice(&mut self) -> &mut [T] {
// Prevent panic in self.buf[0] below.
if N == 0 {
return &mut [];
}
debug_assert!(self.len <= N);

let len = self.len;
self.len = 0;

// MaybeUninit::slice_assume_init_ref could be used here instead, but it's experimental.
//
// SAFETY:
// - The pointer is non-null, valid and aligned.
// - `len` elements are always initialized.
// - The memory is not accessed through any other pointer, because we hold a `&mut` reference to `self`.
// - `len * mem::size_of::<T>()` is no larger than `isize::MAX`, otherwise the `buf` slice could not have existed either.
unsafe { std::slice::from_raw_parts_mut(self.buf[0].as_mut_ptr(), len) }
}
}

impl<T, const N: usize> Drop for ExtendBuffer<T, N> {
fn drop(&mut self) {
// Prevent panic in self.buf[0] below.
if N == 0 {
return;
}
debug_assert!(self.len <= N);

// SAFETY: `slice_from_raw_parts_mut` by itself is not unsafe, but to make the resulting slice safe to use:
// - `self.buf[0]` is a valid pointer, exactly `self.len` elements are initialized.
// - The pointer is not aliased since we have an exclusive `&mut self`.
let slice = ptr::slice_from_raw_parts_mut(self.buf[0].as_mut_ptr(), self.len);

// SAFETY: the value is valid because the `slice_from_raw_parts_mut` requirements are met,
// and there is no other way to access the value.
unsafe {
ptr::drop_in_place(slice);
}
}
}

#[test]
fn test_extend_buffer_drop() {
// We use an `Rc` to test the buffer's `drop` behavior.
use std::rc::Rc;

let mut buf = ExtendBuffer::<Rc<i32>, 1>::default();
let value = Rc::new(42);
buf.push(Rc::clone(&value));

// The buffer contains one strong reference, this function contains another.
assert_eq!(Rc::strong_count(&value), 2);

let slice = buf.drain_as_mut_slice();

// The strong reference has been returned in the slice, but not dropped.
assert_eq!(Rc::strong_count(&value), 2);

// SAFETY:
// - The slice returned by `drain_as_mut_slice` is valid, and therefore so is its first element.
// - There is no way to access parts of `slice[0]` while `drop_in_place` is executing.
unsafe {
ptr::drop_in_place(&mut slice[0]);
}

// The reference held by the slice has now been dropped.
assert_eq!(Rc::strong_count(&value), 1);

drop(buf);

// The buffer has not dropped another reference.
assert_eq!(Rc::strong_count(&value), 1);
}
1 change: 1 addition & 0 deletions godot-core/src/builtin/collections/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

mod array;
mod dictionary;
mod extend_buffer;
mod packed_array;

// Re-export in godot::builtin.
Expand Down
139 changes: 123 additions & 16 deletions godot-core/src/builtin/collections/packed_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@

use godot_ffi as sys;

use crate::builtin::collections::extend_buffer::ExtendBuffer;
use crate::builtin::*;
use crate::meta::{AsArg, ToGodot};
use std::mem::size_of;
use std::{fmt, ops, ptr};
use sys::types::*;
use sys::{ffi_methods, interface_fn, GodotFfi};
Expand Down Expand Up @@ -380,17 +382,18 @@ macro_rules! impl_packed_array {
array
}

/// Drops all elements in `self` and replaces them with data from an array of values.
/// Drops all elements in `self` starting from `dst` and replaces them with data from an array of values.
/// `dst` must be a valid index, even if `len` is zero.
///
/// # Safety
///
/// * Pointer must be valid slice of data with `len` size.
/// * Pointer must not point to `self` data.
/// * Length must be equal to `self.len()`.
/// * `src` must be valid slice of data with `len` size.
/// * `src` must not point to `self` data.
/// * `len` must be equal to `self.len() - dst`.
/// * Source data must not be dropped later.
unsafe fn move_from_slice(&mut self, src: *const $Element, len: usize) {
let ptr = self.ptr_mut(0);
debug_assert_eq!(len, self.len(), "length precondition violated");
unsafe fn move_from_slice(&mut self, src: *const $Element, dst: usize, len: usize) {
let ptr = self.ptr_mut(dst);
debug_assert_eq!(len, self.len() - dst, "length precondition violated");
// Drops all elements in place. Drop impl must not panic.
ptr::drop_in_place(ptr::slice_from_raw_parts_mut(ptr, len));
// Copy is okay since all elements are dropped.
Expand Down Expand Up @@ -457,7 +460,7 @@ macro_rules! impl_packed_array {

// SAFETY: The packed array contains exactly N elements and the source array will be forgotten.
unsafe {
packed_array.move_from_slice(arr.as_ptr(), N);
packed_array.move_from_slice(arr.as_ptr(), 0, N);
}
packed_array
}
Expand All @@ -476,13 +479,21 @@ macro_rules! impl_packed_array {
// The vector is forcibly set to empty, so its contents are forgotten.
unsafe {
vec.set_len(0);
array.move_from_slice(vec.as_ptr(), len);
array.move_from_slice(vec.as_ptr(), 0, len);
}
array
}
}

#[doc = concat!("Creates a `", stringify!($PackedArray), "` from an iterator.")]
///
/// # Performance note
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns
/// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding
/// them all efficiently as a batch.
///
/// # Panics
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol).
impl FromIterator<$Element> for $PackedArray {
fn from_iter<I: IntoIterator<Item = $Element>>(iter: I) -> Self {
let mut array = $PackedArray::default();
Expand All @@ -491,16 +502,103 @@ macro_rules! impl_packed_array {
}
}

#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator")]
#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator.")]
///
/// # Performance note
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns
/// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding
/// them all efficiently as a batch.
///
/// # Panics
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol).
impl Extend<$Element> for $PackedArray {
fn extend<I: IntoIterator<Item = $Element>>(&mut self, iter: I) {
// Unfortunately the GDExtension API does not offer the equivalent of `Vec::reserve`.
// Otherwise we could use it to pre-allocate based on `iter.size_hint()`.
// This function is complicated, but with good reason. The problem is that we don't know the length of
// the `Iterator` ahead of time; all we get is its `size_hint()`.
//
// There are at least two categories of iterators that are common in the wild, for which we'd want good performance:
//
// 1. The length is known: `size_hint()` returns the exact size, e.g. just iterating over a `Vec` or `BTreeSet`.
// 2. The length is unknown: `size_hint()` returns 0, e.g. `Filter`, `FlatMap`, `FromFn`.
//
// A number of implementations are possible, which were benchmarked for 1000 elements of type `i32`:
//
// - Simply call `push()` in a loop:
// 6.1 µs whether or not the length is known.
// - First `collect()` the `Iterator` into a `Vec`, call `self.resize()` to make room, then move out of the `Vec`:
// 0.78 µs if the length is known, 1.62 µs if the length is unknown.
// It also requires additional temporary memory to hold all elements.
// - The strategy implemented below:
// 0.097 µs if the length is known, 0.49 µs if the length is unknown.
//
// A faster implementation using `resize()` and direct pointer writes might still be
// possible.
for item in iter.into_iter() {
self.push(meta::ParamType::owned_to_arg(item));
// The implementation of `Vec` in the standard library deals with this by repeatedly `reserve()`ing
// whatever `size_hint()` returned, but we don't want to do that because the Godot API call to
// `self.resize()` is relatively slow.

let mut iter = iter.into_iter();
// Cache the length to avoid repeated Godot API calls.
let mut len = self.len();

// Fast part.
//
// Use `Iterator::size_hint()` to pre-allocate the minimum number of elements in the iterator, then
// write directly to the resulting slice. We can do this because `size_hint()` is required by the
// `Iterator` contract to return correct bounds. Note that any bugs in it must not result in UB.
let (size_hint_min, _size_hint_max) = iter.size_hint();
if size_hint_min > 0 {
let capacity = len + size_hint_min;
self.resize(capacity);
for out_ref in &mut self.as_mut_slice()[len..] {
*out_ref = iter.next().expect("iterator returned fewer than size_hint().0 elements");
}
len = capacity;
}

// Slower part.
//
// While the iterator is still not finished, gather elements into a fixed-size buffer, then add them all
// at once.
//
// Why not call `self.resize()` with fixed-size increments, like 32 elements at a time? Well, we might
// end up over-allocating, and then need to trim the array length back at the end. Because Godot
// allocates memory in steps of powers of two, this might end up with an array backing storage that is
// twice as large as it needs to be. By first gathering elements into a buffer, we can tell Godot to
// allocate exactly as much as we need, and no more.
//
// Note that we can't get by with simple memcpys, because `PackedStringArray` contains `GString`, which
// does not implement `Copy`.
//
// Buffer size: 2 kB is enough for the performance win, without needlessly blowing up the stack size.
// (A cursory check shows that most/all platforms use a stack size of at least 1 MB.)
const BUFFER_SIZE_BYTES: usize = 2048;
const BUFFER_CAPACITY: usize = const_max(
1,
BUFFER_SIZE_BYTES / size_of::<$Element>(),
);
let mut buf = ExtendBuffer::<_, BUFFER_CAPACITY>::default();
while let Some(item) = iter.next() {
buf.push(item);
while !buf.is_full() {
if let Some(item) = iter.next() {
buf.push(item);
} else {
break;
}
}

let buf_slice = buf.drain_as_mut_slice();
let capacity = len + buf_slice.len();

// Assumption: resize does not panic. Otherwise we would leak memory here.
self.resize(capacity);

// SAFETY: Dropping the first `buf_slice.len()` items is safe, because those are exactly the ones we initialized.
// Writing output is safe because we just allocated `buf_slice.len()` new elements after index `len`.
unsafe {
self.move_from_slice(buf_slice.as_ptr(), len, buf_slice.len());
}

len = capacity;
}
}
}
Expand Down Expand Up @@ -1071,3 +1169,12 @@ fn populated_or_err(array: PackedByteArray) -> Result<PackedByteArray, ()> {
Ok(array)
}
}

/// Helper because `usize::max()` is not const.
const fn const_max(a: usize, b: usize) -> usize {
if a > b {
a
} else {
b
}
}
22 changes: 21 additions & 1 deletion itest/rust/src/benchmarks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
use std::hint::black_box;

use godot::builtin::inner::InnerRect2i;
use godot::builtin::{GString, Rect2i, StringName, Vector2i};
use godot::builtin::{GString, PackedInt32Array, Rect2i, StringName, Vector2i};
use godot::classes::{Node3D, Os, RefCounted};
use godot::obj::{Gd, InstanceId, NewAlloc, NewGd};
use godot::register::GodotClass;
Expand Down Expand Up @@ -93,6 +93,26 @@ fn utilities_ffi_call() -> f64 {
godot::global::pow(base, exponent)
}

#[bench(repeat = 25)]
fn packed_array_from_iter_known_size() -> PackedInt32Array {
// Create an iterator whose `size_hint()` returns `(len, Some(len))`.
PackedInt32Array::from_iter(0..100)
}

#[bench(repeat = 25)]
fn packed_array_from_iter_unknown_size() -> PackedInt32Array {
// Create an iterator whose `size_hint()` returns `(0, None)`.
let mut item = 0;
PackedInt32Array::from_iter(std::iter::from_fn(|| {
item += 1;
if item <= 100 {
Some(item)
} else {
None
}
}))
}

// ----------------------------------------------------------------------------------------------------------------------------------------------
// Helpers for benchmarks above

Expand Down
Loading
Loading