From 94902d14b281e0086858a0db27f51ea399cb1cca Mon Sep 17 00:00:00 2001
From: Kevin Reid
Date: Sun, 12 Jan 2025 20:38:46 -0800
Subject: [PATCH] Expose `StagingBelt` allocation directly.

This allows using `StagingBelt` for copying to textures or any other
operation where copying to an existing buffer is not the desired outcome.

One unfortunate feature of the API is that `allocate()` returns the entire
buffer and an offset, so applications could accidentally touch parts of the
belt buffer outside the intended allocation. It might make more sense to
return `wgpu::BufferSlice`, but that struct cannot be used in operations
like `copy_buffer_to_texture` and does not have getters, so it is not
currently suitable for that purpose.

I also moved `Exclusive` into a module so that its unsafe-to-access field
is properly private, and made various improvements to the `StagingBelt`
documentation, such as acknowledging that `write_buffer_with()` exists.
---
 wgpu/src/util/belt.rs | 156 ++++++++++++++++++++++++++++++------------
 1 file changed, 114 insertions(+), 42 deletions(-)

diff --git a/wgpu/src/util/belt.rs b/wgpu/src/util/belt.rs
index d6ef7a0c46..3435e3ed3c 100644
--- a/wgpu/src/util/belt.rs
+++ b/wgpu/src/util/belt.rs
@@ -5,49 +5,27 @@ use crate::{
 use std::fmt;
 use std::sync::{mpsc, Arc};
 
-struct Chunk {
-    buffer: Arc<Buffer>,
-    size: BufferAddress,
-    offset: BufferAddress,
-}
-
-/// `Sync` wrapper that works by providing only exclusive access.
-///
-/// See https://doc.rust-lang.org/nightly/std/sync/struct.Exclusive.html
-struct Exclusive<T>(T);
-
-unsafe impl<T> Sync for Exclusive<T> {}
-
-impl<T> Exclusive<T> {
-    fn new(value: T) -> Self {
-        Self(value)
-    }
-
-    fn get_mut(&mut self) -> &mut T {
-        &mut self.0
-    }
-}
-
 /// Efficiently performs many buffer writes by sharing and reusing temporary buffers.
 ///
 /// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
-/// It has an advantage over [`Queue::write_buffer()`] in a way that it returns a mutable slice,
-/// which you can fill to avoid an extra data copy.
+/// Its advantage over [`Queue::write_buffer_with()`] is that the individual allocations
+/// are cheaper; `StagingBelt` is most useful when you are writing very many small pieces
+/// of data. It can be understood as a sort of arena allocator.
 ///
 /// Using a staging belt is slightly complicated, and generally goes as follows:
-/// 1. Write to buffers that need writing to using [`StagingBelt::write_buffer()`].
+/// 1. Use [`StagingBelt::write_buffer()`] or [`StagingBelt::allocate()`] to allocate
+///    buffer slices, then write your data to them.
 /// 2. Call [`StagingBelt::finish()`].
 /// 3. Submit all command encoders that were used in step 1.
 /// 4. Call [`StagingBelt::recall()`].
 ///
-/// [`Queue::write_buffer()`]: crate::Queue::write_buffer
+/// [`Queue::write_buffer_with()`]: crate::Queue::write_buffer_with
 pub struct StagingBelt {
     chunk_size: BufferAddress,
     /// Chunks into which we are accumulating data to be transferred.
     active_chunks: Vec<Chunk>,
     /// Chunks that have scheduled transfers already; they are unmapped and some
-    /// command encoder has one or more `copy_buffer_to_buffer` commands with them
-    /// as source.
+    /// command encoder has one or more commands with them as source.
     closed_chunks: Vec<Chunk>,
     /// Chunks that are back from the GPU and ready to be mapped for write and put
     /// into `active_chunks`.
@@ -70,7 +48,7 @@ impl StagingBelt {
     ///   (per [`StagingBelt::finish()`]); and
     /// * bigger is better, within these bounds.
     pub fn new(chunk_size: BufferAddress) -> Self {
-        let (sender, receiver) = std::sync::mpsc::channel();
+        let (sender, receiver) = mpsc::channel();
         StagingBelt {
             chunk_size,
             active_chunks: Vec::new(),
@@ -81,7 +59,7 @@ impl StagingBelt {
         }
     }
 
-    /// Allocate the staging belt slice of `size` to be uploaded into the `target` buffer
+    /// Allocate a staging belt slice of `size` to be copied into the `target` buffer
     /// at the specified offset.
     ///
     /// The upload will be placed into the provided command encoder. This encoder
@@ -99,10 +77,57 @@ impl StagingBelt {
         size: BufferSize,
         device: &Device,
     ) -> BufferViewMut<'_> {
+        let (mapped, belt_buffer, offset_in_belt_buffer) = self.allocate(
+            size,
+            const { BufferSize::new(crate::COPY_BUFFER_ALIGNMENT).unwrap() },
+            device,
+        );
+        encoder.copy_buffer_to_buffer(
+            belt_buffer,
+            offset_in_belt_buffer,
+            target,
+            offset,
+            size.get(),
+        );
+        mapped
+    }
+
+    /// Allocate a staging belt slice with the given `size` and `alignment` and return it.
+    ///
+    /// This allows you to do whatever you want with the slice after, such as
+    /// copying it to a texture or executing a compute shader that reads it, whereas
+    /// [`StagingBelt::write_buffer()`] can only write to other buffers.
+    ///
+    /// If the `size` is greater than the size of any free internal buffer, a new buffer
+    /// will be allocated for it. Therefore, the `chunk_size` passed to [`StagingBelt::new()`]
+    /// should ideally be larger than every such size.
+    ///
+    /// Three values are returned:
+    ///
+    /// * The mapped buffer view which you should write into from the CPU side (Rust code).
+    /// * The buffer containing the slice, for you to use in GPU commands.
+    ///   All commands involving this slice must be submitted after
+    ///   [`StagingBelt::finish()`] is called and before [`StagingBelt::recall()`] is called.
+    /// * The offset within the buffer at which the slice starts.
+    ///   This offset should be used for all GPU commands, and should not be used with
+    ///   the mapped buffer view.
+    pub fn allocate(
+        &mut self,
+        size: BufferSize,
+        alignment: BufferSize,
+        device: &Device,
+    ) -> (BufferViewMut<'_>, &Buffer, BufferAddress) {
+        assert!(
+            alignment.get().is_power_of_two(),
+            "alignment must be a power of two, not {alignment}"
+        );
+        // At minimum, we must have alignment sufficient to map the buffer.
+        let alignment = alignment.get().max(crate::MAP_ALIGNMENT);
+
         let mut chunk = if let Some(index) = self
             .active_chunks
             .iter()
-            .position(|chunk| chunk.offset + size.get() <= chunk.size)
+            .position(|chunk| chunk.can_allocate(size, alignment))
         {
             self.active_chunks.swap_remove(index)
         } else {
@@ -111,7 +136,7 @@ impl StagingBelt {
         if let Some(index) = self
             .free_chunks
             .iter()
-            .position(|chunk| size.get() <= chunk.size)
+            .position(|chunk| chunk.can_allocate(size, alignment))
         {
             self.free_chunks.swap_remove(index)
         } else {
@@ -129,17 +154,19 @@ impl StagingBelt {
             }
         };
 
-        encoder.copy_buffer_to_buffer(&chunk.buffer, chunk.offset, target, offset, size.get());
-        let old_offset = chunk.offset;
-        chunk.offset = align_to(chunk.offset + size.get(), crate::MAP_ALIGNMENT);
+        let allocation_offset = chunk.allocate(size, alignment);
 
         self.active_chunks.push(chunk);
-        self.active_chunks
-            .last()
-            .unwrap()
-            .buffer
-            .slice(old_offset..old_offset + size.get())
-            .get_mapped_range_mut()
+        let chunk = self.active_chunks.last().unwrap();
+
+        (
+            chunk
+                .buffer
+                .slice(allocation_offset..allocation_offset + size.get())
+                .get_mapped_range_mut(),
+            &chunk.buffer,
+            allocation_offset,
+        )
     }
 
     /// Prepare currently mapped buffers for use in a submission.
@@ -197,3 +224,48 @@ impl fmt::Debug for StagingBelt {
             .finish_non_exhaustive()
     }
 }
+
+struct Chunk {
+    buffer: Arc<Buffer>,
+    size: BufferAddress,
+    offset: BufferAddress,
+}
+
+impl Chunk {
+    fn can_allocate(&self, size: BufferSize, alignment: BufferAddress) -> bool {
+        let alloc_start = align_to(self.offset, alignment);
+        let alloc_end = alloc_start + size.get();
+
+        alloc_end <= self.size
+    }
+
+    fn allocate(&mut self, size: BufferSize, alignment: BufferAddress) -> BufferAddress {
+        let alloc_start = align_to(self.offset, alignment);
+        let alloc_end = alloc_start + size.get();
+
+        assert!(alloc_end <= self.size);
+        self.offset = alloc_end;
+        alloc_start
+    }
+}
+
+use exclusive::Exclusive;
+mod exclusive {
+    /// `Sync` wrapper that works by providing only exclusive access.
+    ///
+    /// See <https://doc.rust-lang.org/nightly/std/sync/struct.Exclusive.html>
+    pub(super) struct Exclusive<T>(T);
+
+    /// Safety: `&Exclusive<T>` has no operations.
+    unsafe impl<T> Sync for Exclusive<T> {}
+
+    impl<T> Exclusive<T> {
+        pub fn new(value: T) -> Self {
+            Self(value)
+        }
+
+        pub fn get_mut(&mut self) -> &mut T {
+            &mut self.0
+        }
+    }
+}
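Usage note (not part of the patch): a minimal sketch of how the new `allocate()`
method could drive the buffer-to-texture copy the commit message mentions. The
bindings `belt`, `device`, `queue`, `texture`, `data`, `bytes_per_row`, and
`extent` are placeholders, and the `TexelCopyBufferInfo`/`TexelCopyBufferLayout`
names assume a recent wgpu release:

    // Stage the texel data in the belt. COPY_BUFFER_ALIGNMENT is assumed to be
    // enough alignment for this copy; some formats may need a stricter value.
    let size = wgpu::BufferSize::new(data.len() as u64).unwrap();
    let alignment = wgpu::BufferSize::new(wgpu::COPY_BUFFER_ALIGNMENT).unwrap();
    let (mut view, belt_buffer, offset) = belt.allocate(size, alignment, &device);
    view.copy_from_slice(&data);
    drop(view); // end this borrow of the belt; the buffer stays mapped until finish()

    // Encode a copy from the belt's buffer, at the returned offset, into the texture.
    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
    encoder.copy_buffer_to_texture(
        wgpu::TexelCopyBufferInfo {
            buffer: belt_buffer,
            layout: wgpu::TexelCopyBufferLayout {
                offset, // the GPU-side offset from allocate(); not used with `view`
                bytes_per_row: Some(bytes_per_row), // multiple of COPY_BYTES_PER_ROW_ALIGNMENT
                rows_per_image: None,
            },
        },
        texture.as_image_copy(),
        extent,
    );

    // Unmap the belt, submit, then reclaim chunks once the GPU is done with them.
    belt.finish();
    queue.submit([encoder.finish()]);
    belt.recall();

The submission order here follows the documented contract: the copy is encoded
before `finish()`, but the encoder is submitted after `finish()` and before
`recall()`.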