From fb82ad61e69bddac09ff0e6a48d13390ba01a768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96mer=20Sinan=20A=C4=9Facan?= Date: Tue, 24 Oct 2023 14:29:22 +0200 Subject: [PATCH] Use `setRange` when copying output chunks to the final buffer in `CodedBufferWriter` Similar to #885, this optimizes some more buffer copying to `memcpy`. Because the chunks can be very small, we use a simple loop as before on small chunks and `setRange` on large chunks. The threshold for using the loop (chunks of at most 20 bytes) is chosen somewhat arbitrarily. In many benchmarks the chunks are either too small (less than 10 bytes) or more than a hundred bytes, so in the benchmarks I've tried it doesn't matter whether the threshold is 20, 30, or 40. The slowdown in dart2js is probably caused by the allocation of the `Uint8List` that is passed to `setRange` as the source. Results from the same benchmark in #885. The "Before" numbers include the changes in #886. | | Before | After | Diff | |------------------------------|------------|------------|---------------------| | AOT | 122,741 us | 114,582 us | - 8,159 us, -6.6% | | JIT | 94,880 us | 92,317 us | - 2,483 us, -2.6% | | dart2js -O4 | 258,250 us | 266,000 us | + 7,750 us, +3.0% | | dart2wasm --omit-type-checks | 195,300 us | 169,166 us | -26,134 us, -13.3% | AOT and JIT tested on x64. --- .../lib/src/protobuf/coded_buffer_writer.dart | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/protobuf/lib/src/protobuf/coded_buffer_writer.dart b/protobuf/lib/src/protobuf/coded_buffer_writer.dart index e149fabd0..58656be6c 100644 --- a/protobuf/lib/src/protobuf/coded_buffer_writer.dart +++ b/protobuf/lib/src/protobuf/coded_buffer_writer.dart @@ -141,22 +141,32 @@ class CodedBufferWriter { } buffer[outPos++] = v; } else { - // action is an amount of bytes to copy from _outputChunks into the - // buffer. + // `action` is an amount of bytes to copy from `_outputChunks` into + // the buffer. 
var bytesToCopy = action; while (bytesToCopy > 0) { final Uint8List chunk = _outputChunks[chunkIndex]; final int bytesInChunk = _outputChunks[chunkIndex + 1]; - // Copy at most bytesToCopy bytes from the current chunk. + // Copy at most `bytesToCopy` bytes from the current chunk. final leftInChunk = bytesInChunk - chunkPos; final bytesToCopyFromChunk = leftInChunk > bytesToCopy ? bytesToCopy : leftInChunk; final endPos = chunkPos + bytesToCopyFromChunk; - while (chunkPos < endPos) { - buffer[outPos++] = chunk[chunkPos++]; + + if (bytesToCopyFromChunk <= 20) { + while (chunkPos < endPos) { + buffer[outPos++] = chunk[chunkPos++]; + } + bytesToCopy -= bytesToCopyFromChunk; + } else { + final chunkSlice = Uint8List.sublistView(chunk, chunkPos, endPos); + buffer.setRange( + outPos, outPos + bytesToCopyFromChunk, chunkSlice); + chunkPos += bytesToCopyFromChunk; + outPos += bytesToCopyFromChunk; + bytesToCopy -= bytesToCopyFromChunk; } - bytesToCopy -= bytesToCopyFromChunk; // Move to the next chunk if the current one is exhausted. if (chunkPos == bytesInChunk) {