-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[libc] Improve memcpy for ARM Cortex-M supporting unaligned accesses. #144872
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
gchatelet
wants to merge
2
commits into
llvm:main
Choose a base branch
from
gchatelet:update_arm_memcpy
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+143
−2
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-libc Author: Guillaume Chatelet (gchatelet) Changes: Full diff: https://github.com/llvm/llvm-project/pull/144872.diff 6 Files Affected:
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index 253843e5e37aa..f7133c94b405d 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -10,7 +10,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
#define LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG
@@ -30,8 +30,10 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#if defined(LIBC_COMPILER_IS_CLANG)
#define LIBC_LOOP_NOUNROLL _Pragma("nounroll")
+#define LIBC_LOOP_UNROLL _Pragma("unroll")
#elif defined(LIBC_COMPILER_IS_GCC)
#define LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0")
+#define LIBC_LOOP_UNROLL _Pragma("GCC unroll 2048")
#else
#error "Unhandled compiler"
#endif
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 08c0b0d34d503..a967247db53f4 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -7,6 +7,7 @@ add_header_library(
aarch64/inline_memcpy.h
aarch64/inline_memmove.h
aarch64/inline_memset.h
+ arm/inline_memcpy.h
generic/aligned_access.h
generic/byte_per_byte.h
inline_bcmp.h
diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h
new file mode 100644
index 0000000000000..be8bc6459b6c4
--- /dev/null
+++ b/libc/src/string/memory_utils/arm/inline_memcpy.h
@@ -0,0 +1,127 @@
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
+
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL, LIBC_LOOP_UNROLL
+#include "src/string/memory_utils/utils.h" // Ptr, CPtr, memcpy_inline, distance_to_align_*
+
+#include <stddef.h> // size_t
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); // Size of one 32-bit word: the widest single copy this file emits.
+
+template <size_t bytes> // Copies `bytes` bytes from `src` to `dst`, then advances both pointers by `bytes`.
+LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
+ if constexpr (bytes == 1 || bytes == 2 || bytes == 4) {
+ memcpy_inline<bytes>(dst, src);
+ } else {
+ // We restrict loads/stores to 4 bytes to prevent the use of load/store
+ // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
+ // fault (see notes below) and second, they use more registers which in turn
+ // adds push/pop instructions in the hot path.
+ static_assert(bytes % kWordSize == 0); // Larger sizes must be a whole number of words.
+ LIBC_LOOP_UNROLL
+ for (size_t i = 0; i < bytes / kWordSize; ++i) {
+ const uintptr_t offset = i * kWordSize;
+ memcpy_inline<kWordSize>(dst + offset, src + offset);
+ }
+ }
+ // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
+ // into the load/store instructions.
+ // e.g.,
+ // ldrb r3, [r1], #1
+ // strb r3, [r0], #1
+ dst += bytes;
+ src += bytes;
+}
+
+template <size_t block_size> // Copies every whole `block_size`-byte block that fits in `size`, advancing `dst` and `src`.
+LIBC_INLINE void copy_blocks(Ptr &dst, CPtr &src, size_t &size) {
+ LIBC_LOOP_NOUNROLL
+ for (size_t i = 0; i < size / block_size; ++i)
+ copy_and_bump_pointers<block_size>(dst, src);
+ // Update `size` once at the end instead of once per iteration; on return it holds the bytes still to copy (< block_size).
+ size %= block_size;
+}
+
+LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { // ORs the two addresses so that a single misalignment test can cover both pointers.
+ return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
+ cpp::bit_cast<uintptr_t>(b));
+}
+
+LIBC_INLINE auto misaligned(CPtr a) { // Used as a boolean by callers; presumably non-zero when `a` is not a multiple of kWordSize (confirm against distance_to_align_down).
+ return distance_to_align_down<kWordSize>(a);
+}
+
+} // namespace
+
+// Implementation for Cortex-M0, M0+, M1.
+// The implementation makes sure that all accesses are aligned.
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src,
+ size_t size) {
+ // For now, a placeholder implementation that copies one byte at a time (single-byte accesses have no alignment requirement).
+ LIBC_LOOP_NOUNROLL
+ for (size_t i = 0; i < size; ++i)
+ dst[i] = src[i];
+}
+
+// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
+// support for unaligned loads and stores.
+// Notes:
+// - It compiles down to <300 bytes.
+// - `dst` and `src` are not `__restrict` to prevent the compiler from
+// reordering loads/stores.
+// - We keep state variables to a strict minimum to keep everything in the free
+// registers and prevent costly push / pop.
+// - Even when unaligned single loads/stores to normal memory are supported,
+// unaligned accesses for load/store multiple (LDM, STM) and load/store double
+// (LDRD, STRD) instructions are generally not supported and will still fault,
+// so we make sure to restrict unrolling to word loads/stores.
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
+ size_t size) {
+ if (misaligned(bitwise_or(src, dst))) [[unlikely]] { // Slow path: taken when `src | dst` is reported as misaligned.
+ if (size < 8) [[unlikely]] { // Tiny copy: emit the 1-, 2- and 4-byte parts of `size`, then stop.
+ if (size & 1)
+ copy_and_bump_pointers<1>(dst, src);
+ if (size & 2)
+ copy_and_bump_pointers<2>(dst, src);
+ if (size & 4)
+ copy_and_bump_pointers<4>(dst, src);
+ return;
+ }
+ if (misaligned(src)) [[unlikely]] { // NOTE(review): the test is on `src` but the prologue below aligns `dst` - confirm this is intended.
+ const size_t offset = distance_to_align_up<kWordSize>(dst); // Presumably the 0..3 bytes up to the next word boundary of `dst`.
+ if (offset & 1)
+ copy_and_bump_pointers<1>(dst, src);
+ if (offset & 2)
+ copy_and_bump_pointers<2>(dst, src);
+ size -= offset; // `size` is >= 8 on this path, so this cannot underflow as long as offset < kWordSize.
+ }
+ }
+ copy_blocks<64>(dst, src, size); // Bulk copy in decreasing block sizes; each call leaves the remainder in `size`.
+ copy_blocks<16>(dst, src, size);
+ copy_blocks<4>(dst, src, size);
+ if (size & 1) // After the 4-byte blocks `size` is < 4: finish with a byte and/or a halfword.
+ copy_and_bump_pointers<1>(dst, src);
+ if (size & 2) [[unlikely]]
+ copy_and_bump_pointers<2>(dst, src);
+}
+
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
+ const void *__restrict src_,
+ size_t size) { // Entry point: converts to byte pointers and picks the variant at compile time.
+ Ptr dst = cpp::bit_cast<Ptr>(dst_);
+ CPtr src = cpp::bit_cast<CPtr>(src_);
+#ifdef __ARM_FEATURE_UNALIGNED // Unaligned accesses are available: use the word-copy variant.
+ return inline_memcpy_arm_mid_end(dst, src, size);
+#else // Otherwise use the byte-per-byte variant.
+ return inline_memcpy_arm_low_end(dst, src, size);
+#endif
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
diff --git a/libc/src/string/memory_utils/inline_memcpy.h b/libc/src/string/memory_utils/inline_memcpy.h
index f98e55321a9b5..13975e6b3bd0e 100644
--- a/libc/src/string/memory_utils/inline_memcpy.h
+++ b/libc/src/string/memory_utils/inline_memcpy.h
@@ -22,6 +22,9 @@
#include "src/string/memory_utils/x86_64/inline_memcpy.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY \
inline_memcpy_x86_maybe_interpose_repmovsb
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
+#include "src/string/memory_utils/arm/inline_memcpy.h"
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_arm
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_memcpy.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_aarch64
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index bdf0b8652188b..c08608c87bb25 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -101,7 +101,7 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
}
using Ptr = cpp::byte *; // Pointer to raw data.
-using CPtr = const cpp::byte *; // Const pointer to raw data.
+using CPtr = const cpp::byte *; // Pointer to const raw data.
// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 8e629270c89d2..9e2828eaf098c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -4218,6 +4218,7 @@ libc_support_library(
"src/string/memory_utils/aarch64/inline_memcpy.h",
"src/string/memory_utils/aarch64/inline_memmove.h",
"src/string/memory_utils/aarch64/inline_memset.h",
+ "src/string/memory_utils/arm/inline_memcpy.h",
"src/string/memory_utils/generic/aligned_access.h",
"src/string/memory_utils/generic/byte_per_byte.h",
"src/string/memory_utils/inline_bcmp.h",
|
petrhosek
reviewed
Jun 20, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
For best performance the following options can be set in the
libc/config/baremetal/arm/config.json
This implementation has been tested on a Raspberry Pi Pico 2 with the following options:
-mcpu=cortex-m33 --target=armv8m.main-none-eabi -mfloat-abi=softfp -march=armv8m.main+fp+dsp
and compiled with the Pigweed toolchain.