diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp
index c756809f7c..50bcc01db6 100644
--- a/examples/alpaka/vectoradd/vectoradd.cpp
+++ b/examples/alpaka/vectoradd/vectoradd.cpp
@@ -152,12 +152,10 @@ try
     }
     chrono.printAndReset("Init");
 
-    const auto blobCount = decltype(mapping)::blobCount;
-    for(std::size_t i = 0; i < blobCount; i++)
-    {
-        alpaka::memcpy(queue, devA.blobs()[i], hostA.blobs()[i]);
-        alpaka::memcpy(queue, devB.blobs()[i], hostB.blobs()[i]);
-    }
+    auto copyBlobAlpaka
+        = [&](const auto& srcBlob, auto& dstBlob, std::size_t size) { alpaka::memcpy(queue, dstBlob, srcBlob, size); };
+    llama::copyBlobs(hostA, devA, copyBlobAlpaka);
+    llama::copyBlobs(hostB, devB, copyBlobAlpaka);
     chrono.printAndReset("Copy H->D");
 
     const auto workdiv = alpaka::getValidWorkDiv<Acc>(devAcc, problemSize, elements, false);
@@ -171,11 +169,8 @@
     }
     plotFile << "\"LLAMA " << mappingname << "\"\t" << stats.mean() << "\t" << stats.sem() << '\n';
 
-    for(std::size_t i = 0; i < blobCount; i++)
-    {
-        alpaka::memcpy(queue, hostA.blobs()[i], devA.blobs()[i]);
-        alpaka::memcpy(queue, hostB.blobs()[i], devB.blobs()[i]);
-    }
+    llama::copyBlobs(devA, hostA, copyBlobAlpaka);
+    llama::copyBlobs(devB, hostB, copyBlobAlpaka);
     chrono.printAndReset("Copy D->H");
 }
 catch(const std::exception& e)
diff --git a/include/llama/Copy.hpp b/include/llama/Copy.hpp
index 26d2d6087e..f98b710121 100644
--- a/include/llama/Copy.hpp
+++ b/include/llama/Copy.hpp
@@ -26,15 +26,19 @@ namespace llama
             });
         }
 
-        using memcopyFunc = void* (*) (void*, const void*, std::size_t);
+        // need a custom memcpy symbol in LLAMA, because with clang+CUDA, there are multiple std::memcpy symbols, so
+        // the address is ambiguous.
+        inline constexpr auto memcpy
+            = [](void* dst, const void* src, std::size_t size) { std::memcpy(dst, src, size); };
 
-        inline void parallelMemcpy(
+        template<typename MemcpyFunc = decltype(&memcpy)>
+        void parallelMemcpy(
             std::byte* dst,
             const std::byte* src,
             std::size_t size,
             std::size_t threadId = 0,
             std::size_t threadCount = 1,
-            memcopyFunc singleThreadMemcpy = std::memcpy)
+            MemcpyFunc singleThreadMemcpy = &memcpy)
         {
             const auto sizePerThread = size / threadCount;
             const auto sizeLastThread = sizePerThread + size % threadCount;
@@ -43,17 +47,19 @@
         }
     } // namespace internal
 
-    /// Direct memcpy from source view blobs to destination view blobs. Both views need to have the same mappings with
-    /// the same array dimensions.
-    /// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
-    /// @param threadCount Optional. Thread count in case of multi-threaded invocation.
+    /// Copy the blobs' content from the source view to the destination view in parallel with the given thread
+    /// configuration. Both views need to have the same mappings with the same array extents.
+    /// @param threadId Zero-based id of calling thread for multi-threaded invocations.
+    /// @param threadCount Thread count in case of multi-threaded invocation.
+    /// \param singleThreadMemcpy The implementation of memcpy. By default: std::memcpy.
 LLAMA_EXPORT
-    template<typename Mapping, typename BlobType>
-    void blobMemcpy(
+    template<typename Mapping, typename BlobType, typename MemcpyFunc = decltype(internal::memcpy)>
+    void memcpyBlobs(
         const View<Mapping, BlobType>& srcView,
         View<Mapping, BlobType>& dstView,
         std::size_t threadId = 0,
-        std::size_t threadCount = 1)
+        std::size_t threadCount = 1,
+        MemcpyFunc singleThreadMemcpy = internal::memcpy)
     {
         internal::assertTrivialCopyable<typename Mapping::RecordDim>();
 
@@ -68,7 +74,40 @@
                 &srcView.blobs()[i][0],
                 dstView.mapping().blobSize(i),
                 threadId,
-                threadCount);
+                threadCount,
+                singleThreadMemcpy);
     }
 
+    namespace internal
+    {
+        inline constexpr auto copyBlobWithMemcpy = [](const auto& src, auto& dst, std::size_t size)
+        {
+            static_assert(std::is_trivially_copyable_v<std::decay_t<decltype(src[0])>>);
+            static_assert(std::is_trivially_copyable_v<std::decay_t<decltype(dst[0])>>);
+            std::memcpy(&dst[0], &src[0], size);
+        };
+    } // namespace internal
+
+    /// Copy the blobs' content from the source view to the destination view. Both views need to have the same mapping,
+    /// and thus the same blob count and blob sizes. The copy is performed blob by blob.
+    /// \param copyBlob The function to use for copying blobs. Default is \ref internal::copyBlobWithMemcpy, which uses
+    /// std::memcpy.
+    LLAMA_EXPORT
+    template<
+        typename Mapping,
+        typename SrcBlob,
+        typename DstBlob,
+        typename BlobCopyFunc = decltype(internal::copyBlobWithMemcpy)>
+    void copyBlobs(
+        const View<Mapping, SrcBlob>& srcView,
+        View<Mapping, DstBlob>& dstView,
+        BlobCopyFunc copyBlob = internal::copyBlobWithMemcpy)
+    {
+        // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions
+        if(srcView.extents() != dstView.extents())
+            throw std::runtime_error{"Array dimensions sizes are different"};
+        for(std::size_t i = 0; i < Mapping::blobCount; i++)
+            copyBlob(srcView.blobs()[i], dstView.blobs()[i], dstView.mapping().blobSize(i));
+    }
 
     /// Field-wise copy from source to destination view. Both views need to have the same array and record dimensions.
@@ -308,7 +347,8 @@
         template<typename SrcView, typename DstView>
         void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
         {
-            blobMemcpy(srcView, dstView, threadId, threadCount);
+            // FIXME(bgruber): need to fallback to fieldWiseCopy when elements are not trivially copyable
+            memcpyBlobs(srcView, dstView, threadId, threadCount);
         }
     };
 
@@ -400,8 +440,9 @@
         }
     };
 
-    /// Copy data from source view to destination view. Both views need to have the same array and record
-    /// dimensions. Delegates to \ref Copy to choose an implementation.
+    /// Copy data from source to destination view. Both views need to have the same array and record
+    /// dimensions, but may have different mappings. The blobs need to be read- and writeable. Delegates to \ref Copy
+    /// to choose an implementation.
     /// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
     /// @param threadCount Optional. Thread count in case of multi-threaded invocation.
 LLAMA_EXPORT
diff --git a/tests/copy.cpp b/tests/copy.cpp
index 5eea7a027a..0bba822bf4 100644
--- a/tests/copy.cpp
+++ b/tests/copy.cpp
@@ -3,6 +3,8 @@
 
 #include "common.hpp"
 
+#include <thread>
+
 namespace
 {
     using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 2>;
@@ -92,9 +94,43 @@ TEMPLATE_LIST_TEST_CASE("copy", "", AllMappingsProduct)
 }
 
 // NOLINTNEXTLINE(cert-err58-cpp)
-TEMPLATE_LIST_TEST_CASE("blobMemcpy", "", AllMappings)
+TEMPLATE_LIST_TEST_CASE("memcpyBlobs_default", "", AllMappings)
+{
+    testCopy([](const auto& srcView, auto& dstView) { llama::memcpyBlobs(srcView, dstView); });
+}
+
+TEMPLATE_LIST_TEST_CASE("memcpyBlobs_3threads", "", AllMappings)
+{
+    testCopy(
+        [](const auto& srcView, auto& dstView)
+        {
+            std::thread t1{[&] { llama::memcpyBlobs(srcView, dstView, 0, 3); }};
+            std::thread t2{[&] { llama::memcpyBlobs(srcView, dstView, 1, 3); }};
+            std::thread t3{[&] { llama::memcpyBlobs(srcView, dstView, 2, 3); }};
+            t1.join();
+            t2.join();
+            t3.join();
+        });
+}
+
+// NOLINTNEXTLINE(cert-err58-cpp)
+TEMPLATE_LIST_TEST_CASE("copyBlobs_default", "", AllMappings)
+{
+    testCopy([](const auto& srcView, auto& dstView) { llama::copyBlobs(srcView, dstView); });
+}
+
+// NOLINTNEXTLINE(cert-err58-cpp)
+TEMPLATE_LIST_TEST_CASE("copyBlobs_stdcopy", "", AllMappings)
 {
-    testCopy([](const auto& srcView, auto& dstView) { llama::blobMemcpy(srcView, dstView); });
+    testCopy(
+        [](const auto& srcView, auto& dstView)
+        {
+            llama::copyBlobs(
+                srcView,
+                dstView,
+                [](const auto& srcBlob, auto& dstBlob, std::size_t size)
+                { std::copy(&srcBlob[0], &srcBlob[0] + size, &dstBlob[0]); });
+        });
 }
 
 // NOLINTNEXTLINE(cert-err58-cpp)