Skip to content

Commit

Permalink
Generalize blob copying functions
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Dec 16, 2023
1 parent 7375de2 commit 26bfa0a
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 27 deletions.
17 changes: 6 additions & 11 deletions examples/alpaka/vectoradd/vectoradd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,10 @@ try
}
chrono.printAndReset("Init");

const auto blobCount = decltype(mapping)::blobCount;
for(std::size_t i = 0; i < blobCount; i++)
{
alpaka::memcpy(queue, devA.blobs()[i], hostA.blobs()[i]);
alpaka::memcpy(queue, devB.blobs()[i], hostB.blobs()[i]);
}
auto copyBlobAlpaka
= [&](const auto& srcBlob, auto& dstBlob, std::size_t size) { alpaka::memcpy(queue, dstBlob, srcBlob, size); };
llama::copyBlobs(hostA, devA, copyBlobAlpaka);
llama::copyBlobs(hostB, devB, copyBlobAlpaka);
chrono.printAndReset("Copy H->D");

const auto workdiv = alpaka::getValidWorkDiv<Acc>(devAcc, problemSize, elements, false);
Expand All @@ -171,11 +169,8 @@ try
}
plotFile << "\"LLAMA " << mappingname << "\"\t" << stats.mean() << "\t" << stats.sem() << '\n';

for(std::size_t i = 0; i < blobCount; i++)
{
alpaka::memcpy(queue, hostA.blobs()[i], devA.blobs()[i]);
alpaka::memcpy(queue, hostB.blobs()[i], devB.blobs()[i]);
}
llama::copyBlobs(devA, hostA, copyBlobAlpaka);
llama::copyBlobs(devB, hostB, copyBlobAlpaka);
chrono.printAndReset("Copy D->H");
}
catch(const std::exception& e)
Expand Down
69 changes: 55 additions & 14 deletions include/llama/Copy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,19 @@ namespace llama
});
}

// need a custom memcpy symbol in LLAMA, because with clang+CUDA, there are multiple std::memcpy symbols, so
// the address is ambiguous.
inline constexpr auto memcpy
    = [](void* dst, const void* src, std::size_t size) { std::memcpy(dst, src, size); };

inline void parallelMemcpy(
template<typename MemcpyFunc = decltype(memcpy)>
void parallelMemcpy(
std::byte* dst,
const std::byte* src,
std::size_t size,
std::size_t threadId = 0,
std::size_t threadCount = 1,
memcopyFunc singleThreadMemcpy = std::memcpy)
MemcpyFunc singleThreadMemcpy = &memcpy)
{
const auto sizePerThread = size / threadCount;
const auto sizeLastThread = sizePerThread + size % threadCount;
Expand All @@ -43,17 +47,19 @@ namespace llama
}
} // namespace internal

/// Direct memcpy from source view blobs to destination view blobs. Both views need to have the same mappings with
/// the same array dimensions.
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Optional. Thread count in case of multi-threaded invocation.
/// Copy the blobs' content from the source view to the destination view in parallel with the given thread
/// configuration. Both views need to have the same mappings with the same array extents.
/// @param threadId Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Thread count in case of multi-threaded invocation.
/// \param singleThreadMemcpy The implementation of memcpy. By default: std::memcpy.
LLAMA_EXPORT
template<typename Mapping, typename SrcBlob, typename DstBlob>
void blobMemcpy(
template<typename Mapping, typename SrcBlob, typename DstBlob, typename MemcpyFunc = decltype(internal::memcpy)>
void memcpyBlobs(
const View<Mapping, SrcBlob>& srcView,
View<Mapping, DstBlob>& dstView,
std::size_t threadId = 0,
std::size_t threadCount = 1)
std::size_t threadCount = 1,
MemcpyFunc singleThreadMemcpy = internal::memcpy)
{
internal::assertTrivialCopyable<typename Mapping::RecordDim>();

Expand All @@ -68,7 +74,40 @@ namespace llama
&srcView.blobs()[i][0],
dstView.mapping().blobSize(i),
threadId,
threadCount);
threadCount,
singleThreadMemcpy);
}

namespace internal
{
    /// Default blob copy implementation: forwards to std::memcpy.
    /// Both blobs must expose their storage via operator[] and hold trivially copyable elements;
    /// this is enforced at compile time before the raw byte copy is performed.
    inline constexpr auto copyBlobWithMemcpy = [](const auto& src, auto& dst, std::size_t size)
    {
        using SrcElem = std::remove_reference_t<decltype(*&src[0])>;
        using DstElem = std::remove_reference_t<decltype(*&dst[0])>;
        static_assert(std::is_trivially_copyable_v<SrcElem>);
        static_assert(std::is_trivially_copyable_v<DstElem>);
        std::memcpy(&dst[0], &src[0], size);
    };
} // namespace internal

/// Copies every blob of the source view to the corresponding blob of the destination view. Since both views share
/// the same mapping type, they have the same blob count and identical blob sizes; the copy proceeds one blob at a
/// time.
/// \param copyBlob Callable performing the per-blob copy. Defaults to \ref internal::copyBlobWithMemcpy (i.e.
/// std::memcpy).
LLAMA_EXPORT
template<
    typename Mapping,
    typename SrcBlob,
    typename DstBlob,
    typename BlobCopyFunc = decltype(internal::copyBlobWithMemcpy)>
void copyBlobs(
    const View<Mapping, SrcBlob>& srcView,
    View<Mapping, DstBlob>& dstView,
    BlobCopyFunc copyBlob = internal::copyBlobWithMemcpy)
{
    // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions
    if(srcView.extents() != dstView.extents())
        throw std::runtime_error{"Array dimensions sizes are different"};
    constexpr auto blobs = Mapping::blobCount;
    for(std::size_t blob = 0; blob < blobs; ++blob)
        copyBlob(srcView.blobs()[blob], dstView.blobs()[blob], dstView.mapping().blobSize(blob));
}

/// Field-wise copy from source to destination view. Both views need to have the same array and record dimensions.
Expand Down Expand Up @@ -308,7 +347,8 @@ namespace llama
/// Invokes the blob-wise memcpy strategy for the given thread of a multi-threaded copy.
/// @param threadId Zero-based id of the calling thread.
/// @param threadCount Total number of threads participating in the copy.
template<typename SrcView, typename DstView>
void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
{
    // FIXME(bgruber): need to fallback to fieldWiseCopy when elements are not trivially copyable
    memcpyBlobs(srcView, dstView, threadId, threadCount);
}
};

Expand Down Expand Up @@ -400,8 +440,9 @@ namespace llama
}
};

/// Copy data from source view to destination view. Both views need to have the same array and record
/// dimensions. Delegates to \ref Copy to choose an implementation.
/// Copy data from source to destination view. Both views need to have the same array and record
/// dimensions, but may have different mappings. The blobs need to be read- and writeable. Delegates to \ref Copy
/// to choose an implementation.
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Optional. Thread count in case of multi-threaded invocation.
LLAMA_EXPORT
Expand Down
40 changes: 38 additions & 2 deletions tests/copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "common.hpp"

#include <thread>

namespace
{
using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 2>;
Expand Down Expand Up @@ -92,9 +94,43 @@ TEMPLATE_LIST_TEST_CASE("copy", "", AllMappingsProduct)
}

// NOLINTNEXTLINE(cert-err58-cpp)
// Sanity check: memcpyBlobs with default arguments (single-threaded) should make the
// destination view equal to the source view, as verified by the testCopy helper.
TEMPLATE_LIST_TEST_CASE("memcpyBlobs_default", "", AllMappings)
{
    testCopy<TestType, TestType>([](const auto& srcView, auto& dstView) { llama::memcpyBlobs(srcView, dstView); });
}

// NOLINTNEXTLINE(cert-err58-cpp)
// Exercises the multi-threaded path of memcpyBlobs: three threads each call the copy with
// their zero-based id and a thread count of 3, and together are expected to cover all blobs.
TEMPLATE_LIST_TEST_CASE("memcpyBlobs_3threads", "", AllMappings)
{
testCopy<TestType, TestType>(
[](const auto& srcView, auto& dstView)
{
// each thread copies its own share; join all before testCopy verifies the result
std::thread t1{[&] { llama::memcpyBlobs(srcView, dstView, 0, 3); }};
std::thread t2{[&] { llama::memcpyBlobs(srcView, dstView, 1, 3); }};
std::thread t3{[&] { llama::memcpyBlobs(srcView, dstView, 2, 3); }};
t1.join();
t2.join();
t3.join();
});
}

// NOLINTNEXTLINE(cert-err58-cpp)
TEMPLATE_LIST_TEST_CASE("copyBlobs_default", "", AllMappings)
{
    // copyBlobs with its default blob copy function; testCopy verifies src == dst afterwards
    testCopy<TestType, TestType>(
        [](const auto& src, auto& dst)
        {
            llama::copyBlobs(src, dst);
        });
}

// NOLINTNEXTLINE(cert-err58-cpp)
// Verifies that copyBlobs accepts a user-provided blob copy function by supplying one
// based on std::copy instead of the default std::memcpy implementation.
TEMPLATE_LIST_TEST_CASE("copyBlobs_stdcopy", "", AllMappings)
{
    testCopy<TestType, TestType>(
        [](const auto& srcView, auto& dstView)
        {
            llama::copyBlobs(
                srcView,
                dstView,
                [](const auto& srcBlob, auto& dstBlob, std::size_t size)
                { std::copy(&srcBlob[0], &srcBlob[0] + size, &dstBlob[0]); });
        });
}

// NOLINTNEXTLINE(cert-err58-cpp)
Expand Down

0 comments on commit 26bfa0a

Please sign in to comment.