Skip to content

Commit

Permalink
Simplify copy implementation
Browse files (browse the repository at this point in the history)
* Hardcode readOpt heuristic
* Drop readOpt parameter from aosoaCommonBlockCopy interface
* Remove aosoaCommonBlockCopy runs from viewcopy
  • Loading branch information
bernhardmgruber committed Feb 2, 2024
1 parent 04df063 commit 942680d
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 155 deletions.
66 changes: 5 additions & 61 deletions examples/viewcopy/viewcopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,10 +278,9 @@ set ylabel "Throughput [GiB/s]"

plotFile << R"plot(
$data << EOD
"src layout" "dst layout" "naive copy" "naive copy_sem" "std::copy" "std::copy_sem" "aosoa copy(r)" "aosoa copy(r)_sem" "aosoa copy(w)" "aosoa copy(w)_sem" "LLAMA copy" "LLAMA copy_sem")plot";
"src layout" "dst layout" "naive copy" "naive copy_sem" "std::copy" "std::copy_sem" "LLAMA copy" "LLAMA copy_sem")plot";
if constexpr(runParallelVersions)
plotFile
<< R"plot( "naive copy(p)" "naive copy(p)_sem" "aosoa copy(r,p)" "aosoa copy(r,p)_sem" "aosoa copy(w,p)" "aosoa copy(w,p)_sem" "LLAMA copy(p)" "LLAMA copy_sem(p)")plot";
plotFile << R"plot( "naive copy(p)" "naive copy(p)_sem" "LLAMA copy(p)" "LLAMA copy_sem(p)")plot";
plotFile << '\n';

auto benchmarkAllCopies = [&](std::string_view srcName, std::string_view dstName, auto srcMapping, auto dstMapping)
Expand Down Expand Up @@ -317,23 +316,6 @@ set ylabel "Throughput [GiB/s]"
[](const auto& srcView, auto& dstView) { llama::fieldWiseCopy(srcView, dstView); });
benchmarkCopy("std::copy", [](const auto& srcView, auto& dstView) { stdCopy(srcView, dstView); });
using namespace llama::mapping;
constexpr auto hasCommonBlockCopy = (isAoSoA<decltype(srcMapping)> && isAoSoA<decltype(dstMapping)>)
|| (isAoSoA<decltype(srcMapping)> && isSoA<decltype(dstMapping)>)
|| (isSoA<decltype(srcMapping)> && isAoSoA<decltype(dstMapping)>);
if constexpr(hasCommonBlockCopy)
{
benchmarkCopy(
"aosoa copy(r)",
[](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, true); });
benchmarkCopy(
"aosoa copy(w)",
[](const auto& srcView, auto& dstView) { llama::aosoaCommonBlockCopy(srcView, dstView, false); });
}
else
{
for(int i = 0; i < 2 * 2; i++)
plotFile << "0\t";
}
benchmarkCopy("llama", [&](const auto& srcView, auto& dstView) { llama::copy(srcView, dstView); });

if constexpr(runParallelVersions)
Expand All @@ -346,40 +328,6 @@ set ylabel "Throughput [GiB/s]"
// NOLINTNEXTLINE(openmp-exception-escape)
llama::fieldWiseCopy(srcView, dstView, omp_get_thread_num(), omp_get_num_threads());
});
if constexpr(hasCommonBlockCopy)
{
benchmarkCopy(
"aosoa_copy(r,p)",
[&](const auto& srcView, auto& dstView)
{
#pragma omp parallel
// NOLINTNEXTLINE(openmp-exception-escape)
llama::aosoaCommonBlockCopy(
srcView,
dstView,
true,
omp_get_thread_num(),
omp_get_num_threads());
});
benchmarkCopy(
"aosoa_copy(w,p)",
[&](const auto& srcView, auto& dstView)
{
#pragma omp parallel
// NOLINTNEXTLINE(openmp-exception-escape)
llama::aosoaCommonBlockCopy(
srcView,
dstView,
false,
omp_get_thread_num(),
omp_get_num_threads());
});
}
else
{
for(int i = 0; i < 2 * 2; i++)
plotFile << "0\t";
}
benchmarkCopy(
"llama(p)",
[&](const auto& srcView, auto& dstView)
Expand Down Expand Up @@ -419,15 +367,11 @@ set ylabel "Throughput [GiB/s]"
plotFile << R"(EOD
plot $data using 3: 4:xtic(sprintf("%s -> %s", stringcolumn(1), stringcolumn(2))) ti col, \
"" using 5: 6 ti col, \
"" using 7: 8 ti col, \
"" using 9:10 ti col, \
"" using 11:12 ti col)";
"" using 7: 8 ti col)";
if constexpr(runParallelVersions)
plotFile << R"(, \
"" using 13:14 ti col, \
"" using 15:16 ti col, \
"" using 17:18 ti col, \
"" using 19:20 ti col)";
"" using 9:10 ti col, \
"" using 11:12 ti col)";
plotFile << '\n';
fmt::print("Plot with: ./viewcopy.sh\n");
}
Expand Down
133 changes: 50 additions & 83 deletions include/llama/Copy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,6 @@ namespace llama
void aosoaCommonBlockCopy(
const View<SrcMapping, SrcBlob>& srcView,
View<DstMapping, DstBlob>& dstView,
bool readOpt,
std::size_t threadId = 0,
std::size_t threadCount = 1)
{
Expand Down Expand Up @@ -253,11 +252,12 @@ namespace llama
return std::gcd(lanesSrc, lanesDst);
return std::min(lanesSrc, lanesDst);
}();
if(readOpt)
if constexpr(lanesSrc < lanesDst)
{
static_assert(srcIsAoSoA);

// optimized for linear reading
constexpr auto srcL = srcIsAoSoA ? lanesSrc : l;
const auto elementsPerThread = flatSize / srcL / threadCount * srcL;
const auto elementsPerThread = flatSize / lanesSrc / threadCount * lanesSrc;
const auto start = threadId * elementsPerThread;
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;

Expand All @@ -269,56 +269,41 @@ namespace llama
};

// if the AoSoA is packed we can move the src pointer along
if constexpr(srcIsAoSoA)
if constexpr(SrcMapping::fieldAlignment == mapping::FieldAlignment::Pack)
{
if constexpr(SrcMapping::fieldAlignment == mapping::FieldAlignment::Pack)
{
auto* threadSrc = mapSrc(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += lanesSrc)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < lanesSrc; j += l)
{
assert(threadSrc == mapSrc(i + j, rc));
copyLBlock(threadSrc, i + j, rc);
}
});
}
else
{
for(std::size_t i = start; i < stop; i += lanesSrc)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
auto* threadSrc = mapSrc(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += lanesSrc)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < lanesSrc; j += l)
{
auto* threadSrc = mapSrc(i, rc);
for(std::size_t j = 0; j < lanesSrc; j += l)
{
assert(threadSrc == mapSrc(i + j, rc));
copyLBlock(threadSrc, i + j, rc);
}
});
}
assert(threadSrc == mapSrc(i + j, rc));
copyLBlock(threadSrc, i + j, rc);
}
});
}
else
{
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t i = start; i < stop; i += l)
for(std::size_t i = start; i < stop; i += lanesSrc)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
auto* threadSrc = mapSrc(i, rc);
copyLBlock(threadSrc, i, rc);
}
});
for(std::size_t j = 0; j < lanesSrc; j += l)
{
assert(threadSrc == mapSrc(i + j, rc));
copyLBlock(threadSrc, i + j, rc);
}
});
}
}
else
{
// optimized for linear writing
constexpr auto dstL = dstIsAoSoA ? lanesDst : l;
const auto elementsPerThread = flatSize / dstL / threadCount * dstL;
static_assert(dstIsAoSoA);

// optimized for linear writing
const auto elementsPerThread = flatSize / lanesDst / threadCount * lanesDst;
const auto start = threadId * elementsPerThread;
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;

Expand All @@ -330,48 +315,33 @@ namespace llama
};

// if the AoSoA is packed we can move the dst pointer along
if constexpr(dstIsAoSoA)
if constexpr(DstMapping::fieldAlignment == mapping::FieldAlignment::Pack)
{
if constexpr(DstMapping::fieldAlignment == mapping::FieldAlignment::Pack)
{
auto* threadDst = mapDst(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += lanesDst)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < lanesDst; j += l)
{
assert(threadDst == mapDst(i + j, rc));
copyLBlock(threadDst, i + j, rc);
}
});
}
else
{
for(std::size_t i = start; i < stop; i += lanesDst)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
auto* threadDst = mapDst(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += lanesDst)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < lanesDst; j += l)
{
auto* threadDst = mapDst(i, rc);
for(std::size_t j = 0; j < lanesDst; j += l)
{
assert(threadDst == mapDst(i + j, rc));
copyLBlock(threadDst, i + j, rc);
}
});
}
assert(threadDst == mapDst(i + j, rc));
copyLBlock(threadDst, i + j, rc);
}
});
}
else
{
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t i = start; i < stop; i += l)
for(std::size_t i = start; i < stop; i += lanesDst)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
auto* threadDst = mapDst(i, rc);
copyLBlock(threadDst, i, rc);
}
});
for(std::size_t j = 0; j < lanesDst; j += l)
{
assert(threadDst == mapDst(i + j, rc));
copyLBlock(threadDst, i + j, rc);
}
});
}
}
}
Expand Down Expand Up @@ -430,8 +400,7 @@ namespace llama
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = LanesSrc < LanesDst; // read contiguously on the AoSoA with the smaller lane count
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
}
};

Expand Down Expand Up @@ -462,8 +431,7 @@ namespace llama
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = true; // read contiguously on the AoSoA
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
}
};

Expand Down Expand Up @@ -494,8 +462,7 @@ namespace llama
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = false; // read contiguously on the AoSoA
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
}
};

Expand Down
13 changes: 2 additions & 11 deletions tests/copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,19 +153,10 @@ TEMPLATE_LIST_TEST_CASE("fieldWiseCopy", "", AllMappingsProduct)
}

// NOLINTNEXTLINE(cert-err58-cpp)
TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.readOpt", "", AoSoAMappingsProduct)
TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy", "", AoSoAMappingsProduct)
{
using SrcMapping = mp_first<TestType>;
using DstMapping = mp_second<TestType>;
testCopy<SrcMapping, DstMapping>([](const auto& srcView, auto& dstView)
{ llama::aosoaCommonBlockCopy(srcView, dstView, true); });
}

// NOLINTNEXTLINE(cert-err58-cpp)
TEMPLATE_LIST_TEST_CASE("aosoaCommonBlockCopy.writeOpt", "", AoSoAMappingsProduct)
{
using SrcMapping = mp_first<TestType>;
using DstMapping = mp_second<TestType>;
testCopy<SrcMapping, DstMapping>([](const auto& srcView, auto& dstView)
{ llama::aosoaCommonBlockCopy(srcView, dstView, false); });
{ llama::aosoaCommonBlockCopy(srcView, dstView); });
}

0 comments on commit 942680d

Please sign in to comment.