Add GPU Gems layout to alpaka n-body
bernhardmgruber committed Nov 16, 2023
1 parent 5a07375 commit 93c1e9f
Showing 2 changed files with 64 additions and 9 deletions.
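For context, the "GPU Gems" layout referenced in the commit title keeps each particle's position and mass together in one 16-byte element and pads the velocity to the same width. A minimal sketch of the layout the new mapping reproduces (plain C++, names hypothetical, assuming FP = float):

// Sketch only: the per-particle element of each of the two streams in the GPU Gems layout.
struct PosMass { float x, y, z, mass; };      // stream 1: position with the mass in the 4th lane
struct VelPadded { float x, y, z, padding; }; // stream 2: velocity padded to float4 width
static_assert(sizeof(PosMass) == 16 && sizeof(VelPadded) == 16);
// The n-body data then consists of one PosMass array and one VelPadded array.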
31 changes: 30 additions & 1 deletion examples/alpaka/nbody/nbody.cpp
@@ -96,6 +96,7 @@ namespace tag
struct Y{};
struct Z{};
struct Mass{};
struct Padding{};
} // namespace tag

using Vec3 = llama::Record<
@@ -120,7 +121,8 @@ enum Mapping
AoS,
SoA_SB,
SoA_MB,
AoSoA
AoSoA,
SplitGpuGems
};

template<typename Acc, typename ParticleRefI, typename ParticleRefJ>
@@ -229,6 +231,8 @@ void run(std::ostream& plotFile)
return "SoA MB";
if(m == 3)
return "AoSoA" + std::to_string(aosoaLanes);
if(m == 4)
return "SplitGpuGems";
std::abort();
};
const auto title = "GM " + mappingName(MappingGM) + " SM " + mappingName(MappingSM);
@@ -252,7 +256,31 @@ void run(std::ostream& plotFile)
return llama::mapping::SoA<ArrayExtents, Particle, llama::mapping::Blobs::OnePerField>{extents};
if constexpr(MappingGM == AoSoA)
return llama::mapping::AoSoA<ArrayExtents, Particle, aosoaLanes>{extents};
using boost::mp11::mp_list;
if constexpr(MappingGM == SplitGpuGems)
{
using Vec4 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>,
llama::Field<tag::Padding, FP>>; // dummy field padding Vel to 4 * sizeof(FP), as in the GPU Gems layout
using ParticlePadded = llama::
Record<llama::Field<tag::Pos, Vec3>, llama::Field<tag::Vel, Vec4>, llama::Field<tag::Mass, FP>>;
return llama::mapping::Split<
ArrayExtents,
ParticlePadded,
mp_list<
mp_list<tag::Pos, tag::X>,
mp_list<tag::Pos, tag::Y>,
mp_list<tag::Pos, tag::Z>,
mp_list<tag::Mass>>,
llama::mapping::BindAoS<>::fn,
llama::mapping::BindAoS<>::fn,
true>{extents};
}
}();
std::ofstream{"nbody_alpaka_mapping_" + mappingName(MappingGM) + ".svg"}
<< llama::toSvg(decltype(mapping){llama::ArrayExtentsDynamic<int, 1>{32}});
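A rough usage sketch (not part of the commit) of how a view over the new SplitGpuGems mapping could be filled; llama::allocView, the tag-based accessors, and the name problemSize for the example's particle count are assumptions based on the rest of the example:

// Assumption: llama::allocView(mapping) yields a view with two blobs,
// {Pos.X, Pos.Y, Pos.Z, Mass} as AoS and the padded Vel record as AoS.
auto view = llama::allocView(mapping);
for(int i = 0; i < problemSize; i++)
{
    view(i)(tag::Pos{}, tag::X{}) = FP{0};
    view(i)(tag::Vel{}, tag::Y{}) = FP{0};
    view(i)(tag::Mass{}) = FP{1};
    // tag::Padding is never read or written by the kernels; it only widens Vel to 4 * sizeof(FP).
}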

Stopwatch watch;

@@ -351,6 +379,7 @@ set y2tics auto
run<alpaka::ExampleDefaultAcc, AoSoA, AoS>(plotFile);
run<alpaka::ExampleDefaultAcc, AoSoA, SoA_SB>(plotFile);
run<alpaka::ExampleDefaultAcc, AoSoA, AoSoA>(plotFile);
run<alpaka::ExampleDefaultAcc, SplitGpuGems, AoS>(plotFile);

plotFile << R"(EOD
plot $data using 2:xtic(1) ti col axis x1y1, "" using 3 ti col axis x1y2
42 changes: 34 additions & 8 deletions examples/cuda/nbody/nbody.cu
@@ -53,6 +53,7 @@ namespace tag
struct Y{};
struct Z{};
struct Mass{};
struct Padding{};
} // namespace tag

using Vec3 = llama::Record<
@@ -177,7 +178,9 @@ try
if(m == 4)
return "Split SoA";
if(m == 5)
return "Split AoS";
return "Split AoS"; // similar to GPU Gems, but no padding float in velocity
if(m == 6)
return "SplitGpuGems";
std::abort();
};
auto title = "GM " + mappingName(Mapping);
@@ -210,11 +213,34 @@ try
ArrayExtents,
Particle,
llama::RecordCoord<1>,
llama::mapping::BindSoA<>::fn,
llama::mapping::BindSoA<>::fn,

llama::mapping::BindAoS<>::fn,
llama::mapping::BindAoS<>::fn,
true>{extents};
if constexpr(Mapping == 6)
{
using boost::mp11::mp_list;
using Vec4 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>,
llama::Field<tag::Padding, FP>>; // dummy field padding Vel to 4 * sizeof(FP), as in the GPU Gems layout
using ParticlePadded = llama::
Record<llama::Field<tag::Pos, Vec3>, llama::Field<tag::Vel, Vec4>, llama::Field<tag::Mass, FP>>;
return llama::mapping::Split<
ArrayExtents,
ParticlePadded,
mp_list<
mp_list<tag::Pos, tag::X>,
mp_list<tag::Pos, tag::Y>,
mp_list<tag::Pos, tag::Z>,
mp_list<tag::Mass>>,
llama::mapping::BindAoS<>::fn,
llama::mapping::BindAoS<>::fn,
true>{extents};
}
}();
std::ofstream{"nbody_cuda_mapping_" + mappingName(Mapping) + ".svg"}
<< llama::toSvg(decltype(mapping){llama::ArrayExtentsDynamic<int, 1>{32}});
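For orientation, with FP = float the split above produces two AoS blobs whose per-particle layout matches the two float4 streams of the GPU Gems kernel (a sketch, assuming sizeof(float) == 4):

// Blob 0 (selected fields Pos.X, Pos.Y, Pos.Z, Mass), AoS, 16 bytes per particle:
//   | Pos.X | Pos.Y | Pos.Z | Mass |
// Blob 1 (remaining fields, i.e. the padded Vel record), AoS, 16 bytes per particle:
//   | Vel.X | Vel.Y | Vel.Z | Padding |
static_assert(4 * sizeof(float) == sizeof(float4)); // the stride the GPU Gems kernel relies on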
auto tmapping = [&]
{
if constexpr(countFieldAccesses)
@@ -344,7 +370,7 @@ catch(const std::exception& e)
// based on:
// https://developer.nvidia.com/gpugems/gpugems3/part-v-physics-simulation/chapter-31-fast-n-body-simulation-cuda
// The original GPU Gems implementation uses THREADS_PER_BLOCK == SHARED_ELEMENTS_PER_BLOCK
namespace manual
namespace gpugems
{
using FP3 = std::conditional_t<std::is_same_v<FP, float>, float3, double3>;
using FP4 = std::conditional_t<std::is_same_v<FP, float>, float4, double4>;
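For comparison, a hedged sketch of the body-body interaction from the GPU Gems 3 chapter linked above, which the renamed gpugems namespace mirrors; the name and the explicit eps2 softening parameter are illustrative, not taken from the commit:

// Sketch of the classic GPU Gems 3 (ch. 31) interaction: bj.w carries the mass,
// so a single float4 load fetches position and mass together.
__device__ auto bodyBodyInteraction(float4 bi, float4 bj, float3 ai, float eps2) -> float3
{
    const float3 r{bj.x - bi.x, bj.y - bi.y, bj.z - bi.z};
    const float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + eps2; // softened squared distance
    const float invDist = rsqrtf(distSqr);
    const float invDistCube = invDist * invDist * invDist;
    const float s = bj.w * invDistCube; // mass / dist^3
    return {ai.x + r.x * s, ai.y + r.y * s, ai.z + r.z * s};
}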
@@ -493,15 +519,15 @@ namespace manual
{
std::cerr << "Exception: " << e.what() << std::endl;
}
} // namespace manual
} // namespace gpugems

auto main() -> int
try
{
std::cout << problemSize / 1024 << "ki particles (" << problemSize * llama::sizeOf<Particle> / 1024 << "kiB)\n"
<< "Caching " << sharedElementsPerBlock << " particles ("
<< sharedElementsPerBlock * llama::sizeOf<SharedMemoryParticle> / 1024 << " kiB) in shared memory\n"
<< "Using " << threadsPerBlock << " per block\n";
<< "Using " << threadsPerBlock << " threads per block\n";
int device = 0;
cudaGetDevice(&device);
cudaDeviceProp prop{};
Expand Down Expand Up @@ -538,7 +564,7 @@ $data << EOD
mp_for_each<mp_iota_c<6>>(
[&](auto i)
{ mp_for_each<mp_iota_c<4>>([&](auto j) { run<decltype(i)::value, decltype(j)::value>(plotFile, true); }); });
manual::run(plotFile);
gpugems::run(plotFile);

plotFile <<
R"(EOD
