diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 2ff2dc1381..e3a0127948 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -96,6 +96,7 @@ namespace tag struct Y{}; struct Z{}; struct Mass{}; + struct Padding{}; } // namespace tag using Vec3 = llama::Record< @@ -120,7 +121,8 @@ enum Mapping AoS, SoA_SB, SoA_MB, - AoSoA + AoSoA, + SplitGpuGems }; template @@ -229,6 +231,8 @@ void run(std::ostream& plotFile) return "SoA MB"; if(m == 3) return "AoSoA" + std::to_string(aosoaLanes); + if(m == 4) + return "SplitGpuGems"; std::abort(); }; const auto title = "GM " + mappingName(MappingGM) + " SM " + mappingName(MappingSM); @@ -252,7 +256,31 @@ void run(std::ostream& plotFile) return llama::mapping::SoA{extents}; if constexpr(MappingGM == AoSoA) return llama::mapping::AoSoA{extents}; + using boost::mp11::mp_list; + if constexpr(MappingGM == SplitGpuGems) + { + using Vec4 = llama::Record< + llama::Field, + llama::Field, + llama::Field, + llama::Field>; // dummy + using ParticlePadded = llama:: + Record, llama::Field, llama::Field>; + return llama::mapping::Split< + ArrayExtents, + ParticlePadded, + mp_list< + mp_list, + mp_list, + mp_list, + mp_list>, + llama::mapping::BindAoS<>::fn, + llama::mapping::BindAoS<>::fn, + true>{extents}; + } }(); + std::ofstream{"nbody_alpaka_mapping_" + mappingName(MappingGM) + ".svg"} + << llama::toSvg(decltype(mapping){llama::ArrayExtentsDynamic{32}}); Stopwatch watch; @@ -351,6 +379,7 @@ set y2tics auto run(plotFile); run(plotFile); run(plotFile); + run(plotFile); plotFile << R"(EOD plot $data using 2:xtic(1) ti col axis x1y1, "" using 3 ti col axis x1y2 diff --git a/examples/cuda/nbody/nbody.cu b/examples/cuda/nbody/nbody.cu index 25bd9d8af6..0b53cf3c18 100644 --- a/examples/cuda/nbody/nbody.cu +++ b/examples/cuda/nbody/nbody.cu @@ -53,6 +53,7 @@ namespace tag struct Y{}; struct Z{}; struct Mass{}; + struct Padding{}; } // namespace tag using Vec3 = llama::Record< @@ -177,7 +178,9 @@ try if(m == 4) return "Split SoA"; if(m == 5) - return "Split AoS"; + return "Split AoS"; // similar to GPU Gems, but no padding float in velocity + if(m == 6) + return "SplitGpuGems"; std::abort(); }; auto title = "GM " + mappingName(Mapping); @@ -210,11 +213,34 @@ try ArrayExtents, Particle, llama::RecordCoord<1>, - llama::mapping::BindSoA<>::fn, - llama::mapping::BindSoA<>::fn, - + llama::mapping::BindAoS<>::fn, + llama::mapping::BindAoS<>::fn, + true>{extents}; + if constexpr(Mapping == 6) + { + using boost::mp11::mp_list; + using Vec4 = llama::Record< + llama::Field, + llama::Field, + llama::Field, + llama::Field>; // dummy + using ParticlePadded = llama:: + Record, llama::Field, llama::Field>; + return llama::mapping::Split< + ArrayExtents, + ParticlePadded, + mp_list< + mp_list, + mp_list, + mp_list, + mp_list>, + llama::mapping::BindAoS<>::fn, + llama::mapping::BindAoS<>::fn, true>{extents}; + } }(); + std::ofstream{"nbody_cuda_mapping_" + mappingName(Mapping) + ".svg"} + << llama::toSvg(decltype(mapping){llama::ArrayExtentsDynamic{32}}); auto tmapping = [&] { if constexpr(countFieldAccesses) @@ -344,7 +370,7 @@ catch(const std::exception& e) // based on: // https://developer.nvidia.com/gpugems/gpugems3/part-v-physics-simulation/chapter-31-fast-n-body-simulation-cuda // The original GPU gems implementation is with THREADS_PER_BLOCK == SHARED_ELEMENTS_PER_BLOCK -namespace manual +namespace gpugems { using FP3 = std::conditional_t, float3, double3>; using FP4 = std::conditional_t, float4, double4>; @@ -493,7 +519,7 @@ namespace manual { std::cerr << "Exception: " << e.what() << std::endl; } -} // namespace manual +} // namespace gpugems auto main() -> int try @@ -501,7 +527,7 @@ try std::cout << problemSize / 1024 << "ki particles (" << problemSize * llama::sizeOf / 1024 << "kiB)\n" << "Caching " << sharedElementsPerBlock << " particles (" << sharedElementsPerBlock * llama::sizeOf / 1024 << " kiB) in shared memory\n" - << "Using " << threadsPerBlock << " per block\n"; + << "Using " << threadsPerBlock << " threads per block\n"; int device = 0; cudaGetDevice(&device); cudaDeviceProp prop{}; @@ -538,7 +564,7 @@ $data << EOD mp_for_each>( [&](auto i) { mp_for_each>([&](auto j) { run(plotFile, true); }); }); - manual::run(plotFile); + gpugems::run(plotFile); plotFile << R"(EOD