From e44b23e5ca87f0cc3085c88d93b2a313ff888c88 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 2 Jan 2024 00:23:31 +0100 Subject: [PATCH] Automatically run LLAMA SIMD versions with 1 lane --- examples/nbody/nbody.cpp | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp index 9e8d3d620d..a0f51515c3 100644 --- a/examples/nbody/nbody.cpp +++ b/examples/nbody/nbody.cpp @@ -36,7 +36,6 @@ constexpr auto dumpMapping = false; constexpr auto allowRsqrt = false; // rsqrt can be way faster, but less accurate constexpr auto newtonRaphsonAfterRsqrt = true; // generate a newton raphson refinement after explicit calls to rsqrt() constexpr auto runUpdate = true; // run update step. Useful to disable for benchmarking the move step. -constexpr auto llamaSimdLanes = -1; // SIMD lanes to use for LLAMA, -1 lets LLAMA choose constexpr auto timestep = FP{0.0001}; constexpr auto eps2 = FP{0.01}; @@ -281,7 +280,8 @@ namespace usellama particles(i)(tag::Pos{}) += particles(i)(tag::Vel{}) * timestep; } - template + /// @tparam SIMDLanes 0: no SIMD, otherwise use the value as SIMD width + template auto main(std::ostream& plotFile) -> Vec3 { auto mappingName = [](int m) -> std::string @@ -304,7 +304,7 @@ namespace usellama return "BitPack SoA 11e4"; std::abort(); }; - auto title = "LLAMA " + mappingName(Mapping) + (UseSimd ? " SIMD" : ""); + auto title = "LLAMA " + mappingName(Mapping) + (SIMDLanes == 0 ? "" : " SIMD W=" + std::to_string(SIMDLanes)); std::cout << title << "\n"; Stopwatch watch; auto mapping = [&] @@ -378,23 +378,19 @@ namespace usellama common::Stats statsMove; for(std::size_t s = 0; s < steps + 1; ++s) { -#ifdef HAVE_XSIMD - constexpr auto width - = llamaSimdLanes == -1 ? llama::simdLanesWithFullVectorsFor : llamaSimdLanes; -#endif if constexpr(runUpdate) { #ifdef HAVE_XSIMD - if constexpr(UseSimd) - updateSimd(particles); + if constexpr(SIMDLanes != 0) + updateSimd(particles); else #endif update(particles); statsUpdate(watch.printAndReset("update", '\t')); } #ifdef HAVE_XSIMD - if constexpr(UseSimd) - moveSimd(particles); + if constexpr(SIMDLanes != 0) + moveSimd(particles); else #endif move(particles); @@ -1591,6 +1587,10 @@ set y2tics auto // SIMD versions updating 8 particles by 1 are also a bit faster than updating 1 particle by 8, so the latter are // also disabled. +#ifdef HAVE_XSIMD + static constexpr auto nativeSimdWidth = llama::simdLanesWithFullVectorsFor; +#endif + std::vector finalPositions; using namespace boost::mp11; mp_for_each>( @@ -1600,12 +1600,17 @@ set y2tics auto // only AoSoA (3) needs lanes using Lanes = std::conditional_t, mp_list_c>; mp_for_each( - [&](auto lanes) + [&](auto aosoaLanesIc) { - finalPositions.push_back(usellama::main(plotFile)); + static constexpr int aosoaLanes = decltype(aosoaLanesIc)::value; + finalPositions.push_back(usellama::main(plotFile)); #ifdef HAVE_XSIMD - if constexpr(i < 5) // TODO(bgruber): simd does not work with proxy references yet - finalPositions.push_back(usellama::main(plotFile)); + // TODO(bgruber): simd does not work with proxy references yet + if constexpr(i < 5) + { + finalPositions.push_back(usellama::main(plotFile)); + finalPositions.push_back(usellama::main(plotFile)); + } #endif }); });