From 2b94b4b1affd4fec6fccdcb0efc2e2386fbb3681 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 11 Jan 2024 15:37:38 +0100 Subject: [PATCH] Add standalone n-body SoA and AoSoA for code comparison --- examples/nbody_code_comp/CMakeLists.txt | 14 ++- ...dy_baseline.cpp => nbody-AoS-baseline.cpp} | 44 +++----- examples/nbody_code_comp/nbody-AoSoA.cpp | 103 ++++++++++++++++++ examples/nbody_code_comp/nbody-SoA.cpp | 97 +++++++++++++++++ .../{nbody_ported.cpp => nbody-ported.cpp} | 15 ++- 5 files changed, 238 insertions(+), 35 deletions(-) rename examples/nbody_code_comp/{nbody_baseline.cpp => nbody-AoS-baseline.cpp} (69%) create mode 100644 examples/nbody_code_comp/nbody-AoSoA.cpp create mode 100644 examples/nbody_code_comp/nbody-SoA.cpp rename examples/nbody_code_comp/{nbody_ported.cpp => nbody-ported.cpp} (89%) diff --git a/examples/nbody_code_comp/CMakeLists.txt b/examples/nbody_code_comp/CMakeLists.txt index 3dead4d77b..9818e1afe7 100644 --- a/examples/nbody_code_comp/CMakeLists.txt +++ b/examples/nbody_code_comp/CMakeLists.txt @@ -3,14 +3,22 @@ cmake_minimum_required (VERSION 3.18.3) -project(llama-nbody-baseline CXX) -add_executable(${PROJECT_NAME} nbody_baseline.cpp) +project(llama-nbody-aos-baseline CXX) +add_executable(${PROJECT_NAME} nbody-AoS-baseline.cpp) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20) + +project(llama-nbody-soa CXX) +add_executable(${PROJECT_NAME} nbody-SoA.cpp) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20) + +project(llama-nbody-aosoa CXX) +add_executable(${PROJECT_NAME} nbody-AoSoA.cpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20) project(llama-nbody-ported CXX) if (NOT TARGET llama::llama) find_package(llama CONFIG REQUIRED) endif() -add_executable(${PROJECT_NAME} nbody_ported.cpp) +add_executable(${PROJECT_NAME} nbody-ported.cpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama) diff --git a/examples/nbody_code_comp/nbody_baseline.cpp b/examples/nbody_code_comp/nbody-AoS-baseline.cpp similarity index 69% rename from examples/nbody_code_comp/nbody_baseline.cpp rename to examples/nbody_code_comp/nbody-AoS-baseline.cpp index 1c5e029cb9..d4cf442cd8 100644 --- a/examples/nbody_code_comp/nbody_baseline.cpp +++ b/examples/nbody_code_comp/nbody-AoS-baseline.cpp @@ -13,50 +13,31 @@ constexpr int steps = 5, problemSize = 64 * 1024; struct Vec3 { FP x, y, z; - - auto operator*=(Vec3 v) -> Vec3& { - x *= v.x; - y *= v.y; - z *= v.z; - return *this; - } - - auto operator+=(Vec3 v) -> Vec3& { - x += v.x; - y += v.y; - z += v.z; - return *this; - } - - friend auto operator-(Vec3 a, Vec3 b) -> Vec3 { - return Vec3{a.x - b.x, a.y - b.y, a.z - b.z}; - } - - friend auto operator*(Vec3 a, FP s) -> Vec3 { - return Vec3{a.x * s, a.y * s, a.z * s}; - } }; - struct Particle { Vec3 pos, vel; FP mass; }; inline void pPInteraction(Particle& pi, const Particle& pj) { - auto dist = pi.pos - pj.pos; - dist *= dist; + auto dist = Vec3{pi.pos.x - pj.pos.x, pi.pos.y - pj.pos.y, pi.pos.z - pj.pos.z}; + dist.x *= dist.x; + dist.y *= dist.y; + dist.z *= dist.z; const auto distSqr = eps2 + dist.x + dist.y + dist.z; const auto distSixth = distSqr * distSqr * distSqr; const auto invDistCube = FP{1} / std::sqrt(distSixth); const auto sts = pj.mass * timestep * invDistCube; - pi.vel += dist * sts; + pi.vel.x += dist.x * sts; + pi.vel.y += dist.y * sts; + pi.vel.z += dist.z * sts; } void update(std::span particles) { #pragma GCC ivdep for(int i = 0; i < problemSize; i++) { Particle pi = particles[i]; - for(std::size_t j = 0; j < problemSize; ++j) + for(int j = 0; j < problemSize; ++j) pPInteraction(pi, particles[j]); particles[i].vel = pi.vel; } @@ -64,12 +45,16 @@ void update(std::span particles) { void move(std::span particles) { #pragma GCC ivdep - for(int i = 0; i < problemSize; i++) - particles[i].pos += particles[i].vel * timestep; + for(int i = 0; i < problemSize; i++) { + particles[i].pos.x += particles[i].vel.x * timestep; + particles[i].pos.y += particles[i].vel.y * timestep; + particles[i].pos.z += particles[i].vel.z * timestep; + } } auto main() -> int { auto particles = std::vector(problemSize); + std::default_random_engine engine; std::normal_distribution dist(FP{0}, FP{1}); for(auto& p : particles) { @@ -86,5 +71,6 @@ auto main() -> int { update(particles); ::move(particles); } + return 0; } diff --git a/examples/nbody_code_comp/nbody-AoSoA.cpp b/examples/nbody_code_comp/nbody-AoSoA.cpp new file mode 100644 index 0000000000..3be4b7d3d2 --- /dev/null +++ b/examples/nbody_code_comp/nbody-AoSoA.cpp @@ -0,0 +1,103 @@ +// Copyright 2024 Bernhard Manfred Gruber +// SPDX-License-Identifier: MPL-2.0 + +// clang-format off + +#include +#include +#include + +using FP = float; +constexpr FP timestep = 0.0001f, eps2 = 0.01f; +constexpr int steps = 5, problemSize = 64 * 1024; + +constexpr auto lanes = 16; +constexpr auto blocks = problemSize / lanes; + +struct alignas(lanes * sizeof(FP)) Vec3Block { + FP x[lanes]; + FP y[lanes]; + FP z[lanes]; +}; +struct alignas(lanes * sizeof(FP)) ParticleBlock { + Vec3Block pos, vel; + FP mass[lanes]; +}; + +inline void pPInteraction(FP piposx, FP piposy, FP piposz, FP& pivelx, FP& pively, FP& pivelz, + FP pjposx, FP pjposy, FP pjposz, FP pjmass) { + auto xdist = piposx - pjposx; + auto ydist = piposy - pjposy; + auto zdist = piposz - pjposz; + xdist *= xdist; + ydist *= ydist; + zdist *= zdist; + const auto distSqr = eps2 + xdist + ydist + zdist; + const auto distSixth = distSqr * distSqr * distSqr; + const auto invDistCube = FP{1} / std::sqrt(distSixth); + const auto sts = pjmass * timestep * invDistCube; + pivelx += xdist * sts; + pively += ydist * sts; + pivelz += zdist * sts; +} + +void update(std::span particles) { + for(int bi = 0; bi < blocks; bi++) { + auto blockI = particles[bi]; + for(int bj = 0; bj < blocks; bj++) + for(int j = 0; j < lanes; j++) { +#pragma GCC ivdep + for(int i = 0; i < lanes; i++) { + pPInteraction( + blockI.pos.x[i], + blockI.pos.y[i], + blockI.pos.z[i], + blockI.vel.x[i], + blockI.vel.y[i], + blockI.vel.z[i], + particles[bj].pos.x[j], + particles[bj].pos.y[j], + particles[bj].pos.z[j], + particles[bj].mass[j]); + } + } + + particles[bi].vel = blockI.vel; + } +} + +void move(std::span particles) { + for(int bi = 0; bi < blocks; bi++) { + #pragma GCC ivdep + for(std::size_t i = 0; i < lanes; ++i) { + particles[bi].pos.x[i] += particles[bi].vel.x[i] * timestep; + particles[bi].pos.y[i] += particles[bi].vel.y[i] * timestep; + particles[bi].pos.z[i] += particles[bi].vel.z[i] * timestep; + } + } +} + +auto main() -> int { + auto particles = std::vector(blocks); + + std::default_random_engine engine; + std::normal_distribution dist(FP{0}, FP{1}); + for(int bi = 0; bi < blocks; ++bi) { + for(int i = 0; i < lanes; ++i) { + particles[bi].pos.x[i] = dist(engine); + particles[bi].pos.y[i] = dist(engine); + particles[bi].pos.z[i] = dist(engine); + particles[bi].vel.x[i] = dist(engine) / FP{10}; + particles[bi].vel.y[i] = dist(engine) / FP{10}; + particles[bi].vel.z[i] = dist(engine) / FP{10}; + particles[bi].mass[i] = dist(engine) / FP{100}; + } + } + + for(int s = 0; s < steps; ++s) { + update(particles); + ::move(particles); + } + + return 0; +} diff --git a/examples/nbody_code_comp/nbody-SoA.cpp b/examples/nbody_code_comp/nbody-SoA.cpp new file mode 100644 index 0000000000..ea5e385911 --- /dev/null +++ b/examples/nbody_code_comp/nbody-SoA.cpp @@ -0,0 +1,97 @@ +// Copyright 2024 Bernhard Manfred Gruber +// SPDX-License-Identifier: MPL-2.0 + +// clang-format off + +#include +#include + +using FP = float; +constexpr FP timestep = 0.0001f, eps2 = 0.01f; +constexpr int steps = 5, problemSize = 64 * 1024; + +inline void pPInteraction(FP piposx, FP piposy, FP piposz, FP& pivelx, FP& pively, FP& pivelz, + FP pjposx, FP pjposy, FP pjposz, FP pjmass) { + auto xdist = piposx - pjposx; + auto ydist = piposy - pjposy; + auto zdist = piposz - pjposz; + xdist *= xdist; + ydist *= ydist; + zdist *= zdist; + const auto distSqr = eps2 + xdist + ydist + zdist; + const auto distSixth = distSqr * distSqr * distSqr; + const auto invDistCube = FP{1} / std::sqrt(distSixth); + const auto sts = pjmass * timestep * invDistCube; + pivelx += xdist * sts; + pively += ydist * sts; + pivelz += zdist * sts; +} + +void update(const FP* posx, const FP* posy, const FP* posz, FP* velx, FP* vely, FP* velz, const FP* mass) { +#pragma GCC ivdep + for(int i = 0; i < problemSize; i++) { + const auto piposx = posx[i]; + const auto piposy = posy[i]; + const auto piposz = posz[i]; + auto pivelx = velx[i]; + auto pively = vely[i]; + auto pivelz = velz[i]; + for(int j = 0; j < problemSize; ++j) + pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, posx[j], posy[j], posz[j], mass[j]); + velx[i] = pivelx; + vely[i] = pively; + velz[i] = pivelz; + } +} + +void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz) { +#pragma GCC ivdep + for(int i = 0; i < problemSize; i++) { + posx[i] += velx[i] * timestep; + posy[i] += vely[i] * timestep; + posz[i] += velz[i] * timestep; + } +} + +template +struct AlignedAllocator { + using value_type = T; + + auto allocate(std::size_t n) const -> T* { + return new(std::align_val_t{64}) T[n]; + } + + void deallocate(T* p, std::size_t) const { + ::operator delete[] (p, std::align_val_t{64}); + } +}; + +auto main() -> int { + using Vector = std::vector>; + auto posx = Vector(problemSize); + auto posy = Vector(problemSize); + auto posz = Vector(problemSize); + auto velx = Vector(problemSize); + auto vely = Vector(problemSize); + auto velz = Vector(problemSize); + auto mass = Vector(problemSize); + + std::default_random_engine engine; + std::normal_distribution dist(FP{0}, FP{1}); + for(int i = 0; i < problemSize; ++i) { + posx[i] = dist(engine); + posy[i] = dist(engine); + posz[i] = dist(engine); + velx[i] = dist(engine) / FP{10}; + vely[i] = dist(engine) / FP{10}; + velz[i] = dist(engine) / FP{10}; + mass[i] = dist(engine) / FP{100}; + } + + for(int s = 0; s < steps; ++s) { + update(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), mass.data()); + move(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data()); + } + + return 0; +} diff --git a/examples/nbody_code_comp/nbody_ported.cpp b/examples/nbody_code_comp/nbody-ported.cpp similarity index 89% rename from examples/nbody_code_comp/nbody_ported.cpp rename to examples/nbody_code_comp/nbody-ported.cpp index 5177d04ec0..118672f0a2 100644 --- a/examples/nbody_code_comp/nbody_ported.cpp +++ b/examples/nbody_code_comp/nbody-ported.cpp @@ -4,7 +4,6 @@ // clang-format off #include "llama/llama.hpp" - #include #include @@ -13,8 +12,16 @@ constexpr FP timestep = 0.0001f, eps2 = 0.01f; constexpr int steps = 5, problemSize = 64 * 1024; struct Pos{}; struct Vel{}; struct X{}; struct Y{}; struct Z{}; struct Mass{}; -using V3 = llama::Record, llama::Field, llama::Field>; -using Particle = llama::Record, llama::Field, llama::Field>; +using Vec3 = llama::Record< + llama::Field, + llama::Field, + llama::Field +>; +using Particle = llama::Record< + llama::Field, + llama::Field, + llama::Field +>; LLAMA_FN_HOST_ACC_INLINE void pPInteraction(auto&& pi, auto&& pj) { auto dist = pi(Pos{}) - pj(Pos{}); @@ -47,6 +54,7 @@ auto main() -> int { const auto extents = ArrayExtents{problemSize}; auto mapping = llama::mapping::AoS{extents}; auto particles = llama::allocViewUninitialized(mapping); + std::default_random_engine engine; std::normal_distribution dist(FP{0}, FP{1}); for(auto&& p : particles) { @@ -63,5 +71,6 @@ auto main() -> int { update(particles); ::move(particles); } + return 0; }