From 2b94b4b1affd4fec6fccdcb0efc2e2386fbb3681 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 11 Jan 2024 15:37:38 +0100
Subject: [PATCH] Add standalone n-body SoA and AoSoA for code comparison

---
 examples/nbody_code_comp/CMakeLists.txt       |  14 ++-
 ...dy_baseline.cpp => nbody-AoS-baseline.cpp} |  44 +++-----
 examples/nbody_code_comp/nbody-AoSoA.cpp      | 103 ++++++++++++++++++
 examples/nbody_code_comp/nbody-SoA.cpp        |  97 +++++++++++++++++
 .../{nbody_ported.cpp => nbody-ported.cpp}    |  15 ++-
 5 files changed, 238 insertions(+), 35 deletions(-)
 rename examples/nbody_code_comp/{nbody_baseline.cpp => nbody-AoS-baseline.cpp} (69%)
 create mode 100644 examples/nbody_code_comp/nbody-AoSoA.cpp
 create mode 100644 examples/nbody_code_comp/nbody-SoA.cpp
 rename examples/nbody_code_comp/{nbody_ported.cpp => nbody-ported.cpp} (89%)
diff --git a/examples/nbody_code_comp/CMakeLists.txt b/examples/nbody_code_comp/CMakeLists.txt
index 3dead4d77b..9818e1afe7 100644
--- a/examples/nbody_code_comp/CMakeLists.txt
+++ b/examples/nbody_code_comp/CMakeLists.txt
@@ -3,14 +3,22 @@
 
 cmake_minimum_required (VERSION 3.18.3)
 
-project(llama-nbody-baseline CXX)
-add_executable(${PROJECT_NAME} nbody_baseline.cpp)
+project(llama-nbody-aos-baseline CXX)
+add_executable(${PROJECT_NAME} nbody-AoS-baseline.cpp)
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20)
+
+project(llama-nbody-soa CXX)
+add_executable(${PROJECT_NAME} nbody-SoA.cpp)
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20)
+
+project(llama-nbody-aosoa CXX)
+add_executable(${PROJECT_NAME} nbody-AoSoA.cpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20)
 
 project(llama-nbody-ported CXX)
 if (NOT TARGET llama::llama)
 	find_package(llama CONFIG REQUIRED)
 endif()
-add_executable(${PROJECT_NAME} nbody_ported.cpp)
+add_executable(${PROJECT_NAME} nbody-ported.cpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
diff --git a/examples/nbody_code_comp/nbody_baseline.cpp b/examples/nbody_code_comp/nbody-AoS-baseline.cpp
similarity index 69%
rename from examples/nbody_code_comp/nbody_baseline.cpp
rename to examples/nbody_code_comp/nbody-AoS-baseline.cpp
index 1c5e029cb9..d4cf442cd8 100644
--- a/examples/nbody_code_comp/nbody_baseline.cpp
+++ b/examples/nbody_code_comp/nbody-AoS-baseline.cpp
@@ -13,50 +13,31 @@ constexpr int steps = 5, problemSize = 64 * 1024;
 
 struct Vec3 {
     FP x, y, z;
-
-    auto operator*=(Vec3 v) -> Vec3& {
-        x *= v.x;
-        y *= v.y;
-        z *= v.z;
-        return *this;
-    }
-
-    auto operator+=(Vec3 v) -> Vec3& {
-        x += v.x;
-        y += v.y;
-        z += v.z;
-        return *this;
-    }
-
-    friend auto operator-(Vec3 a, Vec3 b) -> Vec3 {
-        return Vec3{a.x - b.x, a.y - b.y, a.z - b.z};
-    }
-
-    friend auto operator*(Vec3 a, FP s) -> Vec3 {
-        return Vec3{a.x * s, a.y * s, a.z * s};
-    }
 };
-
 struct Particle {
     Vec3 pos, vel;
     FP mass;
 };
 
 inline void pPInteraction(Particle& pi, const Particle& pj) {
-    auto dist = pi.pos - pj.pos;
-    dist *= dist;
+    auto dist = Vec3{pi.pos.x - pj.pos.x, pi.pos.y - pj.pos.y, pi.pos.z - pj.pos.z};
+    dist.x *= dist.x;
+    dist.y *= dist.y;
+    dist.z *= dist.z;
     const auto distSqr = eps2 + dist.x + dist.y + dist.z;
     const auto distSixth = distSqr * distSqr * distSqr;
     const auto invDistCube = FP{1} / std::sqrt(distSixth);
     const auto sts = pj.mass * timestep * invDistCube;
-    pi.vel += dist * sts;
+    pi.vel.x += dist.x * sts;
+    pi.vel.y += dist.y * sts;
+    pi.vel.z += dist.z * sts;
 }
 
 void update(std::span<Particle> particles) {
 #pragma GCC ivdep
     for(int i = 0; i < problemSize; i++) {
         Particle pi = particles[i];
-        for(std::size_t j = 0; j < problemSize; ++j)
+        for(int j = 0; j < problemSize; ++j)
             pPInteraction(pi, particles[j]);
         particles[i].vel = pi.vel;
     }
@@ -64,12 +45,16 @@ void update(std::span<Particle> particles) {
 
 void move(std::span<Particle> particles) {
 #pragma GCC ivdep
-    for(int i = 0; i < problemSize; i++)
-        particles[i].pos += particles[i].vel * timestep;
+    for(int i = 0; i < problemSize; i++) {
+        particles[i].pos.x += particles[i].vel.x * timestep;
+        particles[i].pos.y += particles[i].vel.y * timestep;
+        particles[i].pos.z += particles[i].vel.z * timestep;
+    }
 }
 
 auto main() -> int {
     auto particles = std::vector<Particle>(problemSize);
+	
     std::default_random_engine engine;
     std::normal_distribution<FP> dist(FP{0}, FP{1});
     for(auto& p : particles) {
@@ -86,5 +71,6 @@ auto main() -> int {
         update(particles);
         ::move(particles);
     }
+
     return 0;
 }
diff --git a/examples/nbody_code_comp/nbody-AoSoA.cpp b/examples/nbody_code_comp/nbody-AoSoA.cpp
new file mode 100644
index 0000000000..3be4b7d3d2
--- /dev/null
+++ b/examples/nbody_code_comp/nbody-AoSoA.cpp
@@ -0,0 +1,103 @@
+// Copyright 2024 Bernhard Manfred Gruber
+// SPDX-License-Identifier: MPL-2.0
+
+// clang-format off
+
+#include <random>
+#include <span>
+#include <vector>
+
+using FP = float;
+constexpr FP timestep = 0.0001f, eps2 = 0.01f;
+constexpr int steps = 5, problemSize = 64 * 1024;
+
+constexpr auto lanes = 16;
+constexpr auto blocks = problemSize / lanes;
+
+struct alignas(lanes * sizeof(FP)) Vec3Block {
+    FP x[lanes];
+    FP y[lanes];
+    FP z[lanes];
+};
+struct alignas(lanes * sizeof(FP)) ParticleBlock {
+    Vec3Block pos, vel;
+    FP mass[lanes];
+};
+
+inline void pPInteraction(FP piposx, FP piposy, FP piposz, FP& pivelx, FP& pively, FP& pivelz,
+                          FP pjposx, FP pjposy, FP pjposz, FP pjmass) {
+    auto xdist = piposx - pjposx;
+    auto ydist = piposy - pjposy;
+    auto zdist = piposz - pjposz;
+    xdist *= xdist;
+    ydist *= ydist;
+    zdist *= zdist;
+    const auto distSqr = eps2 + xdist + ydist + zdist;
+    const auto distSixth = distSqr * distSqr * distSqr;
+    const auto invDistCube = FP{1} / std::sqrt(distSixth);
+    const auto sts = pjmass * timestep * invDistCube;
+    pivelx += xdist * sts;
+    pively += ydist * sts;
+    pivelz += zdist * sts;
+}
+
+void update(std::span<ParticleBlock> particles) {
+    for(int bi = 0; bi < blocks; bi++) {
+        auto blockI = particles[bi];
+        for(int bj = 0; bj < blocks; bj++)
+            for(int j = 0; j < lanes; j++) {
+#pragma GCC ivdep
+                for(int i = 0; i < lanes; i++) {
+                    pPInteraction(
+                        blockI.pos.x[i],
+                        blockI.pos.y[i],
+                        blockI.pos.z[i],
+                        blockI.vel.x[i],
+                        blockI.vel.y[i],
+                        blockI.vel.z[i],
+                        particles[bj].pos.x[j],
+                        particles[bj].pos.y[j],
+                        particles[bj].pos.z[j],
+                        particles[bj].mass[j]);
+                }
+            }
+
+        particles[bi].vel = blockI.vel;
+    }
+}
+
+void move(std::span<ParticleBlock> particles) {
+    for(int bi = 0; bi < blocks; bi++) {
+        #pragma GCC ivdep
+        for(std::size_t i = 0; i < lanes; ++i) {
+            particles[bi].pos.x[i] += particles[bi].vel.x[i] * timestep;
+            particles[bi].pos.y[i] += particles[bi].vel.y[i] * timestep;
+            particles[bi].pos.z[i] += particles[bi].vel.z[i] * timestep;
+        }
+    }
+}
+
+auto main() -> int {
+    auto particles = std::vector<ParticleBlock>(blocks);
+	
+    std::default_random_engine engine;
+    std::normal_distribution<FP> dist(FP{0}, FP{1});
+    for(int bi = 0; bi < blocks; ++bi) {
+        for(int i = 0; i < lanes; ++i) {
+            particles[bi].pos.x[i] = dist(engine);
+            particles[bi].pos.y[i] = dist(engine);
+            particles[bi].pos.z[i] = dist(engine);
+            particles[bi].vel.x[i] = dist(engine) / FP{10};
+            particles[bi].vel.y[i] = dist(engine) / FP{10};
+            particles[bi].vel.z[i] = dist(engine) / FP{10};
+            particles[bi].mass[i] = dist(engine) / FP{100};
+        }
+    }
+
+    for(int s = 0; s < steps; ++s) {
+        update(particles);
+        ::move(particles);
+    }
+
+    return 0;
+}
diff --git a/examples/nbody_code_comp/nbody-SoA.cpp b/examples/nbody_code_comp/nbody-SoA.cpp
new file mode 100644
index 0000000000..ea5e385911
--- /dev/null
+++ b/examples/nbody_code_comp/nbody-SoA.cpp
@@ -0,0 +1,97 @@
+// Copyright 2024 Bernhard Manfred Gruber
+// SPDX-License-Identifier: MPL-2.0
+
+// clang-format off
+
+#include <random>
+#include <vector>
+
+using FP = float;
+constexpr FP timestep = 0.0001f, eps2 = 0.01f;
+constexpr int steps = 5, problemSize = 64 * 1024;
+
+inline void pPInteraction(FP piposx, FP piposy, FP piposz, FP& pivelx, FP& pively, FP& pivelz,
+                          FP pjposx, FP pjposy, FP pjposz, FP pjmass) {
+    auto xdist = piposx - pjposx;
+    auto ydist = piposy - pjposy;
+    auto zdist = piposz - pjposz;
+    xdist *= xdist;
+    ydist *= ydist;
+    zdist *= zdist;
+    const auto distSqr = eps2 + xdist + ydist + zdist;
+    const auto distSixth = distSqr * distSqr * distSqr;
+    const auto invDistCube = FP{1} / std::sqrt(distSixth);
+    const auto sts = pjmass * timestep * invDistCube;
+    pivelx += xdist * sts;
+    pively += ydist * sts;
+    pivelz += zdist * sts;
+}
+
+void update(const FP* posx, const FP* posy, const FP* posz, FP* velx, FP* vely, FP* velz, const FP* mass) {
+#pragma GCC ivdep
+    for(int i = 0; i < problemSize; i++) {
+        const auto piposx = posx[i];
+        const auto piposy = posy[i];
+        const auto piposz = posz[i];
+        auto pivelx = velx[i];
+        auto pively = vely[i];
+        auto pivelz = velz[i];
+        for(int j = 0; j < problemSize; ++j)
+            pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, posx[j], posy[j], posz[j], mass[j]);
+        velx[i] = pivelx;
+        vely[i] = pively;
+        velz[i] = pivelz;
+    }
+}
+
+void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz) {
+#pragma GCC ivdep
+    for(int i = 0; i < problemSize; i++) {
+        posx[i] += velx[i] * timestep;
+        posy[i] += vely[i] * timestep;
+        posz[i] += velz[i] * timestep;
+    }
+}
+
+template<typename T>
+struct AlignedAllocator {
+    using value_type = T;
+
+    auto allocate(std::size_t n) const -> T* {
+        return new(std::align_val_t{64}) T[n];
+    }
+
+    void deallocate(T* p, std::size_t) const {
+        ::operator delete[] (p, std::align_val_t{64});
+    }
+};
+
+auto main() -> int {
+    using Vector = std::vector<FP, AlignedAllocator<FP>>;
+    auto posx = Vector(problemSize);
+    auto posy = Vector(problemSize);
+    auto posz = Vector(problemSize);
+    auto velx = Vector(problemSize);
+    auto vely = Vector(problemSize);
+    auto velz = Vector(problemSize);
+    auto mass = Vector(problemSize);
+	
+    std::default_random_engine engine;
+    std::normal_distribution<FP> dist(FP{0}, FP{1});
+    for(int i = 0; i < problemSize; ++i) {
+        posx[i] = dist(engine);
+        posy[i] = dist(engine);
+        posz[i] = dist(engine);
+        velx[i] = dist(engine) / FP{10};
+        vely[i] = dist(engine) / FP{10};
+        velz[i] = dist(engine) / FP{10};
+        mass[i] = dist(engine) / FP{100};
+    }
+
+    for(int s = 0; s < steps; ++s) {
+        update(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), mass.data());
+        move(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data());
+    }
+
+    return 0;
+}
diff --git a/examples/nbody_code_comp/nbody_ported.cpp b/examples/nbody_code_comp/nbody-ported.cpp
similarity index 89%
rename from examples/nbody_code_comp/nbody_ported.cpp
rename to examples/nbody_code_comp/nbody-ported.cpp
index 5177d04ec0..118672f0a2 100644
--- a/examples/nbody_code_comp/nbody_ported.cpp
+++ b/examples/nbody_code_comp/nbody-ported.cpp
@@ -4,7 +4,6 @@
 // clang-format off
 
 #include "llama/llama.hpp"
-
 #include <random>
 #include <vector>
 
@@ -13,8 +12,16 @@ constexpr FP timestep = 0.0001f, eps2 = 0.01f;
 constexpr int steps = 5, problemSize = 64 * 1024;
 
 struct Pos{}; struct Vel{}; struct X{}; struct Y{}; struct Z{}; struct Mass{};
-using V3 = llama::Record<llama::Field<X, FP>, llama::Field<Y, FP>, llama::Field<Z, FP>>;
-using Particle = llama::Record<llama::Field<Pos, V3>, llama::Field<Vel, V3>, llama::Field<Mass, FP>>;
+using Vec3 = llama::Record<
+    llama::Field<X, FP>,
+    llama::Field<Y, FP>,
+    llama::Field<Z, FP>
+>;
+using Particle = llama::Record<
+    llama::Field<Pos, Vec3>,
+    llama::Field<Vel, Vec3>,
+    llama::Field<Mass, FP>
+>;
 
 LLAMA_FN_HOST_ACC_INLINE void pPInteraction(auto&& pi, auto&& pj) {
     auto dist = pi(Pos{}) - pj(Pos{});
@@ -47,6 +54,7 @@ auto main() -> int {
     const auto extents = ArrayExtents{problemSize};
     auto mapping = llama::mapping::AoS<ArrayExtents, Particle>{extents};
     auto particles = llama::allocViewUninitialized(mapping);
+
     std::default_random_engine engine;
     std::normal_distribution<FP> dist(FP{0}, FP{1});
     for(auto&& p : particles) {
@@ -63,5 +71,6 @@ auto main() -> int {
         update(particles);
         ::move(particles);
     }
+
     return 0;
 }