Skip to content

Commit

Permalink
Use OpenMP with SIMD or LLAMA_INDEPENDENT_DATA
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Dec 29, 2023
1 parent dba1158 commit 14080fc
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 28 deletions.
11 changes: 9 additions & 2 deletions examples/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,23 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-nbody CXX)

option(LLAMA_NBODY_OPENMP OFF)

find_package(fmt CONFIG REQUIRED)
find_package(OpenMP REQUIRED)
if (LLAMA_NBODY_OPENMP)
find_package(OpenMP REQUIRED)
endif()
find_package(xsimd QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt)
if (LLAMA_NBODY_OPENMP)
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
endif()
if (xsimd_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
else()
Expand Down
18 changes: 18 additions & 0 deletions examples/nbody/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# n-body simulation

This example shows an n-body simulation, comparing a LLAMA implementation with manually written versions.

## OpenMP

All kernels can be run scalar or using multithreading+SIMD.
For the latter, enable `LLAMA_NBODY_OPENMP` in cmake,
and specify OpenMP thread affinity when executing:
`OMP_NUM_THREADS=x OMP_PROC_BIND=true OMP_PLACES=cores llama-nbody`,
where x is the number of cores on your system.

## rsqrt

The use of the `rsqrt` instruction is disabled,
which is required for comparable benchmarks (so all versions use sqrt and divison).
You can enable `rsqrt` by setting the appropriate variable in the C++ source file,
and compile with `-ffast-math`.
53 changes: 27 additions & 26 deletions examples/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@
# define HAVE_XSIMD
#endif

// for multithreading, specify OpenMP thread affinity:
// OMP_NUM_THREADS=x OMP_PROC_BIND=true OMP_PLACES=cores llama-nbody, where x is the number of cores on your system
// for comparable benchmarks, disable rsqrt and replace -ffast-math by -fno-math-errno (so std::sqrt() can be
// vectorized), so all versions use sqrt and divison
#ifdef _OPENMP
# define PARALLEL_FOR _Pragma("omp parallel for simd")
#else
# define PARALLEL_FOR LLAMA_INDEPENDENT_DATA
#endif

using FP = float;

Expand Down Expand Up @@ -217,7 +218,7 @@ namespace usellama
template<int Width, typename View>
void updateSimd(View& particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i += Width)
{
using RecordDim = typename View::RecordDim;
Expand All @@ -232,7 +233,7 @@ namespace usellama
template<int Width, typename View>
void moveSimd(View& particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i += Width)
{
using RecordDim = typename View::RecordDim;
Expand All @@ -248,7 +249,7 @@ namespace usellama
template<typename View>
void update(View& particles)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
{
llama::One<Particle> pi = particles(i);
Expand All @@ -261,7 +262,7 @@ namespace usellama
template<typename View>
void move(View& particles)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
particles(i)(tag::Pos{}) += particles(i)(tag::Vel{}) * timestep;
}
Expand Down Expand Up @@ -421,7 +422,7 @@ namespace manualAoS

void update(Particle* particles)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
{
Particle pi = particles[i];
Expand All @@ -433,7 +434,7 @@ namespace manualAoS

void move(Particle* particles)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
particles[i].pos += particles[i].vel * timestep;
}
Expand Down Expand Up @@ -511,7 +512,7 @@ namespace manualSoA

void update(FP* posx, FP* posy, FP* posz, FP* velx, FP* vely, FP* velz, FP* mass)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
{
const FP piposx = posx[i];
Expand All @@ -530,7 +531,7 @@ namespace manualSoA

void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz)
{
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t i = 0; i < problemSize; i++)
{
posx[i] += velx[i] * timestep;
Expand Down Expand Up @@ -640,7 +641,7 @@ namespace manualAoSoA
void update(ParticleBlock<Lanes>* particles)
{
constexpr auto blocks = problemSize / Lanes;
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t bi = 0; bi < blocks; bi++)
{
auto blockI = particles[bi];
Expand Down Expand Up @@ -675,7 +676,7 @@ namespace manualAoSoA
constexpr auto blocks = problemSize / Lanes;
constexpr auto blocksPerTile = 128; // L1D_SIZE / sizeof(ParticleBlock<Lanes>);
static_assert(blocks % blocksPerTile == 0);
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t ti = 0; ti < blocks / blocksPerTile; ti++)
for(std::size_t tj = 0; tj < blocks / blocksPerTile; tj++)
for(std::size_t bi = 0; bi < blocksPerTile; bi++)
Expand Down Expand Up @@ -709,7 +710,7 @@ namespace manualAoSoA
void move(ParticleBlock<Lanes>* particles)
{
constexpr auto blocks = problemSize / Lanes;
#pragma omp parallel for
PARALLEL_FOR
for(std::size_t bi = 0; bi < blocks; bi++)
{
LLAMA_INDEPENDENT_DATA
Expand Down Expand Up @@ -857,7 +858,7 @@ namespace manualAoSoAManualAVX
// update (read/write) 8 particles I based on the influence of 1 particle J
void update8(ParticleBlock* particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::size_t bi = 0; bi < blocks; bi++)
{
auto& blockI = particles[bi];
Expand Down Expand Up @@ -903,7 +904,7 @@ namespace manualAoSoAManualAVX
// update (read/write) 1 particles I based on the influence of 8 particles J with accumulator
void update1(ParticleBlock* particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::size_t bi = 0; bi < blocks; bi++)
for(std::size_t i = 0; i < lanes; i++)
{
Expand Down Expand Up @@ -933,7 +934,7 @@ namespace manualAoSoAManualAVX

void move(ParticleBlock* particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::size_t bi = 0; bi < blocks; bi++)
{
auto& block = particles[bi];
Expand Down Expand Up @@ -1071,7 +1072,7 @@ namespace manualAoSoASIMD
constexpr auto lanes = Simd::size;
constexpr auto blocks = problemSize / lanes;

# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t bi = 0; bi < blocks; bi++)
{
auto& blockI = particles[bi];
Expand Down Expand Up @@ -1110,7 +1111,7 @@ namespace manualAoSoASIMD

constexpr auto blocksPerTile = 128; // L1D_SIZE / sizeof(ParticleBlock);
static_assert(blocks % blocksPerTile == 0);
# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t ti = 0; ti < blocks / blocksPerTile; ti++)
for(std::size_t bi = 0; bi < blocksPerTile; bi++)
{
Expand Down Expand Up @@ -1169,7 +1170,7 @@ namespace manualAoSoASIMD
constexpr auto lanes = Simd::size;
constexpr auto blocks = problemSize / lanes;

# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t bi = 0; bi < blocks; bi++)
{
auto& blockI = particles[bi];
Expand Down Expand Up @@ -1212,7 +1213,7 @@ namespace manualAoSoASIMD
{
constexpr auto blocks = problemSize / Simd::size;

# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t bi = 0; bi < blocks; bi++)
{
// std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& block) {
Expand Down Expand Up @@ -1328,7 +1329,7 @@ namespace manualAoSSIMD
template<typename Simd>
void update(Particle* particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
{
auto& pi = particles[i];
Expand Down Expand Up @@ -1359,7 +1360,7 @@ namespace manualAoSSIMD
template<typename Simd>
void move(Particle* particles)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
{
auto& pi = particles[i];
Expand Down Expand Up @@ -1419,7 +1420,7 @@ namespace manualSoASIMD
template<typename Simd>
void update(const FP* posx, const FP* posy, const FP* posz, FP* velx, FP* vely, FP* velz, const FP* mass)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
{
const Simd piposx = Simd::load_aligned(posx + i);
Expand Down Expand Up @@ -1449,7 +1450,7 @@ namespace manualSoASIMD
template<typename Simd>
void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz)
{
# pragma omp parallel for
PARALLEL_FOR
for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
{
(Simd::load_aligned(posx + i) + Simd::load_aligned(velx + i) * timestep).store_aligned(posx + i);
Expand Down

0 comments on commit 14080fc

Please sign in to comment.