This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Support async execution in OpenMP wherever it's supported #725

Draft
wants to merge 11 commits into base: master
8 changes: 4 additions & 4 deletions coreneuron/gpu/nrn_acc_manager.cpp
@@ -326,8 +326,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
/* Here is the example of using OpenACC data enter/exit
* Remember that we are not allowed to use nt->_data but we have to use:
* double *dtmp = nt->_data; // now use dtmp!
#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
#pragma acc wait(nt->stream_id)
#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->streams[nt->stream_id])
#pragma acc wait(nt->streams[nt->stream_id])
*/

/*update d_nt._data to point to device copy */
@@ -736,7 +736,7 @@ void update_net_receive_buffer(NrnThread* nt) {
nrb->_nrb_flag[:nrb->_cnt],
nrb->_displ[:nrb->_displ_cnt + 1],
nrb->_nrb_index[:nrb->_cnt])
async(nt->stream_id))
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target update to(nrb->_cnt,
nrb->_displ_cnt,
nrb->_pnt_index[:nrb->_cnt],
@@ -749,7 +749,7 @@ void update_net_receive_buffer(NrnThread* nt) {
}
}
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
}

void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
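The changes in this file only swap the value handed to the OpenACC async/wait clauses: the bare integer nt->stream_id becomes nt->streams[nt->stream_id]. For reference, a minimal sketch of the async-queue idiom these pragmas rely on; the plain int queue below is an assumption standing in for whatever nt->streams actually stores.

// Illustrative only: enqueue work on one OpenACC queue, then block the host.
void copy_and_scale(double* data, int n, int queue) {
    // The host-to-device copy is enqueued on `queue`; the host returns immediately.
    #pragma acc enter data copyin(data[0:n]) async(queue)
    // A kernel on the same queue is ordered after the copy without host involvement.
    #pragma acc parallel loop present(data[0:n]) async(queue)
    for (int i = 0; i < n; ++i) {
        data[i] *= 2.0;
    }
    // The host blocks here until everything enqueued on `queue` has completed.
    #pragma acc wait(queue)
    #pragma acc exit data copyout(data[0:n])
}

Operations enqueued with the same async value run in order with respect to each other, while work on different queues may overlap; that overlap across NrnThreads appears to be what the per-thread stream value is for.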
4 changes: 3 additions & 1 deletion coreneuron/io/phase2.cpp
@@ -875,7 +875,9 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) {
* greater than number of omp threads.
*/
#if defined(_OPENMP)
nt.stream_id = omp_get_thread_num();
const auto omp_thread_num = omp_get_thread_num();
nt.stream_id = omp_thread_num;
nt.streams[nt.stream_id] = omp_thread_num;
#endif

int shadow_rhs_cnt = 0;
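The nt.streams[nt.stream_id] element introduced here plays two roles in the rest of this diff: its value is used as the OpenACC async queue number, and its storage location is used as the OpenMP depend(inout: ...) object, which serializes the target tasks issued by the same NrnThread. A small sketch with explanatory comments; it assumes NrnThread::streams is an indexable container of ints, since its declaration is not part of the hunks shown here.

#if defined(_OPENMP)
#include <omp.h>
#endif

// Illustrative only: give each NrnThread its own stream slot.
void assign_stream(NrnThread& nt) {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
    nt.stream_id = tid;              // index of this thread's slot
    nt.streams[nt.stream_id] = tid;  // value: OpenACC queue number, as in async(nt.streams[nt.stream_id])
                                     // location: OpenMP dependence object, as in depend(inout: nt.streams[nt.stream_id])
#else
    nt.stream_id = 0;
#endif
}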
8 changes: 4 additions & 4 deletions coreneuron/mechanism/capac.cpp
@@ -70,8 +70,8 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_d [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (_iml = 0; _iml < _cntml_actual; _iml++) {
_vec_d[ni[_iml]] += cfac * cm;
}
@@ -116,8 +116,8 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; _iml++) {
i_cap = cfac * cm * _vec_rhs[ni[_iml]];
}
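In the OpenMP path, each kernel now becomes a deferred target task: depend(inout: ...) on the thread's stream slot keeps kernels from the same NrnThread in order, and nowait lets the host thread continue past the launch. A minimal, self-contained sketch of the pattern; stream_slot is a stand-in for _nt->streams[_nt->stream_id], and the loop bodies are placeholders.

#include <omp.h>

static int stream_slot;  // dependence object shared by this thread's kernels

// Illustrative only: two deferred target tasks serialized through the same
// dependence object; the host blocks only at the final taskwait.
void jacob_then_cur(double* d, double* rhs, int n) {
    #pragma omp target teams distribute parallel for simd \
        map(tofrom: d[0:n]) depend(inout: stream_slot) nowait
    for (int i = 0; i < n; ++i)
        d[i] += 1.0;

    #pragma omp target teams distribute parallel for simd \
        map(tofrom: rhs[0:n]) depend(inout: stream_slot) nowait
    for (int i = 0; i < n; ++i)
        rhs[i] *= 0.5;

    // Without this (or a later task depending on stream_slot), the host could
    // race ahead of the device work it just enqueued.
    #pragma omp taskwait
}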
18 changes: 8 additions & 10 deletions coreneuron/mechanism/eion.cpp
@@ -263,14 +263,12 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
int _cntml_padded = ml->_nodecount_padded;
pd = ml->data;
ppd = ml->pdata;
// clang-format off
nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
nrn_ion_global_map[0:nrn_ion_global_map_size]
[0:ion_global_map_member_size])
if (nt->compute_gpu)
async(nt->stream_id))
// clang-format on
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
nrn_pragma_acc(parallel loop present(
pd [0:_cntml_padded * 5],
nrn_ion_global_map
[0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
dcurdv = 0.;
cur = 0.;
@@ -342,8 +340,8 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
ni [0:_cntml_actual],
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
cur += dcurdv * (_vec_rhs[ni[_iml]]);
}
20 changes: 14 additions & 6 deletions coreneuron/network/netcvode.cpp
@@ -533,8 +533,10 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method

nrn_pragma_acc(parallel loop present(
nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
copy(net_send_buf_count) if (nt->compute_gpu)
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count)
if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = 0; i < nt->ncell; ++i) {
PreSyn* ps = presyns + i;
PreSynHelper* psh = presyns_helper + i;
@@ -561,16 +563,22 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
nt->_net_send_buffer[idx] = i;
}
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
nrn_pragma_omp(taskwait)
nt->_net_send_buffer_cnt = net_send_buf_count;

if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
#ifdef CORENEURON_ENABLE_GPU
int* nsbuffer = nt->_net_send_buffer;
#endif
nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt])
async(nt->streams[nt->stream_id]))
nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
// clang-format off
nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])
depend(inout: nt->streams[nt->stream_id]) nowait)
// clang-format on
nrn_pragma_omp(taskwait)
}

// on CPU...
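Because the kernels above no longer block, check_thresh() now synchronizes explicitly before it reads results on the host: a taskwait precedes the read of net_send_buf_count, and the device-to-host update of nsbuffer is itself deferred (depend plus nowait) and then waited on. A sketch of that readback step; marker stands in for nt->streams[nt->stream_id] and buf for nt->_net_send_buffer.

#include <omp.h>

static int marker;  // same dependence object the producing kernel used

// Illustrative only: deferred device-to-host copy, ordered after the producer
// kernel, followed by the host-side wait that makes the data safe to read.
void read_back(int* buf, int count) {
    #pragma omp target update from(buf[0:count]) depend(inout: marker) nowait
    #pragma omp taskwait
    for (int i = 0; i < count; ++i) {
        // buf[i] is now valid on the host
    }
}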
15 changes: 9 additions & 6 deletions coreneuron/network/partrans.cpp
@@ -56,22 +56,25 @@ void nrnmpi_v_transfer() {
nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
src_data [0:nt->_ndata],
src_gather [0:n_src_gather]) if (nt->compute_gpu)
async(nt->stream_id))
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int i = 0; i < n_src_gather; ++i) {
src_gather[i] = src_data[src_indices[i]];
}
nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
async(nt->streams[nt->stream_id]))
// clang-format off
nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)
depend(inout: nt->streams[nt->stream_id]) nowait)
// clang-format on
}

// copy gathered source values to outsrc_buf_
bool compute_gpu = false;
for (int tid = 0; tid < nrn_nthread; ++tid) {
if (nrn_threads[tid].compute_gpu) {
compute_gpu = true;
nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
nrn_pragma_acc(wait async(nrn_threads[tid].streams[nrn_threads[tid].stream_id]))
}
TransferThreadData& ttd = transfer_thread_data_[tid];
size_t n_outsrc_indices = ttd.outsrc_indices.size();
@@ -122,8 +125,8 @@ void nrnthread_v_transfer(NrnThread* _nt) {
nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
tar_data [0:ndata],
insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (size_t i = 0; i < ntar; ++i) {
tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
}
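nrnmpi_v_transfer() consumes src_gather from every NrnThread on the host, so each thread's queue has to be drained before outsrc_buf_ is packed; at that point the diff issues wait async(...) on each thread's stream. The sketch below shows the plain blocking form of the same synchronization, with queue values standing in for nrn_threads[tid].streams[nrn_threads[tid].stream_id].

// Illustrative only: make sure every thread's gather kernel and
// device-to-host update have finished before the host packs the buffer.
void drain_all_queues(const int* queues, int nthread) {
    for (int tid = 0; tid < nthread; ++tid) {
        #pragma acc wait(queues[tid])
    }
}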
30 changes: 18 additions & 12 deletions coreneuron/permute/cellorder.cpp
@@ -600,14 +600,18 @@ void solve_interleaved2(int ith) {
defined(_OPENACC)
int nstride = stridedispl[nwarp];
#endif
nrn_pragma_acc(parallel loop gang vector vector_length(
warpsize) present(nt [0:1],
strides [0:nstride],
ncycles [0:nwarp],
stridedispl [0:nwarp + 1],
rootbegin [0:nwarp + 1],
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
nrn_pragma_acc(parallel loop gang vector vector_length(warpsize)
present(nt [0:1],
strides [0:nstride],
ncycles [0:nwarp],
stridedispl [0:nwarp + 1],
rootbegin [0:nwarp + 1],
nodebegin [0:nwarp + 1]) if (nt->compute_gpu)
async(nt->streams[nt->stream_id]))
// clang-format off
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)
depend(inout: nt->streams[nt->stream_id]) nowait)
// clang-format on
for (int icore = 0; icore < ncore; ++icore) {
int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
@@ -626,7 +630,8 @@
} // serial test mode
#endif
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
nrn_pragma_omp(taskwait)
#ifdef _OPENACC
}
#endif
@@ -660,14 +665,15 @@ void solve_interleaved1(int ith) {
firstnode [0:ncell],
lastnode [0:ncell],
cellsize [0:ncell]) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int icell = 0; icell < ncell; ++icell) {
int icellsize = cellsize[icell];
triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
}
nrn_pragma_acc(wait(nt->stream_id))
nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
nrn_pragma_omp(taskwait)
}

void solve_interleaved(int ith) {
33 changes: 19 additions & 14 deletions coreneuron/sim/fadvance_core.cpp
@@ -79,7 +79,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
nt->cj = 1.0 / dt;
}
nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
async(nt->stream_id) if (nt->compute_gpu))
async(nt->streams[nt->stream_id]) if (nt->compute_gpu))
// clang-format off
nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
if(nt->compute_gpu))
@@ -206,15 +206,15 @@ void update(NrnThread* _nt) {
/* do not need to worry about linmod or extracellular*/
if (secondorder) {
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int i = 0; i < i2; ++i) {
vec_v[i] += 2. * vec_rhs[i];
}
} else {
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
async(_nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
async(_nt->streams[_nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
for (int i = 0; i < i2; ++i) {
vec_v[i] += vec_rhs[i];
}
@@ -295,8 +295,8 @@ void nrncore2nrn_send_values(NrnThread* nth) {
assert(vs < tr->bsize);

nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
async(nth->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
async(nth->streams[nth->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait)
for (int i = 0; i < tr->n_trajec; ++i) {
tr->varrays[i][vs] = *tr->gather[i];
}
@@ -316,10 +316,13 @@ void nrncore2nrn_send_values(NrnThread* nth) {
for (int i = 0; i < tr->n_trajec; ++i) {
double* gather_i = tr->gather[i];
nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
async(nth->stream_id))
nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
async(nth->streams[nth->stream_id]))
// clang-format off
nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)
depend(inout: nth->streams[nth->stream_id]) nowait)
// clang-format on
}
nrn_pragma_acc(wait(nth->stream_id))
nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
for (int i = 0; i < tr->n_trajec; ++i) {
*(tr->scatter[i]) = *(tr->gather[i]);
}
@@ -341,8 +344,9 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
if (nth->ncell) {
/*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
launch kernel) */
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
nrn_pragma_acc(wait(nth->stream_id))
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
async(nth->streams[nth->stream_id]))
nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
fixed_play_continuous(nth);

@@ -377,8 +381,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {

if (nth->ncell) {
/*@todo: do we need to update nth->_t on GPU */
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
nrn_pragma_acc(wait(nth->stream_id))
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
async(nth->streams[nth->stream_id]))
nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
fixed_play_continuous(nth);
nonvint(nth);
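Several spots in this file push per-timestep scalars (nth->_t, nth->_dt, nth->cj) to the device on the thread's queue and wait before launching work that reads them. A minimal sketch of that host-to-device direction, with queue standing in for nth->streams[nth->stream_id].

// Illustrative only: asynchronous host-to-device scalar update, followed by a
// wait so that subsequently launched kernels see the new value.
void push_time(double* t, int queue) {
    #pragma acc update device(t[0:1]) async(queue)
    #pragma acc wait(queue)
}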
8 changes: 4 additions & 4 deletions coreneuron/sim/fast_imem.cpp
@@ -52,8 +52,8 @@ void nrn_calc_fast_imem(NrnThread* nt) {
double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
nrn_pragma_acc(
parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = i1; i < i3; ++i) {
fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
}
@@ -69,8 +69,8 @@ void nrn_calc_fast_imem_init(NrnThread* nt) {

double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
async(nt->streams[nt->stream_id]))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
for (int i = i1; i < i3; ++i) {
fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
}
7 changes: 7 additions & 0 deletions coreneuron/sim/multicore.cpp
@@ -15,6 +15,10 @@
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

#if defined(_OPENMP)
#include <omp.h>
#endif

/*
Now that threads have taken over the actual_v, v_node, etc, it might
be a good time to regularize the method of freeing, allocating, and
@@ -106,6 +110,9 @@ void nrn_threads_create(int n) {
for (int i = 0; i < nrn_nthread; ++i) {
NrnThread& nt = nrn_threads[i];
nt.id = i;
#if defined(_OPENMP)
nt.streams.reserve(omp_get_num_threads());
#endif
for (int j = 0; j < BEFORE_AFTER_SIZE; ++j) {
nt.tbl[j] = nullptr;
}
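The declaration of the new NrnThread::streams member is not part of the hunks shown here; the sketch below assumes a std::vector<int> with one slot per OpenMP host thread. The names and the sizing choice are assumptions made for illustration only.

#include <vector>
#if defined(_OPENMP)
#include <omp.h>
#endif

struct NrnThreadSketch {
    int id = 0;
    int stream_id = 0;
    std::vector<int> streams;  // one async-queue / dependence slot per host thread
};

void create_threads(std::vector<NrnThreadSketch>& threads, int n) {
    threads.resize(n);
    for (int i = 0; i < n; ++i) {
        threads[i].id = i;
#if defined(_OPENMP)
        // omp_get_max_threads() works outside a parallel region; resize (rather
        // than reserve) keeps the later streams[stream_id] stores in bounds.
        threads[i].streams.resize(omp_get_max_threads(), 0);
#else
        threads[i].streams.resize(1, 0);
#endif
    }
}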