Skip to content

Commit d202588

Browse files
committed
Merge branch 'development' of https://github.com/AMReX-Codes/amrex into development
2 parents f08be5f + 209018d commit d202588

File tree

2 files changed

+98
-74
lines changed

2 files changed

+98
-74
lines changed

Src/Particle/AMReX_ParticleContainerI.H

Lines changed: 95 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
11041104
floor((p_ptr[i].pos(2)-plo[2])*dxi[2]))
11051105
);
11061106

1107-
iv += domain.smallEnd();
1107+
iv += domain.smallEnd();
11081108

11091109
int grid_id = (*mask_ptr)(iv);
11101110

@@ -1120,7 +1120,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
11201120
}
11211121
});
11221122

1123-
thrust::exclusive_scan(thrust::device,
1123+
thrust::exclusive_scan(thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
11241124
thrust::make_zip_iterator(thrust::make_tuple(m_lo.begin(), m_hi.begin())),
11251125
thrust::make_zip_iterator(thrust::make_tuple(m_lo.end(), m_hi.end())),
11261126
thrust::make_zip_iterator(thrust::make_tuple(m_lo.begin(), m_hi.begin())),
@@ -1238,7 +1238,12 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
12381238
// Note that a negative grid id means the particle has left the domain in a non-
12391239
// periodic direction - we remove those from the simulation volume here.
12401240
//
1241-
for (auto& kv : m_not_ours) kv.second.resize(0);
1241+
std::map<int, size_t> send_bytes;
1242+
std::map<int, int> proc_index;
1243+
Vector<int> DstProc;
1244+
Vector<std::size_t> sOffset; // Offset (in bytes) in the send buffer
1245+
char* snd_buffer;
1246+
12421247
const int num_grids = ba.size();
12431248
const int num_to_move = m_grids_to_redistribute.size();
12441249

@@ -1311,18 +1316,47 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
13111316
thrust::host_vector<int> stop(m_grid_end);
13121317

13131318
std::map<int, size_t> grid_counts;
1319+
std::map<int, size_t> np_counts;
13141320
for (int i = 0; i < num_grids; ++i)
13151321
{
13161322
const int dest_proc = dmap[i];
1323+
const size_t num_to_add = stop[i+1] - start[i+1];
13171324
if (dest_proc != ParallelDescriptor::MyProc())
13181325
{
13191326
grid_counts[dest_proc] += 1;
1327+
np_counts[dest_proc] += num_to_add;
13201328
}
13211329
}
1330+
1331+
for (const auto& kv : np_counts)
1332+
{
1333+
send_bytes[kv.first] = sizeof(size_t) + kv.second*superparticle_size
1334+
+ grid_counts[kv.first]*(sizeof(size_t) + 2*sizeof(int));
1335+
}
13221336

1337+
Vector<int> current_sizes;
1338+
std::size_t TotSndBytes = 0;
1339+
for (const auto& kv : send_bytes)
1340+
{
1341+
DstProc.push_back(kv.first);
1342+
proc_index[kv.first] = DstProc.size() - 1;
1343+
sOffset.push_back(TotSndBytes);
1344+
TotSndBytes += kv.second;
1345+
current_sizes.push_back(0);
1346+
}
1347+
1348+
if (ParallelDescriptor::UseGpuAwareMpi())
1349+
{
1350+
snd_buffer = static_cast<char*>(amrex::The_Device_Arena()->alloc(TotSndBytes));
1351+
}
1352+
else
1353+
{
1354+
snd_buffer = static_cast<char*>(amrex::The_Pinned_Arena()->alloc(TotSndBytes));
1355+
}
1356+
13231357
//
13241358
// Each destination grid, copy the appropriate particle data, passing the non-local data
1325-
// into m_not_ours
1359+
// into snd_buffer
13261360
//
13271361
for (int i = 0; i < num_grids; ++i)
13281362
{
@@ -1373,41 +1407,38 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
13731407
}
13741408
else // this is the non-local case
13751409
{
1376-
char* dst;
1377-
const size_t old_size = m_not_ours[dest_proc].size();
1410+
char* dst = snd_buffer + sOffset[proc_index[dest_proc]];
1411+
const size_t old_size = current_sizes[proc_index[dest_proc]];
13781412
const size_t new_size
13791413
= old_size + num_to_add*superparticle_size + sizeof(size_t) + 2*sizeof(int);
1380-
1414+
13811415
if (old_size == 0)
13821416
{
1383-
m_not_ours[dest_proc].resize(new_size + sizeof(size_t));
1384-
cudaMemcpyAsync(thrust::raw_pointer_cast(m_not_ours[dest_proc].data()),
1385-
&grid_counts[dest_proc], sizeof(size_t), cudaMemcpyHostToDevice);
1386-
dst = thrust::raw_pointer_cast(
1387-
m_not_ours[dest_proc].data() + old_size + sizeof(size_t));
1417+
current_sizes[proc_index[dest_proc]] = new_size + sizeof(size_t);
1418+
cudaMemcpyAsync(dst, &grid_counts[dest_proc], sizeof(size_t),
1419+
cudaMemcpyHostToHost);
1420+
dst += sizeof(size_t);
13881421
} else
13891422
{
1390-
m_not_ours[dest_proc].resize(new_size);
1391-
dst = thrust::raw_pointer_cast(m_not_ours[dest_proc].data() + old_size);
1423+
current_sizes[proc_index[dest_proc]] = new_size;
1424+
dst += old_size;
13921425
}
1393-
1394-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
1395-
&num_to_add, sizeof(size_t), cudaMemcpyHostToDevice);
1426+
1427+
cudaMemcpyAsync(dst, &num_to_add, sizeof(size_t), cudaMemcpyHostToHost);
13961428
dst += sizeof(size_t);
13971429

1398-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst), &i, sizeof(int), cudaMemcpyHostToDevice);
1430+
cudaMemcpyAsync(dst, &i, sizeof(int), cudaMemcpyHostToHost);
13991431
dst += sizeof(int);
14001432

1401-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
1402-
&dest_proc, sizeof(int), cudaMemcpyHostToDevice);
1433+
cudaMemcpyAsync(dst, &dest_proc, sizeof(int), cudaMemcpyHostToHost);
14031434
dst += sizeof(int);
14041435

14051436
// pack structs
14061437
{
14071438
auto& aos = m_aos_to_redistribute;
1408-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
1439+
cudaMemcpyAsync(dst,
14091440
thrust::raw_pointer_cast(aos.data() + start[i+1]),
1410-
num_to_add*sizeof(ParticleType), cudaMemcpyDeviceToDevice);
1441+
num_to_add*sizeof(ParticleType), cudaMemcpyDeviceToHost);
14111442
dst += num_to_add*sizeof(ParticleType);
14121443
}
14131444

@@ -1416,9 +1447,9 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
14161447
{
14171448
if (not communicate_real_comp[j]) continue;
14181449
auto& attrib = m_real_arrays_to_redistribute[j];
1419-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
1450+
cudaMemcpyAsync(dst,
14201451
thrust::raw_pointer_cast(attrib.data() + start[i+1]),
1421-
num_to_add*sizeof(Real), cudaMemcpyDeviceToDevice);
1452+
num_to_add*sizeof(Real), cudaMemcpyDeviceToHost);
14221453
dst += num_to_add*sizeof(Real);
14231454
}
14241455

@@ -1427,16 +1458,16 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
14271458
{
14281459
if (not communicate_int_comp[j]) continue;
14291460
auto& attrib = m_int_arrays_to_redistribute[j];
1430-
cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
1461+
cudaMemcpyAsync(dst,
14311462
thrust::raw_pointer_cast(attrib.data() + start[i+1]),
1432-
num_to_add*sizeof(int), cudaMemcpyDeviceToDevice);
1463+
num_to_add*sizeof(int), cudaMemcpyDeviceToHost);
14331464
dst += num_to_add*sizeof(int);
14341465
}
14351466
}
14361467
}
14371468
}
14381469

1439-
RedistributeMPIGPU();
1470+
RedistributeMPIGPU(send_bytes, DstProc, sOffset, snd_buffer);
14401471

14411472
EnforcePeriodicGPU();
14421473

@@ -1487,7 +1518,9 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
14871518
template <int NStructReal, int NStructInt, int NArrayReal, int NArrayInt>
14881519
void
14891520
ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
1490-
::RedistributeMPIGPU ()
1521+
::RedistributeMPIGPU (const std::map<int, size_t>& send_bytes,
1522+
Vector<int>& DstProc, Vector<std::size_t>& sOffset,
1523+
char* snd_buffer)
14911524
{
14921525
BL_PROFILE("ParticleContainer::RedistributeMPIGPU()");
14931526

@@ -1500,11 +1533,11 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
15001533

15011534
long NumSnds = 0;
15021535

1503-
for (const auto& kv : m_not_ours)
1536+
for (const auto& kv : send_bytes)
15041537
{
1505-
const int np = kv.second.size();
1506-
Snds[kv.first] = np;
1507-
NumSnds += np;
1538+
const size_t nbytes = kv.second;
1539+
Snds[kv.first] = nbytes;
1540+
NumSnds += nbytes;
15081541
}
15091542

15101543
ParallelDescriptor::ReduceLongMax(NumSnds);
@@ -1540,30 +1573,29 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
15401573
}
15411574

15421575
const int nrcvs = RcvProc.size();
1576+
const int nsnds = DstProc.size();
15431577
Vector<MPI_Status> stats(nrcvs);
15441578
Vector<MPI_Request> rreqs(nrcvs);
15451579

15461580
const int SeqNum = ParallelDescriptor::SeqNum();
15471581

1548-
// Allocate data for rcvs as one big chunk.
1549-
m_recvdata.resize(TotRcvBytes);
1550-
1582+
// Allocate data for rcvs as one big chunk.
15511583
char* rcv_buffer;
15521584
if (ParallelDescriptor::UseGpuAwareMpi())
15531585
{
1554-
rcv_buffer = thrust::raw_pointer_cast(m_recvdata.data());
1586+
rcv_buffer = static_cast<char*>(amrex::The_Device_Arena()->alloc(TotRcvBytes));
15551587
}
15561588
else
15571589
{
1558-
m_host_rcv_buffer.resize(TotRcvBytes);
1559-
rcv_buffer = &(m_host_rcv_buffer[0]);
1590+
rcv_buffer = static_cast<char*>(amrex::The_Pinned_Arena()->alloc(TotRcvBytes));
15601591
}
15611592

15621593
// Post receives.
15631594
for (int i = 0; i < nrcvs; ++i) {
15641595
const auto Who = RcvProc[i];
15651596
const auto offset = rOffset[i];
15661597
const auto Cnt = Rcvs[Who];
1598+
15671599
BL_ASSERT(Cnt > 0);
15681600
BL_ASSERT(Cnt < std::numeric_limits<int>::max());
15691601
BL_ASSERT(Who >= 0 && Who < NProcs);
@@ -1573,50 +1605,35 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
15731605
}
15741606

15751607
// Send.
1576-
for (const auto& kv : m_not_ours) {
1577-
const auto Who = kv.first;
1578-
const auto Cnt = kv.second.size();
1579-
1608+
for (int i = 0; i < nsnds; ++i) {
1609+
const auto Who = DstProc[i];
1610+
const auto offset = sOffset[i];
1611+
const auto Cnt = Snds[Who];
1612+
15801613
BL_ASSERT(Cnt > 0);
15811614
BL_ASSERT(Who >= 0 && Who < NProcs);
15821615
BL_ASSERT(Cnt < std::numeric_limits<int>::max());
15831616

1584-
if (ParallelDescriptor::UseGpuAwareMpi())
1585-
{
1586-
ParallelDescriptor::Send(thrust::raw_pointer_cast(kv.second.data()),
1587-
Cnt, Who, SeqNum);
1588-
} else
1589-
{
1590-
m_host_snd_buffer.resize(Cnt);
1591-
thrust::copy(kv.second.begin(), kv.second.end(), m_host_snd_buffer.begin());
1592-
ParallelDescriptor::Send(thrust::raw_pointer_cast(m_host_snd_buffer.data()),
1593-
Cnt, Who, SeqNum);
1594-
}
1617+
ParallelDescriptor::Send(snd_buffer + offset, Cnt, Who, SeqNum);
15951618
}
15961619

15971620
if (nrcvs > 0) {
15981621
ParallelDescriptor::Waitall(rreqs, stats);
15991622

1600-
if (not ParallelDescriptor::UseGpuAwareMpi())
1601-
{
1602-
thrust::copy(m_host_rcv_buffer.begin(), m_host_rcv_buffer.end(),
1603-
m_recvdata.data());
1604-
}
1605-
16061623
for (int i = 0; i < nrcvs; ++i) {
16071624
const int offset = rOffset[i];
1608-
char* buffer = thrust::raw_pointer_cast(m_recvdata.data() + offset);
1625+
char* buffer = thrust::raw_pointer_cast(rcv_buffer + offset);
16091626
size_t num_grids, num_particles;
16101627
int gid, pid;
1611-
cudaMemcpy(&num_grids, buffer, sizeof(size_t), cudaMemcpyDeviceToHost);
1628+
cudaMemcpy(&num_grids, buffer, sizeof(size_t), cudaMemcpyHostToHost);
16121629
buffer += sizeof(size_t);
16131630

16141631
for (int g = 0; g < num_grids; ++g) {
1615-
cudaMemcpy(&num_particles, buffer, sizeof(size_t), cudaMemcpyDeviceToHost);
1632+
cudaMemcpy(&num_particles, buffer, sizeof(size_t), cudaMemcpyHostToHost);
16161633
buffer += sizeof(size_t);
1617-
cudaMemcpy(&gid, buffer, sizeof(int), cudaMemcpyDeviceToHost);
1634+
cudaMemcpy(&gid, buffer, sizeof(int), cudaMemcpyHostToHost);
16181635
buffer += sizeof(int);
1619-
cudaMemcpy(&pid, buffer, sizeof(int), cudaMemcpyDeviceToHost);
1636+
cudaMemcpy(&pid, buffer, sizeof(int), cudaMemcpyHostToHost);
16201637
buffer += sizeof(int);
16211638

16221639
if (num_particles == 0) continue;
@@ -1635,15 +1652,15 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
16351652
//copy structs
16361653
cudaMemcpyAsync(static_cast<ParticleType*>(aos().data()) + old_size,
16371654
buffer, num_particles*sizeof(ParticleType),
1638-
cudaMemcpyDeviceToDevice);
1655+
cudaMemcpyHostToDevice);
16391656
buffer += num_particles*sizeof(ParticleType);
16401657

16411658
// copy real arrays
16421659
for (int j = 0; j < NArrayReal; ++j) {
16431660
if (not communicate_real_comp[j]) continue;
16441661
auto& attrib = soa.GetRealData(j);
16451662
cudaMemcpyAsync(attrib.data() + old_size, buffer, num_particles*sizeof(Real),
1646-
cudaMemcpyDeviceToDevice);
1663+
cudaMemcpyHostToDevice);
16471664
buffer += num_particles*sizeof(Real);
16481665
}
16491666

@@ -1652,13 +1669,23 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
16521669
if (not communicate_int_comp[j]) continue;
16531670
auto& attrib = soa.GetIntData(j);
16541671
cudaMemcpyAsync(attrib.data() + old_size, buffer, num_particles*sizeof(int),
1655-
cudaMemcpyDeviceToDevice);
1672+
cudaMemcpyHostToDevice);
16561673
buffer += num_particles*sizeof(int);
16571674
}
16581675
}
16591676
}
16601677
}
1661-
}
1678+
}
1679+
1680+
if (ParallelDescriptor::UseGpuAwareMpi())
1681+
{
1682+
amrex::The_Device_Arena()->free(snd_buffer);
1683+
amrex::The_Device_Arena()->free(rcv_buffer);
1684+
} else {
1685+
amrex::The_Pinned_Arena()->free(snd_buffer);
1686+
amrex::The_Pinned_Arena()->free(rcv_buffer);
1687+
}
1688+
16621689
#endif // MPI
16631690
}
16641691
#endif // AMREX_USE_CUDA

Src/Particle/AMReX_Particles.H

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,9 @@ public:
554554
bool OKGPU (int lev_min = 0, int lev_max = -1, int nGrow = 0) const;
555555

556556
#ifdef AMREX_USE_CUDA
557-
void RedistributeMPIGPU ();
557+
void RedistributeMPIGPU (const std::map<int, size_t>& send_bytes,
558+
Vector<int>& DstProc, Vector<std::size_t>& sOffset,
559+
char* snd_buffer);
558560

559561
void EnforcePeriodicGPU ();
560562
#endif
@@ -641,16 +643,11 @@ protected:
641643
mutable thrust::device_vector<int> m_hi;
642644
mutable thrust::device_vector<int> m_output;
643645

644-
mutable thrust::host_vector<char> m_host_snd_buffer;
645-
mutable thrust::host_vector<char> m_host_rcv_buffer;
646-
647646
mutable thrust::device_vector<amrex::IntVect> m_cells_tmp;
648647
mutable thrust::device_vector<int> m_grids_tmp;
649648
mutable thrust::device_vector<int> m_grid_begin;
650649
mutable thrust::device_vector<int> m_grid_end;
651650
mutable thrust::device_vector<int> m_index_sequence_tmp;
652-
mutable std::map<int, thrust::device_vector<char> > m_not_ours;
653-
mutable thrust::device_vector<char> m_recvdata;
654651
#endif
655652

656653
private:

0 commit comments

Comments
 (0)