@@ -1104,7 +1104,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                             floor((p_ptr[i].pos(2)-plo[2])*dxi[2]))
                   );

-        iv += domain.smallEnd();
+        iv += domain.smallEnd();

         int grid_id = (*mask_ptr)(iv);

@@ -1120,7 +1120,7 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         }
     });

-    thrust::exclusive_scan(thrust::device,
+    thrust::exclusive_scan(thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
                            thrust::make_zip_iterator(thrust::make_tuple(m_lo.begin(), m_hi.begin())),
                            thrust::make_zip_iterator(thrust::make_tuple(m_lo.end(), m_hi.end())),
                            thrust::make_zip_iterator(thrust::make_tuple(m_lo.begin(), m_hi.begin())),
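
The change above swaps Thrust's stock device policy for one that draws its temporary storage from AMReX's cached Thrust allocator, so repeated scans avoid a cudaMalloc/cudaFree round trip on every call. A minimal standalone sketch of the pattern, with a hypothetical CachedAllocator standing in for Cuda::The_ThrustCachedAllocator():

    // Sketch: hand an allocator to a Thrust execution policy so algorithm
    // temporaries are requested from it instead of raw cudaMalloc/cudaFree.
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>
    #include <cuda_runtime.h>
    #include <cstddef>

    // Stand-in allocator satisfying Thrust's temporary-storage interface;
    // a real caching allocator would keep freed blocks in a pool for reuse.
    struct CachedAllocator
    {
        typedef char value_type;

        char* allocate (std::ptrdiff_t nbytes)
        {
            char* ptr = nullptr;
            cudaMalloc(&ptr, nbytes);
            return ptr;
        }

        void deallocate (char* ptr, size_t)
        {
            cudaFree(ptr);
        }
    };

    int main ()
    {
        thrust::device_vector<int> counts(8, 1);
        thrust::device_vector<int> offsets(8);

        CachedAllocator alloc;
        // Scratch space needed by the scan is obtained through `alloc`.
        thrust::exclusive_scan(thrust::cuda::par(alloc),
                               counts.begin(), counts.end(), offsets.begin());
        return 0;
    }
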
@@ -1238,7 +1238,12 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     // Note that a negative grid id means the particle has left the domain in a non-
     // periodic direction - we remove those from the simulation volume here.
     //
-    for (auto& kv : m_not_ours) kv.second.resize(0);
+    std::map<int, size_t> send_bytes;
+    std::map<int, int> proc_index;
+    Vector<int> DstProc;
+    Vector<std::size_t> sOffset; // Offset (in bytes) in the send buffer
+    char* snd_buffer;
+
     const int num_grids = ba.size();
     const int num_to_move = m_grids_to_redistribute.size();

@@ -1311,18 +1316,47 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     thrust::host_vector<int> stop(m_grid_end);

     std::map<int, size_t> grid_counts;
+    std::map<int, size_t> np_counts;
     for (int i = 0; i < num_grids; ++i)
     {
         const int dest_proc = dmap[i];
+        const size_t num_to_add = stop[i+1] - start[i+1];
         if (dest_proc != ParallelDescriptor::MyProc())
         {
             grid_counts[dest_proc] += 1;
+            np_counts[dest_proc] += num_to_add;
         }
     }
+
+    for (const auto& kv : np_counts)
+    {
+        send_bytes[kv.first] = sizeof(size_t) + kv.second*superparticle_size
+                             + grid_counts[kv.first]*(sizeof(size_t) + 2*sizeof(int));
+    }

+    Vector<int> current_sizes;
+    std::size_t TotSndBytes = 0;
+    for (const auto& kv : send_bytes)
+    {
+        DstProc.push_back(kv.first);
+        proc_index[kv.first] = DstProc.size() - 1;
+        sOffset.push_back(TotSndBytes);
+        TotSndBytes += kv.second;
+        current_sizes.push_back(0);
+    }
+
+    if (ParallelDescriptor::UseGpuAwareMpi())
+    {
+        snd_buffer = static_cast<char*>(amrex::The_Device_Arena()->alloc(TotSndBytes));
+    }
+    else
+    {
+        snd_buffer = static_cast<char*>(amrex::The_Pinned_Arena()->alloc(TotSndBytes));
+    }
+
     //
     // Each destination grid, copy the appropriate particle data, passing the non-local data
-    // into m_not_ours
+    // into snd_buffer
     //
     for (int i = 0; i < num_grids; ++i)
     {
@@ -1373,41 +1407,38 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
         }
         else // this is the non-local case
         {
-            char* dst;
-            const size_t old_size = m_not_ours[dest_proc].size();
+            char* dst = snd_buffer + sOffset[proc_index[dest_proc]];
+            const size_t old_size = current_sizes[proc_index[dest_proc]];
             const size_t new_size
                 = old_size + num_to_add*superparticle_size + sizeof(size_t) + 2*sizeof(int);
-
+
             if (old_size == 0)
             {
-                m_not_ours[dest_proc].resize(new_size + sizeof(size_t));
-                cudaMemcpyAsync(thrust::raw_pointer_cast(m_not_ours[dest_proc].data()),
-                                &grid_counts[dest_proc], sizeof(size_t), cudaMemcpyHostToDevice);
-                dst = thrust::raw_pointer_cast(
-                    m_not_ours[dest_proc].data() + old_size + sizeof(size_t));
+                current_sizes[proc_index[dest_proc]] = new_size + sizeof(size_t);
+                cudaMemcpyAsync(dst, &grid_counts[dest_proc], sizeof(size_t),
+                                cudaMemcpyHostToHost);
+                dst += sizeof(size_t);
             } else
             {
-                m_not_ours[dest_proc].resize(new_size);
-                dst = thrust::raw_pointer_cast(m_not_ours[dest_proc].data() + old_size);
+                current_sizes[proc_index[dest_proc]] = new_size;
+                dst += old_size;
             }
-
-            cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
-                            &num_to_add, sizeof(size_t), cudaMemcpyHostToDevice);
+
+            cudaMemcpyAsync(dst, &num_to_add, sizeof(size_t), cudaMemcpyHostToHost);
             dst += sizeof(size_t);

-            cudaMemcpyAsync(thrust::raw_pointer_cast(dst), &i, sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpyAsync(dst, &i, sizeof(int), cudaMemcpyHostToHost);
             dst += sizeof(int);

-            cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
-                            &dest_proc, sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpyAsync(dst, &dest_proc, sizeof(int), cudaMemcpyHostToHost);
             dst += sizeof(int);

             // pack structs
             {
                 auto& aos = m_aos_to_redistribute;
-                cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
+                cudaMemcpyAsync(dst,
                                 thrust::raw_pointer_cast(aos.data() + start[i+1]),
-                                num_to_add*sizeof(ParticleType), cudaMemcpyDeviceToDevice);
+                                num_to_add*sizeof(ParticleType), cudaMemcpyDeviceToHost);
                 dst += num_to_add*sizeof(ParticleType);
             }

@@ -1416,9 +1447,9 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             {
                 if (not communicate_real_comp[j]) continue;
                 auto& attrib = m_real_arrays_to_redistribute[j];
-                cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
+                cudaMemcpyAsync(dst,
                                 thrust::raw_pointer_cast(attrib.data() + start[i+1]),
-                                num_to_add*sizeof(Real), cudaMemcpyDeviceToDevice);
+                                num_to_add*sizeof(Real), cudaMemcpyDeviceToHost);
                 dst += num_to_add*sizeof(Real);
             }

@@ -1427,16 +1458,16 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
             {
                 if (not communicate_int_comp[j]) continue;
                 auto& attrib = m_int_arrays_to_redistribute[j];
-                cudaMemcpyAsync(thrust::raw_pointer_cast(dst),
+                cudaMemcpyAsync(dst,
                                 thrust::raw_pointer_cast(attrib.data() + start[i+1]),
-                                num_to_add*sizeof(int), cudaMemcpyDeviceToDevice);
+                                num_to_add*sizeof(int), cudaMemcpyDeviceToHost);
                 dst += num_to_add*sizeof(int);
             }
         }
     }
     }

-    RedistributeMPIGPU();
+    RedistributeMPIGPU(send_bytes, DstProc, sOffset, snd_buffer);

     EnforcePeriodicGPU();

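
The packing code above fixes the byte layout of each per-rank message: a leading grid count, then for every grid a small header (particle count, grid id, destination proc) followed by the particle structs and the communicated real/int components. A host-only sketch of the header part, with illustrative names and plain memcpy standing in for cudaMemcpyAsync:

    // Sketch (not AMReX code): per-rank message layout produced above.
    //
    //   [size_t num_grids]                       once per destination rank
    //   then, repeated num_grids times:
    //     [size_t num_particles][int grid_id][int dest_proc]
    //     num_particles * sizeof(ParticleType)   AoS structs
    //     num_particles * sizeof(Real)           per communicated real component
    //     num_particles * sizeof(int)            per communicated int component
    #include <cstring>
    #include <cstddef>
    #include <vector>
    #include <cassert>

    // Illustrative host-only packer/unpacker for one grid header.
    char* pack_grid_header(char* dst, std::size_t num_particles, int grid_id, int dest_proc)
    {
        std::memcpy(dst, &num_particles, sizeof(std::size_t)); dst += sizeof(std::size_t);
        std::memcpy(dst, &grid_id,       sizeof(int));         dst += sizeof(int);
        std::memcpy(dst, &dest_proc,     sizeof(int));         dst += sizeof(int);
        return dst;
    }

    const char* unpack_grid_header(const char* src, std::size_t& num_particles,
                                   int& grid_id, int& dest_proc)
    {
        std::memcpy(&num_particles, src, sizeof(std::size_t)); src += sizeof(std::size_t);
        std::memcpy(&grid_id,       src, sizeof(int));         src += sizeof(int);
        std::memcpy(&dest_proc,     src, sizeof(int));         src += sizeof(int);
        return src;
    }

    int main()
    {
        std::vector<char> buf(sizeof(std::size_t) + 2*sizeof(int));
        pack_grid_header(buf.data(), 128, 7, 3);

        std::size_t np; int gid, pid;
        unpack_grid_header(buf.data(), np, gid, pid);
        assert(np == 128 && gid == 7 && pid == 3);
        return 0;
    }
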
@@ -1487,7 +1518,9 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
 template <int NStructReal, int NStructInt, int NArrayReal, int NArrayInt>
 void
 ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
-::RedistributeMPIGPU()
+::RedistributeMPIGPU(const std::map<int, size_t>& send_bytes,
+                     Vector<int>& DstProc, Vector<std::size_t>& sOffset,
+                     char* snd_buffer)
 {
     BL_PROFILE("ParticleContainer::RedistributeMPIGPU()");

@@ -1500,11 +1533,11 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>

     long NumSnds = 0;

-    for (const auto& kv : m_not_ours)
+    for (const auto& kv : send_bytes)
     {
-        const int np = kv.second.size();
-        Snds[kv.first] = np;
-        NumSnds += np;
+        const size_t nbytes = kv.second;
+        Snds[kv.first] = nbytes;
+        NumSnds += nbytes;
     }

     ParallelDescriptor::ReduceLongMax(NumSnds);
@@ -1540,30 +1573,29 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     }

     const int nrcvs = RcvProc.size();
+    const int nsnds = DstProc.size();
     Vector<MPI_Status> stats(nrcvs);
     Vector<MPI_Request> rreqs(nrcvs);

     const int SeqNum = ParallelDescriptor::SeqNum();

-    // Allocate data for rcvs as one big chunk.
-    m_recvdata.resize(TotRcvBytes);
-
+    // Allocate data for rcvs as one big chunk.
     char* rcv_buffer;
     if (ParallelDescriptor::UseGpuAwareMpi())
     {
-        rcv_buffer = thrust::raw_pointer_cast(m_recvdata.data());
+        rcv_buffer = static_cast<char*>(amrex::The_Device_Arena()->alloc(TotRcvBytes));
     }
     else
     {
-        m_host_rcv_buffer.resize(TotRcvBytes);
-        rcv_buffer = &(m_host_rcv_buffer[0]);
+        rcv_buffer = static_cast<char*>(amrex::The_Pinned_Arena()->alloc(TotRcvBytes));
     }

     // Post receives.
     for (int i = 0; i < nrcvs; ++i) {
         const auto Who = RcvProc[i];
         const auto offset = rOffset[i];
         const auto Cnt = Rcvs[Who];
+
         BL_ASSERT(Cnt > 0);
         BL_ASSERT(Cnt < std::numeric_limits<int>::max());
         BL_ASSERT(Who >= 0 && Who < NProcs);
@@ -1573,50 +1605,35 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
     }

     // Send.
-    for (const auto& kv : m_not_ours) {
-        const auto Who = kv.first;
-        const auto Cnt = kv.second.size();
-
+    for (int i = 0; i < nsnds; ++i) {
+        const auto Who = DstProc[i];
+        const auto offset = sOffset[i];
+        const auto Cnt = Snds[Who];
+
         BL_ASSERT(Cnt > 0);
         BL_ASSERT(Who >= 0 && Who < NProcs);
         BL_ASSERT(Cnt < std::numeric_limits<int>::max());

-        if (ParallelDescriptor::UseGpuAwareMpi())
-        {
-            ParallelDescriptor::Send(thrust::raw_pointer_cast(kv.second.data()),
-                                     Cnt, Who, SeqNum);
-        } else
-        {
-            m_host_snd_buffer.resize(Cnt);
-            thrust::copy(kv.second.begin(), kv.second.end(), m_host_snd_buffer.begin());
-            ParallelDescriptor::Send(thrust::raw_pointer_cast(m_host_snd_buffer.data()),
-                                     Cnt, Who, SeqNum);
-        }
+        ParallelDescriptor::Send(snd_buffer + offset, Cnt, Who, SeqNum);
     }

     if (nrcvs > 0) {
         ParallelDescriptor::Waitall(rreqs, stats);

-        if (not ParallelDescriptor::UseGpuAwareMpi())
-        {
-            thrust::copy(m_host_rcv_buffer.begin(), m_host_rcv_buffer.end(),
-                         m_recvdata.data());
-        }
-
         for (int i = 0; i < nrcvs; ++i) {
             const int offset = rOffset[i];
-            char* buffer = thrust::raw_pointer_cast(m_recvdata.data() + offset);
+            char* buffer = thrust::raw_pointer_cast(rcv_buffer + offset);
             size_t num_grids, num_particles;
             int gid, pid;
-            cudaMemcpy(&num_grids, buffer, sizeof(size_t), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&num_grids, buffer, sizeof(size_t), cudaMemcpyHostToHost);
             buffer += sizeof(size_t);

             for (int g = 0; g < num_grids; ++g) {
-                cudaMemcpy(&num_particles, buffer, sizeof(size_t), cudaMemcpyDeviceToHost);
+                cudaMemcpy(&num_particles, buffer, sizeof(size_t), cudaMemcpyHostToHost);
                 buffer += sizeof(size_t);
-                cudaMemcpy(&gid, buffer, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(&gid, buffer, sizeof(int), cudaMemcpyHostToHost);
                 buffer += sizeof(int);
-                cudaMemcpy(&pid, buffer, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(&pid, buffer, sizeof(int), cudaMemcpyHostToHost);
                 buffer += sizeof(int);

                 if (num_particles == 0) continue;
@@ -1635,15 +1652,15 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>
                 // copy structs
                 cudaMemcpyAsync(static_cast<ParticleType*>(aos().data()) + old_size,
                                 buffer, num_particles*sizeof(ParticleType),
-                                cudaMemcpyDeviceToDevice);
+                                cudaMemcpyHostToDevice);
                 buffer += num_particles*sizeof(ParticleType);

                 // copy real arrays
                 for (int j = 0; j < NArrayReal; ++j) {
                     if (not communicate_real_comp[j]) continue;
                     auto& attrib = soa.GetRealData(j);
                     cudaMemcpyAsync(attrib.data() + old_size, buffer, num_particles*sizeof(Real),
-                                    cudaMemcpyDeviceToDevice);
+                                    cudaMemcpyHostToDevice);
                     buffer += num_particles*sizeof(Real);
                 }

@@ -1652,13 +1669,23 @@ ParticleContainer<NStructReal, NStructInt, NArrayReal, NArrayInt>

                     if (not communicate_int_comp[j]) continue;
                     auto& attrib = soa.GetIntData(j);
                     cudaMemcpyAsync(attrib.data() + old_size, buffer, num_particles*sizeof(int),
-                                    cudaMemcpyDeviceToDevice);
+                                    cudaMemcpyHostToDevice);
                     buffer += num_particles*sizeof(int);
                 }
             }
         }
-    }
+    }
+
+    if (ParallelDescriptor::UseGpuAwareMpi())
+    {
+        amrex::The_Device_Arena()->free(snd_buffer);
+        amrex::The_Device_Arena()->free(rcv_buffer);
+    } else {
+        amrex::The_Pinned_Arena()->free(snd_buffer);
+        amrex::The_Pinned_Arena()->free(rcv_buffer);
+    }
+
 #endif // MPI
 }
 #endif // AMREX_USE_CUDA
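
For reference, the communication pattern in RedistributeMPIGPU above - non-blocking receives posted at precomputed byte offsets into a single receive buffer, then blocking sends from offsets into a single send buffer, then a wait - looks roughly like this in plain MPI (an illustrative sketch; the real code goes through ParallelDescriptor and the amrex Arenas):

    // Sketch (plain MPI, illustrative names): one contiguous buffer per
    // direction with per-rank byte offsets, mirroring RedistributeMPIGPU.
    #include <mpi.h>
    #include <vector>
    #include <cstddef>

    void exchange(const std::vector<int>& RcvProc, const std::vector<std::size_t>& rOffset,
                  const std::vector<std::size_t>& rCount, char* rcv_buffer,
                  const std::vector<int>& DstProc, const std::vector<std::size_t>& sOffset,
                  const std::vector<std::size_t>& sCount, const char* snd_buffer,
                  int tag, MPI_Comm comm)
    {
        std::vector<MPI_Request> rreqs(RcvProc.size());

        // Post all receives first, each landing at its precomputed offset.
        for (std::size_t i = 0; i < RcvProc.size(); ++i) {
            MPI_Irecv(rcv_buffer + rOffset[i], static_cast<int>(rCount[i]), MPI_CHAR,
                      RcvProc[i], tag, comm, &rreqs[i]);
        }

        // Blocking sends from offsets into the single send buffer.
        for (std::size_t i = 0; i < DstProc.size(); ++i) {
            MPI_Send(snd_buffer + sOffset[i], static_cast<int>(sCount[i]), MPI_CHAR,
                     DstProc[i], tag, comm);
        }

        // Wait for all receives before unpacking.
        std::vector<MPI_Status> stats(rreqs.size());
        if (!rreqs.empty()) {
            MPI_Waitall(static_cast<int>(rreqs.size()), rreqs.data(), stats.data());
        }
    }
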