-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmpi.jl
97 lines (81 loc) · 2.71 KB
/
mpi.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
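# CUDA-aware MPI is not available, so each buffer is copied between the device (`*_d`)
# and its host counterpart (`*_h`) around every MPI call.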
"""
    exchange_ghost(Q, NV, rank, comm, sbuf_h, sbuf_d, rbuf_h, rbuf_d)

Exchange the `NG` x-direction ghost planes of `Q` with ranks `rank - 1` and `rank + 1`
over `comm`.

Two passes are made, in this order:

1. **x+**: `pack_R` fills `sbuf_d`, the data is sent to `rank + 1`, and whatever arrives
   from `rank - 1` is written into `Q` by `unpack_L`.
2. **x-**: `pack_L` fills `sbuf_d`, the data is sent to `rank - 1`, and whatever arrives
   from `rank + 1` is written into `Q` by `unpack_R`.

Rank `0` has no lower neighbour and rank `Nprocs - 1` has no upper neighbour; the missing
side is replaced by `MPI.PROC_NULL`, and the matching pack/unpack step is skipped.

# Arguments
- `Q`: array indexed as `Q[i, j, k, n]` by the pack/unpack kernels.
- `NV`: number of entries along the last index of `Q` that are exchanged.
- `rank`: this process's rank; compared against `0` and the global `Nprocs - 1`.
- `comm`: communicator passed to `MPI.Sendrecv!`.
- `sbuf_d`, `rbuf_d`: send/receive buffers passed to the GPU kernels.
- `sbuf_h`, `rbuf_h`: send/receive buffers passed to `MPI.Sendrecv!` (presumably host
  arrays with the same shape as the `*_d` buffers; confirm at the call site).

The return value is incidental: the result of the last kernel launch, or `nothing`.
"""
function exchange_ghost(Q, NV, rank, comm, sbuf_h, sbuf_d, rbuf_h, rbuf_d)
    # One thread per (i, j, k) point of an NG-thick slab spanning the padded y/z extents.
    launch_threads = (NG, 16, 16)
    launch_blocks = (1, cld(Ny + 2 * NG, 16), cld(Nz + 2 * NG, 16))

    lower = rank == 0 ? MPI.PROC_NULL : rank - 1
    upper = rank == Nprocs - 1 ? MPI.PROC_NULL : rank + 1

    # One directional pass: pack -> device-to-host -> Sendrecv -> host-to-device -> unpack.
    function sweep(from, to, pack, unpack)
        if from == MPI.PROC_NULL && to == MPI.PROC_NULL
            return nothing
        end
        if to != MPI.PROC_NULL
            @cuda threads=launch_threads blocks=launch_blocks pack(sbuf_d, Q, NV)
            copyto!(sbuf_h, sbuf_d)
        end
        MPI.Sendrecv!(sbuf_h, rbuf_h, comm; dest=to, source=from)
        if from == MPI.PROC_NULL
            return nothing
        end
        copyto!(rbuf_d, rbuf_h)
        return @cuda threads=launch_threads blocks=launch_blocks unpack(rbuf_d, Q, NV)
    end

    # x+: send towards rank + 1, receive from rank - 1.
    sweep(lower, upper, pack_R, unpack_L)
    # x-: send towards rank - 1, receive from rank + 1.
    return sweep(upper, lower, pack_L, unpack_R)
end
"""
    pack_R(buf, Q, NV)

GPU kernel: copy `Q[Nxp + i, j, k, n]` into `buf[i, j, k, n]` for `i` in `1:NG`,
`j` in `1:Ny + 2NG`, `k` in `1:Nz + 2NG` and `n` in `1:NV`.

Each thread owns one `(i, j, k)` point and loops over `n`. Threads whose global index
falls outside those ranges do nothing.
"""
function pack_R(buf, Q, NV)
    ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    jy = threadIdx().y + (blockIdx().y - 1i32) * blockDim().y
    kz = threadIdx().z + (blockIdx().z - 1i32) * blockDim().z

    if ix <= NG && jy <= Ny + 2 * NG && kz <= Nz + 2 * NG
        for n in 1:NV
            @inbounds buf[ix, jy, kz, n] = Q[Nxp + ix, jy, kz, n]
        end
    end
    return nothing
end
"""
    pack_L(buf, Q, NV)

GPU kernel: copy `Q[NG + i, j, k, n]` into `buf[i, j, k, n]` for `i` in `1:NG`,
`j` in `1:Ny + 2NG`, `k` in `1:Nz + 2NG` and `n` in `1:NV`.

Each thread owns one `(i, j, k)` point and loops over `n`. Threads whose global index
falls outside those ranges do nothing.
"""
function pack_L(buf, Q, NV)
    ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    jy = threadIdx().y + (blockIdx().y - 1i32) * blockDim().y
    kz = threadIdx().z + (blockIdx().z - 1i32) * blockDim().z

    if ix <= NG && jy <= Ny + 2 * NG && kz <= Nz + 2 * NG
        for n in 1:NV
            @inbounds buf[ix, jy, kz, n] = Q[NG + ix, jy, kz, n]
        end
    end
    return nothing
end
"""
    unpack_L(buf, Q, NV)

GPU kernel: copy `buf[i, j, k, n]` into `Q[i, j, k, n]` for `i` in `1:NG`,
`j` in `1:Ny + 2NG`, `k` in `1:Nz + 2NG` and `n` in `1:NV`.

Each thread owns one `(i, j, k)` point and loops over `n`. Threads whose global index
falls outside those ranges do nothing.
"""
function unpack_L(buf, Q, NV)
    ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    jy = threadIdx().y + (blockIdx().y - 1i32) * blockDim().y
    kz = threadIdx().z + (blockIdx().z - 1i32) * blockDim().z

    if ix <= NG && jy <= Ny + 2 * NG && kz <= Nz + 2 * NG
        for n in 1:NV
            @inbounds Q[ix, jy, kz, n] = buf[ix, jy, kz, n]
        end
    end
    return nothing
end
"""
    unpack_R(buf, Q, NV)

GPU kernel: copy `buf[i, j, k, n]` into `Q[i + Nxp + NG, j, k, n]` for `i` in `1:NG`,
`j` in `1:Ny + 2NG`, `k` in `1:Nz + 2NG` and `n` in `1:NV`.

Each thread owns one `(i, j, k)` point and loops over `n`. Threads whose global index
falls outside those ranges do nothing.
"""
function unpack_R(buf, Q, NV)
    ix = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    jy = threadIdx().y + (blockIdx().y - 1i32) * blockDim().y
    kz = threadIdx().z + (blockIdx().z - 1i32) * blockDim().z

    if ix <= NG && jy <= Ny + 2 * NG && kz <= Nz + 2 * NG
        for n in 1:NV
            @inbounds Q[ix + Nxp + NG, jy, kz, n] = buf[ix, jy, kz, n]
        end
    end
    return nothing
end