Skip to content

Commit 07144fd

Browse files
committed
WIP: mpas-o tracer gpu
1 parent 2eca590 commit 07144fd

File tree

3 files changed

+276
-10
lines changed

3 files changed

+276
-10
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#ifndef _MPAS_OCEAN_PARTICLE_TRACKER_CUH
2+
#define _MPAS_OCEAN_PARTICLE_TRACKER_CUH
3+
4+
#include <vector>
5+
#include <set>
6+
#include <ftk/features/feature_point_lite.hh>
7+
#include <ftk/mesh/bvh2d.hh>
8+
9+
typedef struct {
10+
// gpu
11+
int device;
12+
13+
// mesh info
14+
int ncells, nlayers, nverts, max_edges, nattrs;
15+
16+
// mesh
17+
double *d_Xc, *d_Xv; // cell/vertex coordinates
18+
int *d_nedges_on_cell,
19+
*d_cells_on_cell,
20+
*d_verts_on_cell;
21+
22+
// time-varying data
23+
double *d_V[2], // velocity fields of adjacent timesteps
24+
*d_Vv[2], // vertical velocities
25+
*d_zTop[2], // top layer depth
26+
*d_A[2]; // scalar attrs
27+
28+
// particle data
29+
int nparticles;
30+
ftk::feature_point_lite_t *hcps = NULL, *dcps = NULL;
31+
32+
} mop_ctx_t;
33+
34+
void mop_create_ctx(mop_ctx_t **c_, int device=0);
35+
void mop_destroy_ctx(mop_ctx_t **c_);
36+
void mop_load_mesh(mop_ctx_t *c,
37+
const int ncells,
38+
const int nlayers,
39+
const int nverts,
40+
const int max_edges,
41+
const int nattrs,
42+
const double *n_edges_on_cell,
43+
const double *cells_on_cell,
44+
const double *verts_on_cell);
45+
46+
void mop_load_data(mop_ctx_t *c,
47+
const double *V,
48+
const double *Vv,
49+
const double *zTop,
50+
const double *A);
51+
52+
void mop_execute(mop_ctx_t *c, int scope, int current_timestep);
53+
void mop_swap(mop_ctx_t *c);
54+
55+
#endif

include/ftk/numeric/cross_product.hh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
namespace ftk {
77

88
template <typename T>
9+
__device__ __host__
910
static inline T cross_product2(const T A[2], const T B[2])
1011
{
1112
return A[0]*B[1] - A[1]*B[0];
1213
}
1314

1415
template <typename T>
16+
__device__ __host__
1517
static inline void cross_product(const T A[3], const T B[3], T C[3])
1618
{
1719
C[0] = A[1]*B[2] - A[2]*B[1];

src/filters/particle_tracer_mpas.cu

Lines changed: 219 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#include <nvfunctional>
22
#include <ftk/numeric/mpas.hh>
33
#include <ftk/numeric/wachspress_interpolation.hh>
4+
#include <ftk/filters/mpas_ocean_particle_tracker.cuh>
5+
6+
typedef mop_ctx_t ctx_t;
47

58
static const int MAX_VERTS = 10;
69
static const int MAX_LAYERS = 100;
@@ -20,11 +23,11 @@ inline bool point_in_cell(
2023
double xv[][3], // returns vertex coordinates
2124
const int max_edges,
2225
const double *Xv,
23-
const int *n_edges_on_cell,
26+
const int *nedges_on_cell,
2427
const int *verts_on_cell)
2528
{
2629
// if (cell < 0) return false;
27-
const int nverts = n_edges_on_cell[cell];
30+
const int nverts = nedges_on_cell[cell];
2831
// double xv[MAX_VERTS][3];
2932

3033
for (int i = 0; i < nverts; i ++) {
@@ -46,22 +49,22 @@ static int locate_cell_local( // local search among neighbors
4649
double xv[][3], // returns vertex coordinates
4750
const double *Xv,
4851
const int max_edges,
49-
const int *n_edges_on_cell, // also n_verts_on_cell
52+
const int *nedges_on_cell, // also n_verts_on_cell
5053
const int *cells_on_cell,
5154
const int *verts_on_cell)
5255
{
5356
if (curr < 0)
5457
return -1; // not found
5558
else if (point_in_cell(
5659
curr, x, iv, xv,
57-
max_edges, Xv, n_edges_on_cell, verts_on_cell))
60+
max_edges, Xv, nedges_on_cell, verts_on_cell))
5861
return curr;
5962
else {
60-
for (int i = 0; i < n_edges_on_cell[curr]; i ++) {
63+
for (int i = 0; i < nedges_on_cell[curr]; i ++) {
6164
const int cell = cells_on_cell[i + max_edges * curr];
6265
if (point_in_cell(
6366
cell, x, iv, xv,
64-
max_edges, Xv, n_edges_on_cell, verts_on_cell))
67+
max_edges, Xv, nedges_on_cell, verts_on_cell))
6568
return cell;
6669
}
6770
return -1; // not found among neighbors
@@ -76,15 +79,15 @@ static bool mpas_eval(
7679
double *f, // scalar attributs
7780
const double *V, // velocity field
7881
const double *Vv, // vertical velocities
82+
const double *zTop, // top layer depth
7983
const int nattrs, // number of scalar attributes
8084
const double *A, // scalar attributes
8185
const double *Xv, // vertex locations
8286
const int max_edges,
83-
const int *n_edges_on_cell,
87+
const int *nedges_on_cell,
8488
const int *cells_on_cell,
8589
const int *verts_on_cell,
8690
const int nlayers,
87-
const double *zTop, // top layer depth
8891
int &hint_c,
8992
int &hint_l) // hint for searching cell and layer
9093
{
@@ -93,12 +96,12 @@ static bool mpas_eval(
9396

9497
const int cell = locate_cell_local(hint_c,
9598
x, iv, xv,
96-
Xv, max_edges, n_edges_on_cell,
99+
Xv, max_edges, nedges_on_cell,
97100
cells_on_cell, verts_on_cell);
98101
if (cell < 0) return false;
99102
else hint_c = cell;
100103

101-
const int nverts = n_edges_on_cell[cell];
104+
const int nverts = nedges_on_cell[cell];
102105

103106
// compute weights based on xyzVerts
104107
double omega[MAX_VERTS];
@@ -191,3 +194,209 @@ static bool mpas_eval(
191194

192195
return true;
193196
}
197+
198+
///////////////////////////
199+
void mop_create_ctx(mop_ctx_t **c_, int device)
200+
{
201+
*c_ = (ctx_t*)malloc(sizeof(ctx_t));
202+
ctx_t *c = *c_;
203+
memset(c, 0, sizeof(ctx_t));
204+
205+
c->device = device;
206+
cudaSetDevice(device);
207+
208+
c->d_Xc = NULL;
209+
c->d_Xv = NULL;
210+
c->d_nedges_on_cell = NULL;
211+
c->d_cells_on_cell = NULL;
212+
c->d_verts_on_cell = NULL;
213+
214+
c->d_V[0] = NULL;
215+
c->d_V[1] = NULL;
216+
c->d_Vv[0] = NULL;
217+
c->d_Vv[1] = NULL;
218+
c->d_zTop[0] = NULL;
219+
c->d_zTop[1] = NULL;
220+
c->d_A[0] = NULL;
221+
c->d_A[1] = NULL;
222+
}
223+
224+
void mop_destroy_ctx(mop_ctx_t **c_)
225+
{
226+
ctx_t *c = *c_;
227+
228+
if (c->d_Xc != NULL) cudaFree(c->d_Xc);
229+
// TODO
230+
231+
free(*c_);
232+
*c_ = NULL;
233+
}
234+
235+
void mop_load_mesh(mop_ctx_t *c,
236+
const int ncells,
237+
const int nlayers,
238+
const int nverts,
239+
const int max_edges,
240+
const int nattrs,
241+
const double *Xc,
242+
const double *Xv,
243+
const int *nedges_on_cell,
244+
const int *cells_on_cell,
245+
const int *verts_on_cell)
246+
{
247+
c->ncells = ncells;
248+
c->nlayers = nlayers;
249+
c->nverts = nverts;
250+
c->max_edges = max_edges;
251+
c->nattrs = nattrs;
252+
253+
cudaMalloc((void**)&c->d_Xc, size_t(ncells) * sizeof(double) * 3);
254+
cudaMemcpy(c->d_Xc, Xc, size_t(ncells) * sizeof(double) * 3, cudaMemcpyHostToDevice);
255+
256+
cudaMalloc((void**)&c->d_Xv, size_t(nverts) * sizeof(double) * 3);
257+
cudaMemcpy(c->d_Xv, Xv, size_t(nverts) * sizeof(double) * 3, cudaMemcpyHostToDevice);
258+
259+
cudaMalloc((void**)&c->d_nedges_on_cell, size_t(ncells) * sizeof(int));
260+
cudaMemcpy(c->d_nedges_on_cell, nedges_on_cell, size_t(ncells) * sizeof(int), cudaMemcpyHostToDevice);
261+
262+
cudaMalloc((void**)&c->d_cells_on_cell, size_t(ncells) * max_edges * sizeof(int));
263+
cudaMemcpy(c->d_cells_on_cell, cells_on_cell, size_t(ncells) * max_edges * sizeof(int), cudaMemcpyHostToDevice);
264+
265+
cudaMalloc((void**)&c->d_verts_on_cell, size_t(nverts) * 3 * sizeof(int));
266+
cudaMemcpy(c->d_verts_on_cell, verts_on_cell, size_t(nverts) * 3 * sizeof(int), cudaMemcpyHostToDevice);
267+
268+
// checkLastCudaError("[FTK-CUDA] loading mpas mesh");
269+
}
270+
271+
#if 0
272+
void mop_load_data(mop_ctx_t *c,
273+
const double *V,
274+
const double *Vv,
275+
const double *zTop,
276+
const double *A)
277+
{
278+
double *dd_V;
279+
if (c->d_V[0] == NULL) {
280+
cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
281+
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
282+
dd_V = c->d_V[0];
283+
} else if (c->d_V[1] == NULL) {
284+
cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
285+
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
286+
dd_V = c->d_V[1];
287+
} else {
288+
std::swap(c->d_V[0], c->d_V[1]);
289+
dd_V = c->d_V[1];
290+
}
291+
// fprintf(stderr, "dd=%p, d0=%p, d1=%p, src=%p\n", dd_V, c->d_V[0], c->d_V[1], scalar);
292+
cudaMemcpy(dd_V, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
293+
cudaMemcpyHostToDevice);
294+
checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
295+
296+
///
297+
double *dd_Vv;
298+
if (c->d_vector[0] == NULL) {
299+
cudaMalloc((void**)&c->d_vector[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 2);
300+
dd_Vv = c->d_vector[0];
301+
} else if (c->d_vector[1] == NULL) {
302+
cudaMalloc((void**)&c->d_vector[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi)* 2);
303+
dd_Vv = c->d_vector[1];
304+
} else {
305+
std::swap(c->d_vector[0], c->d_vector[1]);
306+
dd_Vv = c->d_vector[1];
307+
}
308+
cudaMemcpy(dd_Vv, vector, sizeof(double) * size_t(c->m2n0 * c->nphi * 2),
309+
cudaMemcpyHostToDevice);
310+
checkLastCudaError("[FTK-CUDA] loading vector field data");
311+
312+
///
313+
double *dd_zTop;
314+
if (c->d_jacobian[0] == NULL) {
315+
cudaMalloc((void**)&c->d_jacobian[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
316+
dd_zTop = c->d_jacobian[0];
317+
} else if (c->d_jacobian[1] == NULL) {
318+
cudaMalloc((void**)&c->d_jacobian[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
319+
dd_zTop = c->d_jacobian[1];
320+
} else {
321+
std::swap(c->d_jacobian[0], c->d_jacobian[1]);
322+
dd_zTop = c->d_jacobian[1];
323+
}
324+
cudaMemcpy(dd_zTop, jacobian, sizeof(double) * size_t(c->m2n0 * c->nphi) * 4,
325+
cudaMemcpyHostToDevice);
326+
checkLastCudaError("[FTK-CUDA] loading jacobian field data");
327+
328+
329+
///
330+
double *dd_A;
331+
if (c->d_V[0] == NULL) {
332+
cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
333+
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
334+
dd_A = c->d_V[0];
335+
} else if (c->d_V[1] == NULL) {
336+
cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
337+
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
338+
dd_A = c->d_V[1];
339+
} else {
340+
std::swap(c->d_V[0], c->d_V[1]);
341+
dd_A = c->d_V[1];
342+
}
343+
cudaMemcpy(dd_A, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
344+
cudaMemcpyHostToDevice);
345+
checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
346+
}
347+
348+
void mop_execute(mop_ctx_t *c, int scope, int current_timestep)
349+
{
350+
const int np = c->nphi * c->iphi * c->vphi;
351+
const int mx3n1 = (2 * c->m2n1 + c->m2n0) * np;
352+
const int mx3n2 = (3 * c->m2n2 + 2 * c->m2n1) * np;
353+
// const int mx4n2 = 3 * mx3n2 + 2 * mx3n1;
354+
const int mx4n2_ordinal = mx3n2,
355+
mx4n2_interval = 2 * mx3n2 + 2 * mx3n1;
356+
// fprintf(stderr, "executing timestep %d\n", current_timestep);
357+
358+
size_t ntasks;
359+
if (scope == scope_ordinal) ntasks = mx4n2_ordinal;
360+
else ntasks = mx4n2_interval;
361+
362+
fprintf(stderr, "ntasks=%zu\n", ntasks);
363+
364+
const int maxGridDim = 1024;
365+
const int blockSize = 256;
366+
const int nBlocks = idivup(ntasks, blockSize);
367+
dim3 gridSize;
368+
369+
if (nBlocks >= maxGridDim) gridSize = dim3(idivup(nBlocks, maxGridDim), maxGridDim);
370+
else gridSize = dim3(nBlocks);
371+
372+
sweep_simplices<int, double><<<gridSize, blockSize>>>(
373+
scope, current_timestep,
374+
c->factor,
375+
c->nphi, c->iphi, c->vphi,
376+
c->m2n0, c->m2n1, c->m2n2,
377+
c->d_m2coords, c->d_m2edges, c->d_m2tris,
378+
c->d_psin,
379+
c->d_interpolants,
380+
c->d_V[0], c->d_V[1],
381+
c->d_vector[0], c->d_vector[1],
382+
c->d_jacobian[0], c->d_jacobian[1],
383+
*c->dncps, c->dcps);
384+
cudaDeviceSynchronize();
385+
checkLastCudaError("[FTK-CUDA] sweep_simplicies");
386+
387+
cudaMemcpy(&c->hncps, c->dncps, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
388+
cudaMemset(c->dncps, 0, sizeof(unsigned long long)); // clear the counter
389+
checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 1");
390+
fprintf(stderr, "ncps=%llu\n", c->hncps);
391+
cudaMemcpy(c->hcps, c->dcps, sizeof(cp_t) * c->hncps, cudaMemcpyDeviceToHost);
392+
393+
checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 2");
394+
}
395+
396+
void mop_swap(mop_ctx_t *c)
397+
{
398+
std::swap(c->d_V[0], c->d_V[1]);
399+
std::swap(c->d_vector[0], c->d_vector[1]);
400+
std::swap(c->d_jacobian[0], c->d_jacobian[1]);
401+
}
402+
#endif

0 commit comments

Comments
 (0)