1
1
#include < nvfunctional>
2
2
#include < ftk/numeric/mpas.hh>
3
3
#include < ftk/numeric/wachspress_interpolation.hh>
4
+ #include < ftk/filters/mpas_ocean_particle_tracker.cuh>
5
+
6
+ typedef mop_ctx_t ctx_t ;
4
7
5
8
static const int MAX_VERTS = 10 ;
6
9
static const int MAX_LAYERS = 100 ;
@@ -20,11 +23,11 @@ inline bool point_in_cell(
20
23
double xv[][3 ], // returns vertex coordinates
21
24
const int max_edges,
22
25
const double *Xv,
23
- const int *n_edges_on_cell ,
26
+ const int *nedges_on_cell ,
24
27
const int *verts_on_cell)
25
28
{
26
29
// if (cell < 0) return false;
27
- const int nverts = n_edges_on_cell [cell];
30
+ const int nverts = nedges_on_cell [cell];
28
31
// double xv[MAX_VERTS][3];
29
32
30
33
for (int i = 0 ; i < nverts; i ++) {
@@ -46,22 +49,22 @@ static int locate_cell_local( // local search among neighbors
46
49
double xv[][3 ], // returns vertex coordinates
47
50
const double *Xv,
48
51
const int max_edges,
49
- const int *n_edges_on_cell , // also n_verts_on_cell
52
+ const int *nedges_on_cell , // also n_verts_on_cell
50
53
const int *cells_on_cell,
51
54
const int *verts_on_cell)
52
55
{
53
56
if (curr < 0 )
54
57
return -1 ; // not found
55
58
else if (point_in_cell (
56
59
curr, x, iv, xv,
57
- max_edges, Xv, n_edges_on_cell , verts_on_cell))
60
+ max_edges, Xv, nedges_on_cell , verts_on_cell))
58
61
return curr;
59
62
else {
60
- for (int i = 0 ; i < n_edges_on_cell [curr]; i ++) {
63
+ for (int i = 0 ; i < nedges_on_cell [curr]; i ++) {
61
64
const int cell = cells_on_cell[i + max_edges * curr];
62
65
if (point_in_cell (
63
66
cell, x, iv, xv,
64
- max_edges, Xv, n_edges_on_cell , verts_on_cell))
67
+ max_edges, Xv, nedges_on_cell , verts_on_cell))
65
68
return cell;
66
69
}
67
70
return -1 ; // not found among neighbors
@@ -76,15 +79,15 @@ static bool mpas_eval(
76
79
double *f, // scalar attributs
77
80
const double *V, // velocity field
78
81
const double *Vv, // vertical velocities
82
+ const double *zTop, // top layer depth
79
83
const int nattrs, // number of scalar attributes
80
84
const double *A, // scalar attributes
81
85
const double *Xv, // vertex locations
82
86
const int max_edges,
83
- const int *n_edges_on_cell ,
87
+ const int *nedges_on_cell ,
84
88
const int *cells_on_cell,
85
89
const int *verts_on_cell,
86
90
const int nlayers,
87
- const double *zTop, // top layer depth
88
91
int &hint_c,
89
92
int &hint_l) // hint for searching cell and layer
90
93
{
@@ -93,12 +96,12 @@ static bool mpas_eval(
93
96
94
97
const int cell = locate_cell_local (hint_c,
95
98
x, iv, xv,
96
- Xv, max_edges, n_edges_on_cell ,
99
+ Xv, max_edges, nedges_on_cell ,
97
100
cells_on_cell, verts_on_cell);
98
101
if (cell < 0 ) return false ;
99
102
else hint_c = cell;
100
103
101
- const int nverts = n_edges_on_cell [cell];
104
+ const int nverts = nedges_on_cell [cell];
102
105
103
106
// compute weights based on xyzVerts
104
107
double omega[MAX_VERTS];
@@ -191,3 +194,209 @@ static bool mpas_eval(
191
194
192
195
return true ;
193
196
}
197
+
198
+ // /////////////////////////
199
+ void mop_create_ctx (mop_ctx_t **c_, int device)
200
+ {
201
+ *c_ = (ctx_t *)malloc (sizeof (ctx_t ));
202
+ ctx_t *c = *c_;
203
+ memset (c, 0 , sizeof (ctx_t ));
204
+
205
+ c->device = device;
206
+ cudaSetDevice (device);
207
+
208
+ c->d_Xc = NULL ;
209
+ c->d_Xv = NULL ;
210
+ c->d_nedges_on_cell = NULL ;
211
+ c->d_cells_on_cell = NULL ;
212
+ c->d_verts_on_cell = NULL ;
213
+
214
+ c->d_V [0 ] = NULL ;
215
+ c->d_V [1 ] = NULL ;
216
+ c->d_Vv [0 ] = NULL ;
217
+ c->d_Vv [1 ] = NULL ;
218
+ c->d_zTop [0 ] = NULL ;
219
+ c->d_zTop [1 ] = NULL ;
220
+ c->d_A [0 ] = NULL ;
221
+ c->d_A [1 ] = NULL ;
222
+ }
223
+
224
+ void mop_destroy_ctx (mop_ctx_t **c_)
225
+ {
226
+ ctx_t *c = *c_;
227
+
228
+ if (c->d_Xc != NULL ) cudaFree (c->d_Xc );
229
+ // TODO
230
+
231
+ free (*c_);
232
+ *c_ = NULL ;
233
+ }
234
+
235
+ void mop_load_mesh (mop_ctx_t *c,
236
+ const int ncells,
237
+ const int nlayers,
238
+ const int nverts,
239
+ const int max_edges,
240
+ const int nattrs,
241
+ const double *Xc,
242
+ const double *Xv,
243
+ const int *nedges_on_cell,
244
+ const int *cells_on_cell,
245
+ const int *verts_on_cell)
246
+ {
247
+ c->ncells = ncells;
248
+ c->nlayers = nlayers;
249
+ c->nverts = nverts;
250
+ c->max_edges = max_edges;
251
+ c->nattrs = nattrs;
252
+
253
+ cudaMalloc ((void **)&c->d_Xc , size_t (ncells) * sizeof (double ) * 3 );
254
+ cudaMemcpy (c->d_Xc , Xc, size_t (ncells) * sizeof (double ) * 3 , cudaMemcpyHostToDevice);
255
+
256
+ cudaMalloc ((void **)&c->d_Xv , size_t (nverts) * sizeof (double ) * 3 );
257
+ cudaMemcpy (c->d_Xv , Xv, size_t (nverts) * sizeof (double ) * 3 , cudaMemcpyHostToDevice);
258
+
259
+ cudaMalloc ((void **)&c->d_nedges_on_cell , size_t (ncells) * sizeof (int ));
260
+ cudaMemcpy (c->d_nedges_on_cell , nedges_on_cell, size_t (ncells) * sizeof (int ), cudaMemcpyHostToDevice);
261
+
262
+ cudaMalloc ((void **)&c->d_cells_on_cell , size_t (ncells) * max_edges * sizeof (int ));
263
+ cudaMemcpy (c->d_cells_on_cell , cells_on_cell, size_t (ncells) * max_edges * sizeof (int ), cudaMemcpyHostToDevice);
264
+
265
+ cudaMalloc ((void **)&c->d_verts_on_cell , size_t (nverts) * 3 * sizeof (int ));
266
+ cudaMemcpy (c->d_verts_on_cell , verts_on_cell, size_t (nverts) * 3 * sizeof (int ), cudaMemcpyHostToDevice);
267
+
268
+ // checkLastCudaError("[FTK-CUDA] loading mpas mesh");
269
+ }
270
+
271
+ #if 0
272
+ void mop_load_data(mop_ctx_t *c,
273
+ const double *V,
274
+ const double *Vv,
275
+ const double *zTop,
276
+ const double *A)
277
+ {
278
+ double *dd_V;
279
+ if (c->d_V[0] == NULL) {
280
+ cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
281
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
282
+ dd_V = c->d_V[0];
283
+ } else if (c->d_V[1] == NULL) {
284
+ cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
285
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
286
+ dd_V = c->d_V[1];
287
+ } else {
288
+ std::swap(c->d_V[0], c->d_V[1]);
289
+ dd_V = c->d_V[1];
290
+ }
291
+ // fprintf(stderr, "dd=%p, d0=%p, d1=%p, src=%p\n", dd_V, c->d_V[0], c->d_V[1], scalar);
292
+ cudaMemcpy(dd_V, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
293
+ cudaMemcpyHostToDevice);
294
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
295
+
296
+ ///
297
+ double *dd_Vv;
298
+ if (c->d_vector[0] == NULL) {
299
+ cudaMalloc((void**)&c->d_vector[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 2);
300
+ dd_Vv = c->d_vector[0];
301
+ } else if (c->d_vector[1] == NULL) {
302
+ cudaMalloc((void**)&c->d_vector[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi)* 2);
303
+ dd_Vv = c->d_vector[1];
304
+ } else {
305
+ std::swap(c->d_vector[0], c->d_vector[1]);
306
+ dd_Vv = c->d_vector[1];
307
+ }
308
+ cudaMemcpy(dd_Vv, vector, sizeof(double) * size_t(c->m2n0 * c->nphi * 2),
309
+ cudaMemcpyHostToDevice);
310
+ checkLastCudaError("[FTK-CUDA] loading vector field data");
311
+
312
+ ///
313
+ double *dd_zTop;
314
+ if (c->d_jacobian[0] == NULL) {
315
+ cudaMalloc((void**)&c->d_jacobian[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
316
+ dd_zTop = c->d_jacobian[0];
317
+ } else if (c->d_jacobian[1] == NULL) {
318
+ cudaMalloc((void**)&c->d_jacobian[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
319
+ dd_zTop = c->d_jacobian[1];
320
+ } else {
321
+ std::swap(c->d_jacobian[0], c->d_jacobian[1]);
322
+ dd_zTop = c->d_jacobian[1];
323
+ }
324
+ cudaMemcpy(dd_zTop, jacobian, sizeof(double) * size_t(c->m2n0 * c->nphi) * 4,
325
+ cudaMemcpyHostToDevice);
326
+ checkLastCudaError("[FTK-CUDA] loading jacobian field data");
327
+
328
+
329
+ ///
330
+ double *dd_A;
331
+ if (c->d_V[0] == NULL) {
332
+ cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
333
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
334
+ dd_A = c->d_V[0];
335
+ } else if (c->d_V[1] == NULL) {
336
+ cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
337
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
338
+ dd_A = c->d_V[1];
339
+ } else {
340
+ std::swap(c->d_V[0], c->d_V[1]);
341
+ dd_A = c->d_V[1];
342
+ }
343
+ cudaMemcpy(dd_A, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
344
+ cudaMemcpyHostToDevice);
345
+ checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
346
+ }
347
+
348
+ void mop_execute(mop_ctx_t *c, int scope, int current_timestep)
349
+ {
350
+ const int np = c->nphi * c->iphi * c->vphi;
351
+ const int mx3n1 = (2 * c->m2n1 + c->m2n0) * np;
352
+ const int mx3n2 = (3 * c->m2n2 + 2 * c->m2n1) * np;
353
+ // const int mx4n2 = 3 * mx3n2 + 2 * mx3n1;
354
+ const int mx4n2_ordinal = mx3n2,
355
+ mx4n2_interval = 2 * mx3n2 + 2 * mx3n1;
356
+ // fprintf(stderr, "executing timestep %d\n", current_timestep);
357
+
358
+ size_t ntasks;
359
+ if (scope == scope_ordinal) ntasks = mx4n2_ordinal;
360
+ else ntasks = mx4n2_interval;
361
+
362
+ fprintf(stderr, "ntasks=%zu\n", ntasks);
363
+
364
+ const int maxGridDim = 1024;
365
+ const int blockSize = 256;
366
+ const int nBlocks = idivup(ntasks, blockSize);
367
+ dim3 gridSize;
368
+
369
+ if (nBlocks >= maxGridDim) gridSize = dim3(idivup(nBlocks, maxGridDim), maxGridDim);
370
+ else gridSize = dim3(nBlocks);
371
+
372
+ sweep_simplices<int, double><<<gridSize, blockSize>>>(
373
+ scope, current_timestep,
374
+ c->factor,
375
+ c->nphi, c->iphi, c->vphi,
376
+ c->m2n0, c->m2n1, c->m2n2,
377
+ c->d_m2coords, c->d_m2edges, c->d_m2tris,
378
+ c->d_psin,
379
+ c->d_interpolants,
380
+ c->d_V[0], c->d_V[1],
381
+ c->d_vector[0], c->d_vector[1],
382
+ c->d_jacobian[0], c->d_jacobian[1],
383
+ *c->dncps, c->dcps);
384
+ cudaDeviceSynchronize();
385
+ checkLastCudaError("[FTK-CUDA] sweep_simplicies");
386
+
387
+ cudaMemcpy(&c->hncps, c->dncps, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
388
+ cudaMemset(c->dncps, 0, sizeof(unsigned long long)); // clear the counter
389
+ checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 1");
390
+ fprintf(stderr, "ncps=%llu\n", c->hncps);
391
+ cudaMemcpy(c->hcps, c->dcps, sizeof(cp_t) * c->hncps, cudaMemcpyDeviceToHost);
392
+
393
+ checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 2");
394
+ }
395
+
396
+ void mop_swap(mop_ctx_t *c)
397
+ {
398
+ std::swap(c->d_V[0], c->d_V[1]);
399
+ std::swap(c->d_vector[0], c->d_vector[1]);
400
+ std::swap(c->d_jacobian[0], c->d_jacobian[1]);
401
+ }
402
+ #endif
0 commit comments