11#include < nvfunctional>
22#include < ftk/numeric/mpas.hh>
33#include < ftk/numeric/wachspress_interpolation.hh>
4+ #include < ftk/filters/mpas_ocean_particle_tracker.cuh>
5+
6+ typedef mop_ctx_t ctx_t ;
47
58static const int MAX_VERTS = 10 ;
69static const int MAX_LAYERS = 100 ;
@@ -20,11 +23,11 @@ inline bool point_in_cell(
2023 double xv[][3 ], // returns vertex coordinates
2124 const int max_edges,
2225 const double *Xv,
23- const int *n_edges_on_cell ,
26+ const int *nedges_on_cell ,
2427 const int *verts_on_cell)
2528{
2629 // if (cell < 0) return false;
27- const int nverts = n_edges_on_cell [cell];
30+ const int nverts = nedges_on_cell [cell];
2831 // double xv[MAX_VERTS][3];
2932
3033 for (int i = 0 ; i < nverts; i ++) {
@@ -46,22 +49,22 @@ static int locate_cell_local( // local search among neighbors
4649 double xv[][3 ], // returns vertex coordinates
4750 const double *Xv,
4851 const int max_edges,
49- const int *n_edges_on_cell , // also n_verts_on_cell
52+ const int *nedges_on_cell , // also n_verts_on_cell
5053 const int *cells_on_cell,
5154 const int *verts_on_cell)
5255{
5356 if (curr < 0 )
5457 return -1 ; // not found
5558 else if (point_in_cell (
5659 curr, x, iv, xv,
57- max_edges, Xv, n_edges_on_cell , verts_on_cell))
60+ max_edges, Xv, nedges_on_cell , verts_on_cell))
5861 return curr;
5962 else {
60- for (int i = 0 ; i < n_edges_on_cell [curr]; i ++) {
63+ for (int i = 0 ; i < nedges_on_cell [curr]; i ++) {
6164 const int cell = cells_on_cell[i + max_edges * curr];
6265 if (point_in_cell (
6366 cell, x, iv, xv,
64- max_edges, Xv, n_edges_on_cell , verts_on_cell))
67+ max_edges, Xv, nedges_on_cell , verts_on_cell))
6568 return cell;
6669 }
6770 return -1 ; // not found among neighbors
@@ -76,15 +79,15 @@ static bool mpas_eval(
7679 double *f, // scalar attributs
7780 const double *V, // velocity field
7881 const double *Vv, // vertical velocities
82+ const double *zTop, // top layer depth
7983 const int nattrs, // number of scalar attributes
8084 const double *A, // scalar attributes
8185 const double *Xv, // vertex locations
8286 const int max_edges,
83- const int *n_edges_on_cell ,
87+ const int *nedges_on_cell ,
8488 const int *cells_on_cell,
8589 const int *verts_on_cell,
8690 const int nlayers,
87- const double *zTop, // top layer depth
8891 int &hint_c,
8992 int &hint_l) // hint for searching cell and layer
9093{
@@ -93,12 +96,12 @@ static bool mpas_eval(
9396
9497 const int cell = locate_cell_local (hint_c,
9598 x, iv, xv,
96- Xv, max_edges, n_edges_on_cell ,
99+ Xv, max_edges, nedges_on_cell ,
97100 cells_on_cell, verts_on_cell);
98101 if (cell < 0 ) return false ;
99102 else hint_c = cell;
100103
101- const int nverts = n_edges_on_cell [cell];
104+ const int nverts = nedges_on_cell [cell];
102105
103106 // compute weights based on xyzVerts
104107 double omega[MAX_VERTS];
@@ -191,3 +194,209 @@ static bool mpas_eval(
191194
192195 return true ;
193196}
197+
198+ // /////////////////////////
199+ void mop_create_ctx (mop_ctx_t **c_, int device)
200+ {
201+ *c_ = (ctx_t *)malloc (sizeof (ctx_t ));
202+ ctx_t *c = *c_;
203+ memset (c, 0 , sizeof (ctx_t ));
204+
205+ c->device = device;
206+ cudaSetDevice (device);
207+
208+ c->d_Xc = NULL ;
209+ c->d_Xv = NULL ;
210+ c->d_nedges_on_cell = NULL ;
211+ c->d_cells_on_cell = NULL ;
212+ c->d_verts_on_cell = NULL ;
213+
214+ c->d_V [0 ] = NULL ;
215+ c->d_V [1 ] = NULL ;
216+ c->d_Vv [0 ] = NULL ;
217+ c->d_Vv [1 ] = NULL ;
218+ c->d_zTop [0 ] = NULL ;
219+ c->d_zTop [1 ] = NULL ;
220+ c->d_A [0 ] = NULL ;
221+ c->d_A [1 ] = NULL ;
222+ }
223+
224+ void mop_destroy_ctx (mop_ctx_t **c_)
225+ {
226+ ctx_t *c = *c_;
227+
228+ if (c->d_Xc != NULL ) cudaFree (c->d_Xc );
229+ // TODO
230+
231+ free (*c_);
232+ *c_ = NULL ;
233+ }
234+
235+ void mop_load_mesh (mop_ctx_t *c,
236+ const int ncells,
237+ const int nlayers,
238+ const int nverts,
239+ const int max_edges,
240+ const int nattrs,
241+ const double *Xc,
242+ const double *Xv,
243+ const int *nedges_on_cell,
244+ const int *cells_on_cell,
245+ const int *verts_on_cell)
246+ {
247+ c->ncells = ncells;
248+ c->nlayers = nlayers;
249+ c->nverts = nverts;
250+ c->max_edges = max_edges;
251+ c->nattrs = nattrs;
252+
253+ cudaMalloc ((void **)&c->d_Xc , size_t (ncells) * sizeof (double ) * 3 );
254+ cudaMemcpy (c->d_Xc , Xc, size_t (ncells) * sizeof (double ) * 3 , cudaMemcpyHostToDevice);
255+
256+ cudaMalloc ((void **)&c->d_Xv , size_t (nverts) * sizeof (double ) * 3 );
257+ cudaMemcpy (c->d_Xv , Xv, size_t (nverts) * sizeof (double ) * 3 , cudaMemcpyHostToDevice);
258+
259+ cudaMalloc ((void **)&c->d_nedges_on_cell , size_t (ncells) * sizeof (int ));
260+ cudaMemcpy (c->d_nedges_on_cell , nedges_on_cell, size_t (ncells) * sizeof (int ), cudaMemcpyHostToDevice);
261+
262+ cudaMalloc ((void **)&c->d_cells_on_cell , size_t (ncells) * max_edges * sizeof (int ));
263+ cudaMemcpy (c->d_cells_on_cell , cells_on_cell, size_t (ncells) * max_edges * sizeof (int ), cudaMemcpyHostToDevice);
264+
265+ cudaMalloc ((void **)&c->d_verts_on_cell , size_t (nverts) * 3 * sizeof (int ));
266+ cudaMemcpy (c->d_verts_on_cell , verts_on_cell, size_t (nverts) * 3 * sizeof (int ), cudaMemcpyHostToDevice);
267+
268+ // checkLastCudaError("[FTK-CUDA] loading mpas mesh");
269+ }
270+
271+ #if 0
272+ void mop_load_data(mop_ctx_t *c,
273+ const double *V,
274+ const double *Vv,
275+ const double *zTop,
276+ const double *A)
277+ {
278+ double *dd_V;
279+ if (c->d_V[0] == NULL) {
280+ cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
281+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
282+ dd_V = c->d_V[0];
283+ } else if (c->d_V[1] == NULL) {
284+ cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
285+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
286+ dd_V = c->d_V[1];
287+ } else {
288+ std::swap(c->d_V[0], c->d_V[1]);
289+ dd_V = c->d_V[1];
290+ }
291+ // fprintf(stderr, "dd=%p, d0=%p, d1=%p, src=%p\n", dd_V, c->d_V[0], c->d_V[1], scalar);
292+ cudaMemcpy(dd_V, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
293+ cudaMemcpyHostToDevice);
294+ checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
295+
296+ ///
297+ double *dd_Vv;
298+ if (c->d_vector[0] == NULL) {
299+ cudaMalloc((void**)&c->d_vector[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 2);
300+ dd_Vv = c->d_vector[0];
301+ } else if (c->d_vector[1] == NULL) {
302+ cudaMalloc((void**)&c->d_vector[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi)* 2);
303+ dd_Vv = c->d_vector[1];
304+ } else {
305+ std::swap(c->d_vector[0], c->d_vector[1]);
306+ dd_Vv = c->d_vector[1];
307+ }
308+ cudaMemcpy(dd_Vv, vector, sizeof(double) * size_t(c->m2n0 * c->nphi * 2),
309+ cudaMemcpyHostToDevice);
310+ checkLastCudaError("[FTK-CUDA] loading vector field data");
311+
312+ ///
313+ double *dd_zTop;
314+ if (c->d_jacobian[0] == NULL) {
315+ cudaMalloc((void**)&c->d_jacobian[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
316+ dd_zTop = c->d_jacobian[0];
317+ } else if (c->d_jacobian[1] == NULL) {
318+ cudaMalloc((void**)&c->d_jacobian[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
319+ dd_zTop = c->d_jacobian[1];
320+ } else {
321+ std::swap(c->d_jacobian[0], c->d_jacobian[1]);
322+ dd_zTop = c->d_jacobian[1];
323+ }
324+ cudaMemcpy(dd_zTop, jacobian, sizeof(double) * size_t(c->m2n0 * c->nphi) * 4,
325+ cudaMemcpyHostToDevice);
326+ checkLastCudaError("[FTK-CUDA] loading jacobian field data");
327+
328+
329+ ///
330+ double *dd_A;
331+ if (c->d_V[0] == NULL) {
332+ cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
333+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
334+ dd_A = c->d_V[0];
335+ } else if (c->d_V[1] == NULL) {
336+ cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
337+ checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
338+ dd_A = c->d_V[1];
339+ } else {
340+ std::swap(c->d_V[0], c->d_V[1]);
341+ dd_A = c->d_V[1];
342+ }
343+ cudaMemcpy(dd_A, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
344+ cudaMemcpyHostToDevice);
345+ checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
346+ }
347+
348+ void mop_execute(mop_ctx_t *c, int scope, int current_timestep)
349+ {
350+ const int np = c->nphi * c->iphi * c->vphi;
351+ const int mx3n1 = (2 * c->m2n1 + c->m2n0) * np;
352+ const int mx3n2 = (3 * c->m2n2 + 2 * c->m2n1) * np;
353+ // const int mx4n2 = 3 * mx3n2 + 2 * mx3n1;
354+ const int mx4n2_ordinal = mx3n2,
355+ mx4n2_interval = 2 * mx3n2 + 2 * mx3n1;
356+ // fprintf(stderr, "executing timestep %d\n", current_timestep);
357+
358+ size_t ntasks;
359+ if (scope == scope_ordinal) ntasks = mx4n2_ordinal;
360+ else ntasks = mx4n2_interval;
361+
362+ fprintf(stderr, "ntasks=%zu\n", ntasks);
363+
364+ const int maxGridDim = 1024;
365+ const int blockSize = 256;
366+ const int nBlocks = idivup(ntasks, blockSize);
367+ dim3 gridSize;
368+
369+ if (nBlocks >= maxGridDim) gridSize = dim3(idivup(nBlocks, maxGridDim), maxGridDim);
370+ else gridSize = dim3(nBlocks);
371+
372+ sweep_simplices<int, double><<<gridSize, blockSize>>>(
373+ scope, current_timestep,
374+ c->factor,
375+ c->nphi, c->iphi, c->vphi,
376+ c->m2n0, c->m2n1, c->m2n2,
377+ c->d_m2coords, c->d_m2edges, c->d_m2tris,
378+ c->d_psin,
379+ c->d_interpolants,
380+ c->d_V[0], c->d_V[1],
381+ c->d_vector[0], c->d_vector[1],
382+ c->d_jacobian[0], c->d_jacobian[1],
383+ *c->dncps, c->dcps);
384+ cudaDeviceSynchronize();
385+ checkLastCudaError("[FTK-CUDA] sweep_simplicies");
386+
387+ cudaMemcpy(&c->hncps, c->dncps, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
388+ cudaMemset(c->dncps, 0, sizeof(unsigned long long)); // clear the counter
389+ checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 1");
390+ fprintf(stderr, "ncps=%llu\n", c->hncps);
391+ cudaMemcpy(c->hcps, c->dcps, sizeof(cp_t) * c->hncps, cudaMemcpyDeviceToHost);
392+
393+ checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 2");
394+ }
395+
396+ void mop_swap(mop_ctx_t *c)
397+ {
398+ std::swap(c->d_V[0], c->d_V[1]);
399+ std::swap(c->d_vector[0], c->d_vector[1]);
400+ std::swap(c->d_jacobian[0], c->d_jacobian[1]);
401+ }
402+ #endif
0 commit comments