WIP: mpas-o tracer gpu

hguo · hguo · commit 07144fd2c7a2 · 2023-10-15T22:03:20.000-04:00
diff --git a/include/ftk/filters/mpas_ocean_particle_tracker.cuh b/include/ftk/filters/mpas_ocean_particle_tracker.cuh
@@ -0,0 +1,55 @@
+#ifndef _MPAS_OCEAN_PARTICLE_TRACKER_CUH
+#define _MPAS_OCEAN_PARTICLE_TRACKER_CUH
+
+#include <vector>
+#include <set>
+#include <ftk/features/feature_point_lite.hh>
+#include <ftk/mesh/bvh2d.hh>
+
+typedef struct {
+  // gpu
+  int device;
+
+  // mesh info
+  int ncells, nlayers, nverts, max_edges, nattrs;
+
+  // mesh
+  double *d_Xc, *d_Xv; // cell/vertex coordinates
+  int *d_nedges_on_cell, 
+      *d_cells_on_cell,
+      *d_verts_on_cell;
+
+  // time-varying data
+  double *d_V[2], // velocity fields of adjacent timesteps
+         *d_Vv[2], // vertical velocities
+         *d_zTop[2], // top layer depth
+         *d_A[2]; // scalar attrs
+
+  // particle data
+  int nparticles;
+  ftk::feature_point_lite_t *hcps = NULL, *dcps = NULL;
+
+} mop_ctx_t;
+
+void mop_create_ctx(mop_ctx_t **c_, int device=0);
+void mop_destroy_ctx(mop_ctx_t **c_);
+void mop_load_mesh(mop_ctx_t *c,
+    const int ncells, 
+    const int nlayers, 
+    const int nverts, 
+    const int max_edges,
+    const int nattrs,
+    const double *n_edges_on_cell, 
+    const double *cells_on_cell,
+    const double *verts_on_cell);
+
+void mop_load_data(mop_ctx_t *c,
+    const double *V, 
+    const double *Vv, 
+    const double *zTop, 
+    const double *A);
+
+void mop_execute(mop_ctx_t *c, int scope, int current_timestep);
+void mop_swap(mop_ctx_t *c);
+
+#endif
diff --git a/include/ftk/numeric/cross_product.hh b/include/ftk/numeric/cross_product.hh
@@ -6,12 +6,14 @@
 namespace ftk {
 
 template <typename T>
+__device__ __host__
 static inline T cross_product2(const T A[2], const T B[2])
 {
   return A[0]*B[1] - A[1]*B[0];
 }
 
 template <typename T>
+__device__ __host__
 static inline void cross_product(const T A[3], const T B[3], T C[3])
 {
   C[0] = A[1]*B[2] - A[2]*B[1]; 
diff --git a/src/filters/particle_tracer_mpas.cu b/src/filters/particle_tracer_mpas.cu
@@ -1,6 +1,9 @@
 #include <nvfunctional>
 #include <ftk/numeric/mpas.hh>
 #include <ftk/numeric/wachspress_interpolation.hh>
+#include <ftk/filters/mpas_ocean_particle_tracker.cuh>
+
+typedef mop_ctx_t ctx_t;
 
 static const int MAX_VERTS = 10;
 static const int MAX_LAYERS = 100;
@@ -20,11 +23,11 @@ inline bool point_in_cell(
     double xv[][3], // returns vertex coordinates
     const int max_edges,
     const double *Xv,
-    const int *n_edges_on_cell, 
+    const int *nedges_on_cell, 
     const int *verts_on_cell)
 {
   // if (cell < 0) return false;
-  const int nverts = n_edges_on_cell[cell];
+  const int nverts = nedges_on_cell[cell];
   // double xv[MAX_VERTS][3];
 
   for (int i = 0; i < nverts; i ++) {
@@ -46,22 +49,22 @@ static int locate_cell_local( // local search among neighbors
     double xv[][3], // returns vertex coordinates
     const double *Xv,
     const int max_edges,
-    const int *n_edges_on_cell, // also n_verts_on_cell
+    const int *nedges_on_cell, // also n_verts_on_cell
     const int *cells_on_cell,
     const int *verts_on_cell)
 {
   if (curr < 0)
     return -1; // not found
   else if (point_in_cell(
         curr, x, iv, xv, 
-        max_edges, Xv, n_edges_on_cell, verts_on_cell))
+        max_edges, Xv, nedges_on_cell, verts_on_cell))
     return curr;
   else {
-    for (int i = 0; i < n_edges_on_cell[curr]; i ++) {
+    for (int i = 0; i < nedges_on_cell[curr]; i ++) {
       const int cell = cells_on_cell[i + max_edges * curr];
       if (point_in_cell(
             cell, x, iv, xv,
-            max_edges, Xv, n_edges_on_cell, verts_on_cell))
+            max_edges, Xv, nedges_on_cell, verts_on_cell))
         return cell;
     }
     return -1; // not found among neighbors
@@ -76,15 +79,15 @@ static bool mpas_eval(
     double *f,          // scalar attributs
     const double *V,    // velocity field
     const double *Vv,   // vertical velocities
+    const double *zTop, // top layer depth
     const int nattrs,   // number of scalar attributes
     const double *A,    // scalar attributes
     const double *Xv,   // vertex locations
     const int max_edges,
-    const int *n_edges_on_cell, 
+    const int *nedges_on_cell, 
     const int *cells_on_cell,
     const int *verts_on_cell,
     const int nlayers,
-    const double *zTop, // top layer depth
     int &hint_c, 
     int &hint_l)        // hint for searching cell and layer
 {
@@ -93,12 +96,12 @@ static bool mpas_eval(
 
   const int cell = locate_cell_local(hint_c, 
       x, iv, xv, 
-      Xv, max_edges, n_edges_on_cell, 
+      Xv, max_edges, nedges_on_cell, 
       cells_on_cell, verts_on_cell);
   if (cell < 0) return false;
   else hint_c = cell;
 
-  const int nverts = n_edges_on_cell[cell];
+  const int nverts = nedges_on_cell[cell];
 
   // compute weights based on xyzVerts
   double omega[MAX_VERTS]; 
@@ -191,3 +194,209 @@ static bool mpas_eval(
 
   return true;
 }
+
+///////////////////////////
+void mop_create_ctx(mop_ctx_t **c_, int device)
+{
+  *c_ = (ctx_t*)malloc(sizeof(ctx_t));
+  ctx_t *c = *c_;
+  memset(c, 0, sizeof(ctx_t));
+
+  c->device = device;
+  cudaSetDevice(device);
+
+  c->d_Xc = NULL;
+  c->d_Xv = NULL;
+  c->d_nedges_on_cell = NULL;
+  c->d_cells_on_cell = NULL;
+  c->d_verts_on_cell = NULL;
+ 
+  c->d_V[0] = NULL;
+  c->d_V[1] = NULL;
+  c->d_Vv[0] = NULL;
+  c->d_Vv[1] = NULL;
+  c->d_zTop[0] = NULL;
+  c->d_zTop[1] = NULL;
+  c->d_A[0] = NULL;
+  c->d_A[1] = NULL;
+}
+
+void mop_destroy_ctx(mop_ctx_t **c_)
+{
+  ctx_t *c = *c_;
+
+  if (c->d_Xc != NULL) cudaFree(c->d_Xc);
+  // TODO
+
+  free(*c_);
+  *c_ = NULL;
+}
+
+void mop_load_mesh(mop_ctx_t *c,
+    const int ncells, 
+    const int nlayers, 
+    const int nverts, 
+    const int max_edges,
+    const int nattrs,
+    const double *Xc,
+    const double *Xv,
+    const int *nedges_on_cell, 
+    const int *cells_on_cell,
+    const int *verts_on_cell)
+{
+  c->ncells = ncells;
+  c->nlayers = nlayers;
+  c->nverts = nverts;
+  c->max_edges = max_edges;
+  c->nattrs = nattrs;
+
+  cudaMalloc((void**)&c->d_Xc, size_t(ncells) * sizeof(double) * 3);
+  cudaMemcpy(c->d_Xc, Xc, size_t(ncells) * sizeof(double) * 3, cudaMemcpyHostToDevice);
+
+  cudaMalloc((void**)&c->d_Xv, size_t(nverts) * sizeof(double) * 3);
+  cudaMemcpy(c->d_Xv, Xv, size_t(nverts) * sizeof(double) * 3, cudaMemcpyHostToDevice);
+
+  cudaMalloc((void**)&c->d_nedges_on_cell, size_t(ncells) * sizeof(int));
+  cudaMemcpy(c->d_nedges_on_cell, nedges_on_cell, size_t(ncells) * sizeof(int), cudaMemcpyHostToDevice);
+
+  cudaMalloc((void**)&c->d_cells_on_cell, size_t(ncells) * max_edges * sizeof(int));
+  cudaMemcpy(c->d_cells_on_cell, cells_on_cell, size_t(ncells) * max_edges * sizeof(int), cudaMemcpyHostToDevice);
+
+  cudaMalloc((void**)&c->d_verts_on_cell, size_t(nverts) * 3 * sizeof(int));
+  cudaMemcpy(c->d_verts_on_cell, verts_on_cell, size_t(nverts) * 3 * sizeof(int), cudaMemcpyHostToDevice);
+
+  // checkLastCudaError("[FTK-CUDA] loading mpas mesh");
+}
+
+#if 0
+void mop_load_data(mop_ctx_t *c,
+    const double *V, 
+    const double *Vv, 
+    const double *zTop, 
+    const double *A)
+{
+  double *dd_V;
+  if (c->d_V[0] == NULL) {
+    cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
+    checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
+    dd_V = c->d_V[0];
+  } else if (c->d_V[1] == NULL) {
+    cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
+    checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
+    dd_V = c->d_V[1];
+  } else {
+    std::swap(c->d_V[0], c->d_V[1]);
+    dd_V = c->d_V[1];
+  }
+  // fprintf(stderr, "dd=%p, d0=%p, d1=%p, src=%p\n", dd_V, c->d_V[0], c->d_V[1], scalar);
+  cudaMemcpy(dd_V, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi), 
+      cudaMemcpyHostToDevice);
+  checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
+ 
+  /// 
+  double *dd_Vv;
+  if (c->d_vector[0] == NULL) {
+    cudaMalloc((void**)&c->d_vector[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 2);
+    dd_Vv = c->d_vector[0];
+  } else if (c->d_vector[1] == NULL) {
+    cudaMalloc((void**)&c->d_vector[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi)* 2);
+    dd_Vv = c->d_vector[1];
+  } else {
+    std::swap(c->d_vector[0], c->d_vector[1]);
+    dd_Vv = c->d_vector[1];
+  }
+  cudaMemcpy(dd_Vv, vector, sizeof(double) * size_t(c->m2n0 * c->nphi * 2), 
+      cudaMemcpyHostToDevice);
+  checkLastCudaError("[FTK-CUDA] loading vector field data");
+
+  /// 
+  double *dd_zTop;
+  if (c->d_jacobian[0] == NULL) {
+    cudaMalloc((void**)&c->d_jacobian[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
+    dd_zTop = c->d_jacobian[0];
+  } else if (c->d_jacobian[1] == NULL) {
+    cudaMalloc((void**)&c->d_jacobian[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
+    dd_zTop = c->d_jacobian[1];
+  } else {
+    std::swap(c->d_jacobian[0], c->d_jacobian[1]);
+    dd_zTop = c->d_jacobian[1];
+  }
+  cudaMemcpy(dd_zTop, jacobian, sizeof(double) * size_t(c->m2n0 * c->nphi) * 4, 
+      cudaMemcpyHostToDevice);
+  checkLastCudaError("[FTK-CUDA] loading jacobian field data");
+
+
+  /// 
+  double *dd_A;
+  if (c->d_V[0] == NULL) {
+    cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
+    checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
+    dd_A = c->d_V[0];
+  } else if (c->d_V[1] == NULL) {
+    cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
+    checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
+    dd_A = c->d_V[1];
+  } else {
+    std::swap(c->d_V[0], c->d_V[1]);
+    dd_A = c->d_V[1];
+  }
+  cudaMemcpy(dd_A, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi), 
+      cudaMemcpyHostToDevice);
+  checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
+}
+
+void mop_execute(mop_ctx_t *c, int scope, int current_timestep)
+{
+  const int np = c->nphi * c->iphi * c->vphi;
+  const int mx3n1 = (2 * c->m2n1 + c->m2n0) * np;
+  const int mx3n2 = (3 * c->m2n2 + 2 * c->m2n1) * np;
+  // const int mx4n2 = 3 * mx3n2 + 2 * mx3n1;
+  const int mx4n2_ordinal  = mx3n2, 
+            mx4n2_interval = 2 * mx3n2 + 2 * mx3n1;
+  // fprintf(stderr, "executing timestep %d\n", current_timestep);
+
+  size_t ntasks;
+  if (scope == scope_ordinal) ntasks = mx4n2_ordinal;
+  else ntasks = mx4n2_interval;
+  
+  fprintf(stderr, "ntasks=%zu\n", ntasks);
+  
+  const int maxGridDim = 1024;
+  const int blockSize = 256;
+  const int nBlocks = idivup(ntasks, blockSize);
+  dim3 gridSize;
+
+  if (nBlocks >= maxGridDim) gridSize = dim3(idivup(nBlocks, maxGridDim), maxGridDim);
+  else gridSize = dim3(nBlocks);
+
+  sweep_simplices<int, double><<<gridSize, blockSize>>>(
+      scope, current_timestep, 
+      c->factor,
+      c->nphi, c->iphi, c->vphi, 
+      c->m2n0, c->m2n1, c->m2n2, 
+      c->d_m2coords, c->d_m2edges, c->d_m2tris, 
+      c->d_psin,
+      c->d_interpolants, 
+      c->d_V[0], c->d_V[1],
+      c->d_vector[0], c->d_vector[1],
+      c->d_jacobian[0], c->d_jacobian[1], 
+      *c->dncps, c->dcps);
+  cudaDeviceSynchronize();
+  checkLastCudaError("[FTK-CUDA] sweep_simplicies");
+
+  cudaMemcpy(&c->hncps, c->dncps, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
+  cudaMemset(c->dncps, 0, sizeof(unsigned long long)); // clear the counter
+  checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 1");
+  fprintf(stderr, "ncps=%llu\n", c->hncps);
+  cudaMemcpy(c->hcps, c->dcps, sizeof(cp_t) * c->hncps, cudaMemcpyDeviceToHost);
+  
+  checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 2");
+}
+
+void mop_swap(mop_ctx_t *c)
+{
+  std::swap(c->d_V[0], c->d_V[1]);
+  std::swap(c->d_vector[0], c->d_vector[1]);
+  std::swap(c->d_jacobian[0], c->d_jacobian[1]);
+}
+#endif

Original file line number	Diff line number	Diff line change
`@@ -6,12 +6,14 @@`
`6`	`6`	`namespace ftk {`
`7`	`7`
`8`	`8`	`template <typename T>`
	`9`	`+__device__ __host__`
`9`	`10`	`static inline T cross_product2(const T A[2], const T B[2])`
`10`	`11`	`{`
`11`	`12`	`return A[0]B[1] - A[1]B[0];`
`12`	`13`	`}`
`13`	`14`
`14`	`15`	`template <typename T>`
	`16`	`+__device__ __host__`
`15`	`17`	`static inline void cross_product(const T A[3], const T B[3], T C[3])`
`16`	`18`	`{`
`17`	`19`	`C[0] = A[1]B[2] - A[2]B[1];`