Skip to content

Commit

Permalink
WIP: mpas-o tracer gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
hguo committed Oct 16, 2023
1 parent 2eca590 commit 07144fd
Show file tree
Hide file tree
Showing 3 changed files with 276 additions and 10 deletions.
55 changes: 55 additions & 0 deletions include/ftk/filters/mpas_ocean_particle_tracker.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#ifndef _MPAS_OCEAN_PARTICLE_TRACKER_CUH
#define _MPAS_OCEAN_PARTICLE_TRACKER_CUH

#include <vector>
#include <set>
#include <ftk/features/feature_point_lite.hh>
#include <ftk/mesh/bvh2d.hh>

typedef struct {
// gpu
int device;

// mesh info
int ncells, nlayers, nverts, max_edges, nattrs;

// mesh
double *d_Xc, *d_Xv; // cell/vertex coordinates
int *d_nedges_on_cell,
*d_cells_on_cell,
*d_verts_on_cell;

// time-varying data
double *d_V[2], // velocity fields of adjacent timesteps
*d_Vv[2], // vertical velocities
*d_zTop[2], // top layer depth
*d_A[2]; // scalar attrs

// particle data
int nparticles;
ftk::feature_point_lite_t *hcps = NULL, *dcps = NULL;

} mop_ctx_t;

void mop_create_ctx(mop_ctx_t **c_, int device=0);
void mop_destroy_ctx(mop_ctx_t **c_);
void mop_load_mesh(mop_ctx_t *c,
const int ncells,
const int nlayers,
const int nverts,
const int max_edges,
const int nattrs,
const double *n_edges_on_cell,
const double *cells_on_cell,
const double *verts_on_cell);

void mop_load_data(mop_ctx_t *c,
const double *V,
const double *Vv,
const double *zTop,
const double *A);

void mop_execute(mop_ctx_t *c, int scope, int current_timestep);
void mop_swap(mop_ctx_t *c);

#endif
2 changes: 2 additions & 0 deletions include/ftk/numeric/cross_product.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
namespace ftk {

template <typename T>
__device__ __host__
static inline T cross_product2(const T A[2], const T B[2])
{
return A[0]*B[1] - A[1]*B[0];
}

template <typename T>
__device__ __host__
static inline void cross_product(const T A[3], const T B[3], T C[3])
{
C[0] = A[1]*B[2] - A[2]*B[1];
Expand Down
229 changes: 219 additions & 10 deletions src/filters/particle_tracer_mpas.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#include <nvfunctional>
#include <ftk/numeric/mpas.hh>
#include <ftk/numeric/wachspress_interpolation.hh>
#include <ftk/filters/mpas_ocean_particle_tracker.cuh>

typedef mop_ctx_t ctx_t;

static const int MAX_VERTS = 10;
static const int MAX_LAYERS = 100;
Expand All @@ -20,11 +23,11 @@ inline bool point_in_cell(
double xv[][3], // returns vertex coordinates
const int max_edges,
const double *Xv,
const int *n_edges_on_cell,
const int *nedges_on_cell,
const int *verts_on_cell)
{
// if (cell < 0) return false;
const int nverts = n_edges_on_cell[cell];
const int nverts = nedges_on_cell[cell];
// double xv[MAX_VERTS][3];

for (int i = 0; i < nverts; i ++) {
Expand All @@ -46,22 +49,22 @@ static int locate_cell_local( // local search among neighbors
double xv[][3], // returns vertex coordinates
const double *Xv,
const int max_edges,
const int *n_edges_on_cell, // also n_verts_on_cell
const int *nedges_on_cell, // also n_verts_on_cell
const int *cells_on_cell,
const int *verts_on_cell)
{
if (curr < 0)
return -1; // not found
else if (point_in_cell(
curr, x, iv, xv,
max_edges, Xv, n_edges_on_cell, verts_on_cell))
max_edges, Xv, nedges_on_cell, verts_on_cell))
return curr;
else {
for (int i = 0; i < n_edges_on_cell[curr]; i ++) {
for (int i = 0; i < nedges_on_cell[curr]; i ++) {
const int cell = cells_on_cell[i + max_edges * curr];
if (point_in_cell(
cell, x, iv, xv,
max_edges, Xv, n_edges_on_cell, verts_on_cell))
max_edges, Xv, nedges_on_cell, verts_on_cell))
return cell;
}
return -1; // not found among neighbors
Expand All @@ -76,15 +79,15 @@ static bool mpas_eval(
double *f, // scalar attributs
const double *V, // velocity field
const double *Vv, // vertical velocities
const double *zTop, // top layer depth
const int nattrs, // number of scalar attributes
const double *A, // scalar attributes
const double *Xv, // vertex locations
const int max_edges,
const int *n_edges_on_cell,
const int *nedges_on_cell,
const int *cells_on_cell,
const int *verts_on_cell,
const int nlayers,
const double *zTop, // top layer depth
int &hint_c,
int &hint_l) // hint for searching cell and layer
{
Expand All @@ -93,12 +96,12 @@ static bool mpas_eval(

const int cell = locate_cell_local(hint_c,
x, iv, xv,
Xv, max_edges, n_edges_on_cell,
Xv, max_edges, nedges_on_cell,
cells_on_cell, verts_on_cell);
if (cell < 0) return false;
else hint_c = cell;

const int nverts = n_edges_on_cell[cell];
const int nverts = nedges_on_cell[cell];

// compute weights based on xyzVerts
double omega[MAX_VERTS];
Expand Down Expand Up @@ -191,3 +194,209 @@ static bool mpas_eval(

return true;
}

///////////////////////////
void mop_create_ctx(mop_ctx_t **c_, int device)
{
*c_ = (ctx_t*)malloc(sizeof(ctx_t));
ctx_t *c = *c_;
memset(c, 0, sizeof(ctx_t));

c->device = device;
cudaSetDevice(device);

c->d_Xc = NULL;
c->d_Xv = NULL;
c->d_nedges_on_cell = NULL;
c->d_cells_on_cell = NULL;
c->d_verts_on_cell = NULL;

c->d_V[0] = NULL;
c->d_V[1] = NULL;
c->d_Vv[0] = NULL;
c->d_Vv[1] = NULL;
c->d_zTop[0] = NULL;
c->d_zTop[1] = NULL;
c->d_A[0] = NULL;
c->d_A[1] = NULL;
}

void mop_destroy_ctx(mop_ctx_t **c_)
{
ctx_t *c = *c_;

if (c->d_Xc != NULL) cudaFree(c->d_Xc);
// TODO

free(*c_);
*c_ = NULL;
}

void mop_load_mesh(mop_ctx_t *c,
const int ncells,
const int nlayers,
const int nverts,
const int max_edges,
const int nattrs,
const double *Xc,
const double *Xv,
const int *nedges_on_cell,
const int *cells_on_cell,
const int *verts_on_cell)
{
c->ncells = ncells;
c->nlayers = nlayers;
c->nverts = nverts;
c->max_edges = max_edges;
c->nattrs = nattrs;

cudaMalloc((void**)&c->d_Xc, size_t(ncells) * sizeof(double) * 3);
cudaMemcpy(c->d_Xc, Xc, size_t(ncells) * sizeof(double) * 3, cudaMemcpyHostToDevice);

cudaMalloc((void**)&c->d_Xv, size_t(nverts) * sizeof(double) * 3);
cudaMemcpy(c->d_Xv, Xv, size_t(nverts) * sizeof(double) * 3, cudaMemcpyHostToDevice);

cudaMalloc((void**)&c->d_nedges_on_cell, size_t(ncells) * sizeof(int));
cudaMemcpy(c->d_nedges_on_cell, nedges_on_cell, size_t(ncells) * sizeof(int), cudaMemcpyHostToDevice);

cudaMalloc((void**)&c->d_cells_on_cell, size_t(ncells) * max_edges * sizeof(int));
cudaMemcpy(c->d_cells_on_cell, cells_on_cell, size_t(ncells) * max_edges * sizeof(int), cudaMemcpyHostToDevice);

cudaMalloc((void**)&c->d_verts_on_cell, size_t(nverts) * 3 * sizeof(int));
cudaMemcpy(c->d_verts_on_cell, verts_on_cell, size_t(nverts) * 3 * sizeof(int), cudaMemcpyHostToDevice);

// checkLastCudaError("[FTK-CUDA] loading mpas mesh");
}

#if 0
void mop_load_data(mop_ctx_t *c,
const double *V,
const double *Vv,
const double *zTop,
const double *A)
{
double *dd_V;
if (c->d_V[0] == NULL) {
cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
dd_V = c->d_V[0];
} else if (c->d_V[1] == NULL) {
cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
dd_V = c->d_V[1];
} else {
std::swap(c->d_V[0], c->d_V[1]);
dd_V = c->d_V[1];
}
// fprintf(stderr, "dd=%p, d0=%p, d1=%p, src=%p\n", dd_V, c->d_V[0], c->d_V[1], scalar);
cudaMemcpy(dd_V, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
cudaMemcpyHostToDevice);
checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");

///
double *dd_Vv;
if (c->d_vector[0] == NULL) {
cudaMalloc((void**)&c->d_vector[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 2);
dd_Vv = c->d_vector[0];
} else if (c->d_vector[1] == NULL) {
cudaMalloc((void**)&c->d_vector[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi)* 2);
dd_Vv = c->d_vector[1];
} else {
std::swap(c->d_vector[0], c->d_vector[1]);
dd_Vv = c->d_vector[1];
}
cudaMemcpy(dd_Vv, vector, sizeof(double) * size_t(c->m2n0 * c->nphi * 2),
cudaMemcpyHostToDevice);
checkLastCudaError("[FTK-CUDA] loading vector field data");

///
double *dd_zTop;
if (c->d_jacobian[0] == NULL) {
cudaMalloc((void**)&c->d_jacobian[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
dd_zTop = c->d_jacobian[0];
} else if (c->d_jacobian[1] == NULL) {
cudaMalloc((void**)&c->d_jacobian[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi) * 4);
dd_zTop = c->d_jacobian[1];
} else {
std::swap(c->d_jacobian[0], c->d_jacobian[1]);
dd_zTop = c->d_jacobian[1];
}
cudaMemcpy(dd_zTop, jacobian, sizeof(double) * size_t(c->m2n0 * c->nphi) * 4,
cudaMemcpyHostToDevice);
checkLastCudaError("[FTK-CUDA] loading jacobian field data");


///
double *dd_A;
if (c->d_V[0] == NULL) {
cudaMalloc((void**)&c->d_V[0], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0");
dd_A = c->d_V[0];
} else if (c->d_V[1] == NULL) {
cudaMalloc((void**)&c->d_V[1], sizeof(double) * size_t(c->m2n0) * size_t(c->nphi));
checkLastCudaError("[FTK-CUDA] loading scalar field data, malloc 0.1");
dd_A = c->d_V[1];
} else {
std::swap(c->d_V[0], c->d_V[1]);
dd_A = c->d_V[1];
}
cudaMemcpy(dd_A, scalar, sizeof(double) * size_t(c->m2n0 * c->nphi),
cudaMemcpyHostToDevice);
checkLastCudaError("[FTK-CUDA] loading scalar field data, memcpy 0");
}

void mop_execute(mop_ctx_t *c, int scope, int current_timestep)
{
const int np = c->nphi * c->iphi * c->vphi;
const int mx3n1 = (2 * c->m2n1 + c->m2n0) * np;
const int mx3n2 = (3 * c->m2n2 + 2 * c->m2n1) * np;
// const int mx4n2 = 3 * mx3n2 + 2 * mx3n1;
const int mx4n2_ordinal = mx3n2,
mx4n2_interval = 2 * mx3n2 + 2 * mx3n1;
// fprintf(stderr, "executing timestep %d\n", current_timestep);

size_t ntasks;
if (scope == scope_ordinal) ntasks = mx4n2_ordinal;
else ntasks = mx4n2_interval;

fprintf(stderr, "ntasks=%zu\n", ntasks);

const int maxGridDim = 1024;
const int blockSize = 256;
const int nBlocks = idivup(ntasks, blockSize);
dim3 gridSize;

if (nBlocks >= maxGridDim) gridSize = dim3(idivup(nBlocks, maxGridDim), maxGridDim);
else gridSize = dim3(nBlocks);

sweep_simplices<int, double><<<gridSize, blockSize>>>(
scope, current_timestep,
c->factor,
c->nphi, c->iphi, c->vphi,
c->m2n0, c->m2n1, c->m2n2,
c->d_m2coords, c->d_m2edges, c->d_m2tris,
c->d_psin,
c->d_interpolants,
c->d_V[0], c->d_V[1],
c->d_vector[0], c->d_vector[1],
c->d_jacobian[0], c->d_jacobian[1],
*c->dncps, c->dcps);
cudaDeviceSynchronize();
checkLastCudaError("[FTK-CUDA] sweep_simplicies");

cudaMemcpy(&c->hncps, c->dncps, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaMemset(c->dncps, 0, sizeof(unsigned long long)); // clear the counter
checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 1");
fprintf(stderr, "ncps=%llu\n", c->hncps);
cudaMemcpy(c->hcps, c->dcps, sizeof(cp_t) * c->hncps, cudaMemcpyDeviceToHost);

checkLastCudaError("[FTK-CUDA] cuda memcpy device to host, 2");
}

void mop_swap(mop_ctx_t *c)
{
std::swap(c->d_V[0], c->d_V[1]);
std::swap(c->d_vector[0], c->d_vector[1]);
std::swap(c->d_jacobian[0], c->d_jacobian[1]);
}
#endif

0 comments on commit 07144fd

Please sign in to comment.