From 1f038e767011e20100af6ab0db02c69cf7ebe55c Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Sat, 15 Jun 2024 21:52:24 -0700
Subject: [PATCH] Multi-level composite solver using hypre (#3987)

This adds a multi-level composite solver for cell-centered ABecLaplacian
using hypre. Currently it uses the sstruct interface and BoomerAMG.
What's different from our existing hypre solvers is that this one does
not use amrex::MLMG at all and it can do multi-level composite solve
entirely with hypre, whereas all previous solvers use hypre as a
"bottom" solver of MLMG's v-cycle (even though the v-cycle could have
only one level).
---
 Src/Base/AMReX_GpuDevice.H                    |    4 +
 Src/Base/AMReX_GpuDevice.cpp                  |   13 +
 Src/Extern/HYPRE/AMReX_HypreMLABecLap.H       |  134 ++
 Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp     | 1464 +++++++++++++++++
 Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H  |  308 ++++
 Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H  |  649 ++++++++
 Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H     |  298 ++++
 Src/Extern/HYPRE/CMakeLists.txt               |    4 +
 Src/Extern/HYPRE/Make.package                 |    6 +-
 .../LinearSolvers/ABecLaplacian_C/GNUmakefile |    5 +
 Tests/LinearSolvers/ABecLaplacian_C/MyTest.H  |    7 +
 .../LinearSolvers/ABecLaplacian_C/MyTest.cpp  |  131 ++
 .../ABecLaplacian_C/inputs-mlhypre            |   22 +
 13 files changed, 3043 insertions(+), 2 deletions(-)
 create mode 100644 Src/Extern/HYPRE/AMReX_HypreMLABecLap.H
 create mode 100644 Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp
 create mode 100644 Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H
 create mode 100644 Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H
 create mode 100644 Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H
 create mode 100644 Tests/LinearSolvers/ABecLaplacian_C/inputs-mlhypre
diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index 45570310347..7c17c918a70 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -314,6 +314,10 @@ dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
 
 #endif
 
+#ifdef AMREX_USE_HYPRE
+void hypreSynchronize ();
+#endif
+
 }
 
 #endif
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 193b73b7af9..fa190a22c1c 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -5,6 +5,10 @@
 #include <AMReX_Print.H>
 #include <AMReX_GpuLaunch.H>
 
+#ifdef AMREX_USE_HYPRE
+#  include <_hypre_utilities.h>
+#endif
+
 #include <iostream>
 #include <map>
 #include <algorithm>
@@ -1029,4 +1033,13 @@ Device::profilerStop ()
 #endif
 }
 
+#ifdef AMREX_USE_HYPRE
+void hypreSynchronize () // does nothing unless AMREX_USE_GPU is defined
+{
+#ifdef AMREX_USE_GPU
+    hypre_SyncCudaDevice(hypre_handle()); // despite "Cuda" in the name, works for non-cuda device too
+#endif
+}
+#endif
+
 }
diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H
new file mode 100644
index 00000000000..04147207d36
--- /dev/null
+++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.H
@@ -0,0 +1,134 @@
+#ifndef AMREX_HYPRE_ML_ABECLAP_H_
+#define AMREX_HYPRE_ML_ABECLAP_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Geometry.H>
+#include <AMReX_LO_BCTYPES.H>
+#include <AMReX_MLMGBndry.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_MultiMask.H>
+
+#include <HYPRE_sstruct_ls.h>
+
+#include <limits>
+#include <utility>
+
+namespace amrex {
+
+enum struct HypreSolverID { // selects the hypre solver used by HypreMLABecLap
+    BoomerAMG, SSAMG // SSAMG is rejected by the constructor unless AMREX_FEATURE_HYPRE_SSAMG is defined
+};
+
+// single component only, cell centered only
+
+class HypreMLABecLap // multi-level composite cell-centered ABecLaplacian solver on hypre's sstruct interface
+{
+public:
+    HypreMLABecLap (HypreMLABecLap const&) = delete; // neither copyable nor movable
+    HypreMLABecLap (HypreMLABecLap &&) = delete;
+    HypreMLABecLap& operator= (HypreMLABecLap const&) = delete;
+    HypreMLABecLap& operator= (HypreMLABecLap &&) = delete;
+
+    HypreMLABecLap (Vector<Geometry> a_geom,
+                    Vector<BoxArray> a_grids,
+                    Vector<DistributionMapping> a_dmap,
+                    HypreSolverID a_hypre_solver_id,
+                    std::string a_parmparse_prefix = "hypre_mlabeclap");
+
+    ~HypreMLABecLap ();
+
+    void setVerbose (int v) { m_verbose = v; } // default is 0
+    void setMaxIter (int v) { m_maxiter = v; } // default is 200
+
+    void setup (Real a_ascalar, Real a_bscalar,
+                Vector<MultiFab const*> const& a_acoefs,
+                Vector<Array<MultiFab const*,AMREX_SPACEDIM>> const& a_bcoefs,
+                Array<LinOpBCType,AMREX_SPACEDIM> const& a_lobc,
+                Array<LinOpBCType,AMREX_SPACEDIM> const& a_hibc,
+                Vector<MultiFab const*> const& a_levelbcdata,
+                std::pair<MultiFab const*, IntVect> const& a_coarse_bc = {nullptr, IntVect(0)});
+
+    void solve (Vector<MultiFab*> const& a_sol, Vector<MultiFab const*> const& a_rhs,
+                Real a_reltol, Real a_abstol);
+
+    // TODO: update? updateDirichletBC? or updateCoeffs?
+
+    // public for cuda (NOTE(review): presumably due to extended-lambda restrictions -- confirm)
+
+    void commBCoefs (int flev, Array<MultiFab const*,AMREX_SPACEDIM> const& a_bcoefs);
+    void commBCoefs_local (int flev, Array<MultiFab const*,AMREX_SPACEDIM> const& a_bcoefs,
+                           Vector<FabArrayBase::CopyComTag> const& tags);
+
+private:
+
+    void addNonStencilEntriesToGraph (); // called by the constructor before the graph is assembled
+
+    int m_verbose = 0;
+    int m_maxiter = 200;
+
+    Vector<Geometry> m_geom;
+    Vector<BoxArray> m_grids;
+    Vector<DistributionMapping> m_dmap;
+    std::string m_parmparse_prefix;
+    int m_nlevels = 0;
+    MPI_Comm m_comm = MPI_COMM_NULL;
+
+    Vector<IntVect> m_ref_ratio; // m_nlevels-1 entries; [i] is the ratio between levels i and i+1
+
+    Real m_ascalar = std::numeric_limits<Real>::max();
+    Real m_bscalar = std::numeric_limits<Real>::max();
+    Array<LinOpBCType,AMREX_SPACEDIM> m_lobc{AMREX_D_DECL(LinOpBCType::bogus,
+                                                          LinOpBCType::bogus,
+                                                          LinOpBCType::bogus)};
+    Array<LinOpBCType,AMREX_SPACEDIM> m_hibc{AMREX_D_DECL(LinOpBCType::bogus,
+                                                          LinOpBCType::bogus,
+                                                          LinOpBCType::bogus)};
+
+    Vector<std::unique_ptr<MLMGBndry>> m_bndry;
+    Vector<std::unique_ptr<BndryRegister>> m_bndry_rhs;
+    Vector<iMultiFab> m_fine_masks; // per coarse level: 0 on uncovered cells, 1 on cells covered by the finer level
+
+    // For coarse cells at coarse/fine interface. The vector is for AMR
+    // levels.
+    Vector<iMultiFab> m_c2f_offset_from; // offset for sparse coarse from-cells
+    Vector<LayoutData<int>> m_c2f_total_from; // # of coarse from-cells w/ c2f entries
+    Vector<iMultiFab> m_c2f_nentries; // # of non-stencil entries
+    Vector<iMultiFab> m_c2f_offset_to; // offset for sparse to-cells, including fine (and coarse in 3d) cells
+    Vector<LayoutData<int>> m_c2f_total_to; // total sum of non-stencil entries in a Box
+
+    // B coefficients at coarse/fine interface
+    Vector<Array<iMultiFab,AMREX_SPACEDIM>> m_offset_cf_bcoefs;
+    Vector<Array<LayoutData<std::unique_ptr<Gpu::DeviceVector<Real>>>,AMREX_SPACEDIM>> m_cf_bcoefs;
+
+#ifdef AMREX_USE_GPU
+    template <class T> using HostVector = Gpu::PinnedVector<T>;
+#else
+    template <class T> using HostVector = Vector<T>;
+#endif
+
+    // For fine cells at coarse/fine interface. The non-stencil entries are
+    // from fine to coarse. The outer vector is for AMR levels.
+    Vector<HostVector<int>> m_f2c_bno; // local box number
+    Vector<HostVector<IntVect>> m_f2c_cell; // fine cell
+    Vector<Vector<HYPRE_Int>> m_f2c_nentries; // # of non-stencil entries
+    Vector<HostVector<std::size_t>> m_f2c_offset; // offset into m_f2c_values
+    Vector<HostVector<Real>> m_f2c_values; // values for non-stencil entries
+
+    HYPRE_SStructGrid    m_ss_grid = nullptr;
+    HYPRE_SStructStencil m_ss_stencil = nullptr;
+    HYPRE_SStructGraph   m_ss_graph = nullptr;
+    HYPRE_SStructSolver  m_ss_solver = nullptr; // destroyed with HYPRE_SStructSSAMGDestroy in the destructor
+    HYPRE_SStructSolver  m_ss_precond = nullptr;
+    HYPRE_SStructMatrix  m_ss_A = nullptr;
+    HYPRE_SStructVector  m_ss_x = nullptr;
+    HYPRE_SStructVector  m_ss_b = nullptr;
+
+    HYPRE_Solver m_solver = nullptr; // destroyed with HYPRE_BoomerAMGDestroy in the destructor
+
+    HypreSolverID m_hypre_solver_id = HypreSolverID::BoomerAMG;
+    HYPRE_Int m_hypre_object_type = HYPRE_PARCSR; // HYPRE_PARCSR for BoomerAMG, HYPRE_SSTRUCT otherwise
+};
+
+}
+
+#endif
diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp
new file mode 100644
index 00000000000..fb01d72eaff
--- /dev/null
+++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap.cpp
@@ -0,0 +1,1464 @@
+#include <AMReX_HypreMLABecLap.H>
+#include <AMReX_MultiFabUtil.H>
+#include <AMReX_HypreMLABecLap_K.H>
+
+#include <functional>
+#include <numeric>
+#include <tuple>
+
+namespace amrex {
+
+HypreMLABecLap::HypreMLABecLap (Vector<Geometry> a_geom, // builds the sstruct grid, stencil, graph and an initialized matrix
+                                Vector<BoxArray> a_grids,
+                                Vector<DistributionMapping> a_dmap,
+                                HypreSolverID a_hypre_solver_id,
+                                std::string a_parmparse_prefix)
+    : m_geom(std::move(a_geom)),
+      m_grids(std::move(a_grids)),
+      m_dmap(std::move(a_dmap)),
+      m_parmparse_prefix(std::move(a_parmparse_prefix)),
+      m_nlevels(int(m_grids.size())),
+      m_comm(ParallelContext::CommunicatorSub()),
+      m_hypre_solver_id(a_hypre_solver_id),
+      m_hypre_object_type((a_hypre_solver_id == HypreSolverID::BoomerAMG) ?
+                          HYPRE_PARCSR : HYPRE_SSTRUCT)
+{
+    BL_PROFILE("HypreMLABecLap::HypreMLABecLap");
+
+#ifndef AMREX_FEATURE_HYPRE_SSAMG
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_hypre_solver_id == HypreSolverID::BoomerAMG,
+                                     "HypreMLABecLap only supports BoomerAMG ifndef AMREX_FEATURE_HYPRE_SSAMG");
+#endif
+
+    m_ref_ratio.resize(m_nlevels-1); // one ratio per pair of adjacent levels
+    for (int ilev = 0; ilev < m_nlevels-1; ++ilev) {
+        m_ref_ratio[ilev] = m_geom[ilev+1].Domain().length()
+            /               m_geom[ilev  ].Domain().length();
+        AMREX_ASSERT(m_geom[ilev+1].Domain() == amrex::refine(m_geom[ilev].Domain(),
+                                                              m_ref_ratio[ilev])
+                     && m_ref_ratio[ilev].allLE(4));
+    }
+
+    m_bndry.resize(m_nlevels);
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        int const ncomp = 1;
+        m_bndry[ilev] = std::make_unique<MLMGBndry>(m_grids[ilev],
+                                                    m_dmap[ilev],
+                                                    ncomp,
+                                                    m_geom[ilev]);
+    }
+
+    m_bndry_rhs.resize(m_nlevels);
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        int const ncomp = 1;
+        m_bndry_rhs[ilev]= std::make_unique<BndryRegister>(m_grids[ilev],
+                                                           m_dmap[ilev],
+                                                           1, 0, 0, ncomp); // in_rad, out_rad, extend_rad (same order as in setup())
+    }
+
+    m_fine_masks.resize(m_nlevels-1);
+    for (int ilev = 0; ilev < m_nlevels-1; ++ilev) {
+        m_fine_masks[ilev] = amrex::makeFineMask(m_grids[ilev], m_dmap[ilev], IntVect(1),
+                                                 m_grids[ilev+1], m_ref_ratio[ilev],
+                                                 m_geom[ilev].periodicity(),
+                                                 0, 1); // 0: uncovered coarse cell, 1: covered by fine cells
+    }
+
+    m_c2f_offset_from.resize(m_nlevels-1);
+    m_c2f_total_from.resize(m_nlevels-1);
+    m_c2f_nentries.resize(m_nlevels-1);
+    m_c2f_offset_to.resize(m_nlevels-1);
+    m_c2f_total_to.resize(m_nlevels-1);
+    for (int ilev = 0; ilev < m_nlevels-1; ++ilev) {
+        m_c2f_offset_from[ilev].define(m_grids[ilev], m_dmap[ilev], 1, 0);
+        m_c2f_total_from[ilev].define(m_grids[ilev], m_dmap[ilev]);
+        m_c2f_nentries[ilev].define(m_grids[ilev], m_dmap[ilev], 1, 0);
+        m_c2f_offset_to[ilev].define(m_grids[ilev], m_dmap[ilev], 1, 0);
+        m_c2f_total_to[ilev].define(m_grids[ilev], m_dmap[ilev]);
+    }
+
+    m_offset_cf_bcoefs.resize(m_nlevels-1);
+    m_cf_bcoefs.resize(m_nlevels-1);
+
+    static_assert(std::is_same_v<HYPRE_Real,Real>,
+                  "HYPRE_Real and amrex::Real must be the same type");
+
+    HYPRE_SStructGridCreate(m_comm, AMREX_SPACEDIM, m_nlevels, &m_ss_grid); // each AMR level is one sstruct part
+
+    constexpr HYPRE_Int nvars = 1;
+    constexpr HYPRE_Int ivar = 0;
+
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        // NOTE(review): which hypre solver restricts periodic domains to
+        // power-of-2 sizes? No such restriction is checked here.
+        if (m_geom[ilev].isAnyPeriodic()) {
+            Array<HYPRE_Int,AMREX_SPACEDIM> periodic;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                periodic[idim] = m_geom[ilev].isPeriodic(idim)
+                    ? m_geom[ilev].Domain().length(idim) : 0; // period length, or 0 for a non-periodic direction
+            }
+            HYPRE_SStructGridSetPeriodic(m_ss_grid, ilev, periodic.data());
+        }
+
+        AMREX_ASSERT(m_grids[ilev].ixType().cellCentered());
+
+        for (MFIter mfi(m_grids[ilev], m_dmap[ilev], MFItInfo().DisableDeviceSync());
+             mfi.isValid(); ++mfi)
+        {
+            Box const& b = mfi.validbox();
+            Array<HYPRE_Int,AMREX_SPACEDIM> lo{AMREX_D_DECL(b.smallEnd(0),
+                                                            b.smallEnd(1),
+                                                            b.smallEnd(2))};
+            Array<HYPRE_Int,AMREX_SPACEDIM> hi{AMREX_D_DECL(b.bigEnd(0),
+                                                            b.bigEnd(1),
+                                                            b.bigEnd(2))};
+            HYPRE_SStructGridSetExtents(m_ss_grid, ilev, lo.data(), hi.data());
+        }
+
+        auto vartype = HYPRE_SSTRUCT_VARIABLE_CELL;
+        HYPRE_SStructVariable vars[nvars] = {vartype};
+        HYPRE_SStructGridSetVariables(m_ss_grid, ilev, nvars, vars);
+    }
+
+    HYPRE_SStructGridAssemble(m_ss_grid);
+
+#if (AMREX_SPACEDIM == 2)
+    HYPRE_Int cross_stencil_offset[5][2] = {{ 0,  0},
+                                            {-1,  0},
+                                            { 1,  0},
+                                            { 0, -1},
+                                            { 0,  1}};
+#elif (AMREX_SPACEDIM == 3)
+    HYPRE_Int cross_stencil_offset[7][3] = {{ 0,  0,  0},
+                                            {-1,  0,  0},
+                                            { 1,  0,  0},
+                                            { 0, -1,  0},
+                                            { 0,  1,  0},
+                                            { 0,  0, -1},
+                                            { 0,  0,  1}};
+#endif
+
+    HYPRE_SStructStencilCreate(AMREX_SPACEDIM, 2*AMREX_SPACEDIM+1, &m_ss_stencil); // center plus two neighbors per direction
+    for (HYPRE_Int i = 0; i < 2*AMREX_SPACEDIM+1; ++i) {
+        HYPRE_SStructStencilSetEntry(m_ss_stencil, i, cross_stencil_offset[i], ivar);
+    }
+
+    HYPRE_SStructGraphCreate(m_comm, m_ss_grid, &m_ss_graph);
+    HYPRE_SStructGraphSetObjectType(m_ss_graph, m_hypre_object_type);
+
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        HYPRE_SStructGraphSetStencil(m_ss_graph, ilev, ivar, m_ss_stencil);
+    }
+
+    addNonStencilEntriesToGraph(); // coarse/fine couplings that the cross stencil cannot express
+
+    HYPRE_SStructGraphAssemble(m_ss_graph);
+
+    HYPRE_SStructMatrixCreate(m_comm, m_ss_graph, &m_ss_A);
+    HYPRE_SStructMatrixSetObjectType(m_ss_A, m_hypre_object_type);
+    HYPRE_SStructMatrixInitialize(m_ss_A); // matrix coefficients are not set in this constructor
+}
+
+HypreMLABecLap::~HypreMLABecLap ()
+{
+    HYPRE_SStructGridDestroy(m_ss_grid);
+    HYPRE_SStructStencilDestroy(m_ss_stencil);
+    HYPRE_SStructGraphDestroy(m_ss_graph);
+#if 0
+    if (m_ss_precond) {
+        HYPRE_SStructSolverDestroy(m_ss_precond);
+    }
+#endif
+    HYPRE_SStructMatrixDestroy(m_ss_A);
+    if (m_ss_x) {
+        HYPRE_SStructVectorDestroy(m_ss_x);
+    }
+    if (m_ss_b) {
+        HYPRE_SStructVectorDestroy(m_ss_b);
+    }
+    // Test each solver handle on its own: the SSAMG branch releases
+    // m_ss_solver, so gating it on m_solver could leak that solver.
+#ifdef AMREX_FEATURE_HYPRE_SSAMG
+    if (m_hypre_solver_id == HypreSolverID::SSAMG) {
+        if (m_ss_solver) {
+            HYPRE_SStructSSAMGDestroy(m_ss_solver);
+        }
+    } else
+#endif
+    {
+        if (m_solver) {
+            HYPRE_BoomerAMGDestroy(m_solver);
+        }
+    }
+}
+
+void HypreMLABecLap::addNonStencilEntriesToGraph () // adds coarse/fine graph couplings and fills the c2f/f2c bookkeeping
+{
+    BL_PROFILE("HypreMLABecLap::addNonStencilEntriesToGraph");
+
+    Vector<std::tuple<int,int,IntVect,int,IntVect>> entries; // (from level, local box index, from cell, to level, to cell)
+
+    for (int ilev = 1; ilev < m_nlevels; ++ilev) {
+        int const clev = ilev-1;
+        int const flev = ilev;
+
+#if (AMREX_SPACEDIM == 3)
+        Box const& cgdomain = m_geom[clev].growPeriodicDomain(1);
+#endif
+
+        IntVect const& refratio = m_ref_ratio[clev];
+
+        auto const& fine_mask = m_fine_masks[clev];
+
+        for (MFIter mfi(fine_mask, MFItInfo().DisableDeviceSync()); // pass 1: entries whose from-cell is coarse
+             mfi.isValid(); ++mfi)
+        {
+            auto const lidx = mfi.LocalIndex();
+            int c2f_total_from = 0;
+            Long c2f_total_to = 0;
+#ifdef AMREX_USE_GPU
+            IArrayBox h_mask(fine_mask[mfi].box(), 1, The_Pinned_Arena());
+            IArrayBox h_c2f_offset_from(m_c2f_offset_from[clev][mfi].box(), 1, The_Pinned_Arena());
+            IArrayBox h_c2f_offset_to(m_c2f_offset_to[clev][mfi].box(), 1, The_Pinned_Arena());
+            IArrayBox h_c2f_nentries(m_c2f_nentries[clev][mfi].box(), 1, The_Pinned_Arena());
+            Gpu::dtoh_memcpy_async(h_mask.dataPtr(),
+                                   fine_mask[mfi].dataPtr(),
+                                   h_mask.nBytes());
+            Gpu::streamSynchronize(); // the loop below runs on the CPU and reads h_mask
+            auto const& mask = h_mask.const_array();
+            auto const& c2f_offset_from = h_c2f_offset_from.array();
+            auto const& c2f_offset_to = h_c2f_offset_to.array();
+            auto const& c2f_nentries = h_c2f_nentries.array();
+#else
+            auto const& mask = fine_mask.const_array(mfi);
+            auto const& c2f_offset_from = m_c2f_offset_from[clev].array(mfi);
+            auto const& c2f_offset_to = m_c2f_offset_to[clev].array(mfi);
+            auto const& c2f_nentries = m_c2f_nentries[clev].array(mfi);
+#endif
+            amrex::LoopOnCpu(mfi.validbox(), [&] (int i, int j, int k)
+            {
+                amrex::ignore_unused(k);
+                int nc2f = 0;
+                if (mask(i,j,k) == 0) { // uncovered coarse cell
+                    IntVect const civ(AMREX_D_DECL(i,j,k));
+                    for (OrientationIter ori; ori; ++ori) {
+                        auto const face = ori();
+                        int const idir = face.coordDir();
+                        IntVect offset(0);
+                        offset[idir] = face.isLow() ? -1 : 1;
+                        IntVect const to_civ = civ + offset;
+                        if (mask(to_civ) == 1) { // covered by fine cells
+                            IntVect lo = to_civ * refratio;
+                            IntVect hi = lo + refratio - 1;
+                            if (face.isLow()) {
+                                lo[idir] = hi[idir] - 1;
+                            } else {
+                                hi[idir] = lo[idir] + 1;
+                            }
+                            // [lo,hi]: two layers of adjacent fine cells
+                            auto len = hi-lo+1;
+                            nc2f += AMREX_D_TERM(len[0], *len[1], *len[2]);
+                            amrex::LoopOnCpu(lo.dim3(), hi.dim3(),
+                                             [&] (int ii, int jj, int kk)
+                            {
+                                amrex::ignore_unused(kk);
+                                entries.emplace_back(clev, lidx, civ, flev,
+                                                     IntVect(AMREX_D_DECL(ii,jj,kk)));
+                            });
+#if (AMREX_SPACEDIM == 3)
+                            int const idir1 = ((idir+1) < AMREX_SPACEDIM)
+                                ? idir+1 : idir+1-AMREX_SPACEDIM;
+                            int const idir2 = ((idir+2) < AMREX_SPACEDIM)
+                                ? idir+2 : idir+2-AMREX_SPACEDIM;
+                            IntVect t1(0); t1[idir1] = 1;
+                            IntVect t2(0); t2[idir2] = 1;
+                            IntVect c1 = civ - t1 - t2;
+                            IntVect c2 = civ + t1 - t2;
+                            IntVect c3 = civ - t1 + t2;
+                            IntVect c4 = civ + t1 + t2;
+                            if (mask(c1) == 0 && cgdomain.contains(c1) &&
+                                mask(c2) == 0 && cgdomain.contains(c2) &&
+                                mask(c3) == 0 && cgdomain.contains(c3) &&
+                                mask(c4) == 0 && cgdomain.contains(c4))
+                            {
+                                entries.emplace_back(clev, lidx, civ, clev, c1);
+                                entries.emplace_back(clev, lidx, civ, clev, c2);
+                                entries.emplace_back(clev, lidx, civ, clev, c3);
+                                entries.emplace_back(clev, lidx, civ, clev, c4);
+                                nc2f += 4;
+                                // Note a coarse cell will not have fine cells on
+                                // both ends of a direction. So we don't
+                                // have to worry about duplication.
+                            }
+#endif
+                        }
+                    }
+                }
+                c2f_offset_from(i,j,k) = c2f_total_from; // running count of from-cells seen so far in this box
+                if (nc2f > 0) { ++c2f_total_from; }
+                c2f_nentries(i,j,k) = nc2f;
+                c2f_offset_to(i,j,k) = int(c2f_total_to); // running count of entries seen so far in this box
+                c2f_total_to += nc2f;
+            });
+#ifdef AMREX_USE_GPU
+            Gpu::htod_memcpy_async(m_c2f_offset_from[clev][mfi].dataPtr(),
+                                   h_c2f_offset_from.dataPtr(),
+                                   h_c2f_offset_from.nBytes());
+            Gpu::htod_memcpy_async(m_c2f_offset_to[clev][mfi].dataPtr(),
+                                   h_c2f_offset_to.dataPtr(),
+                                   h_c2f_offset_to.nBytes());
+            Gpu::htod_memcpy_async(m_c2f_nentries[clev][mfi].dataPtr(),
+                                   h_c2f_nentries.dataPtr(),
+                                   h_c2f_nentries.nBytes());
+            Gpu::streamSynchronize(); // the pinned host buffers go out of scope at the end of this iteration
+#endif
+            AMREX_ASSERT(c2f_total_to < Long(std::numeric_limits<int>::max()));
+            m_c2f_total_from[clev][mfi] = int(c2f_total_from);
+            m_c2f_total_to[clev][mfi] = int(c2f_total_to);
+        }
+
+        for (MFIter mfi(m_grids[flev], m_dmap[flev], MFItInfo().DisableDeviceSync()); // pass 2: entries whose from-cell is fine
+             mfi.isValid(); ++mfi)
+        {
+            auto const lidx = mfi.LocalIndex();
+            Box const& vbx = mfi.validbox();
+            for (OrientationIter ori; ori; ++ori) {
+                auto const face = ori();
+                int const idir = face.coordDir();
+                int const idir1 = ((idir+1) < AMREX_SPACEDIM)
+                    ? idir+1 : idir+1-AMREX_SPACEDIM;
+#if (AMREX_SPACEDIM == 3)
+                int const idir2 = ((idir+2) < AMREX_SPACEDIM)
+                    ? idir+2 : idir+2-AMREX_SPACEDIM;
+#endif
+
+#ifdef AMREX_USE_GPU
+                IArrayBox h_mask(m_bndry[flev]->bndryMasks(face)[mfi].box(), 1, The_Pinned_Arena());
+                Gpu::dtoh_memcpy_async(h_mask.dataPtr(),
+                                       m_bndry[flev]->bndryMasks(face)[mfi].dataPtr(),
+                                       h_mask.nBytes());
+                Gpu::streamSynchronize();
+                auto const& mask = h_mask.const_array();
+#else
+                auto const& mask = m_bndry[flev]->bndryMasks(face).const_array(mfi);
+#endif
+
+                Box bin = vbx;
+                bin.setRange(idir, vbx[face]); // just inside face
+                Box bout = amrex::adjCell(vbx, face); // just outside face
+                IntVect offset_n = bout.smallEnd() - bin.smallEnd();
+                IntVect offset_t1(0), offset_t1_r(0);
+                offset_t1  [idir1] = 1;
+                offset_t1_r[idir1] = refratio[idir1];
+#if (AMREX_SPACEDIM == 3)
+                IntVect offset_t2(0), offset_t2_r(0);
+                offset_t2  [idir2] = 1;
+                offset_t2_r[idir2] = refratio[idir2];
+#endif
+                amrex::LoopOnCpu(bin, [&] (int i, int j, int k)
+                {
+#if (AMREX_SPACEDIM == 2)
+                    amrex::ignore_unused(k);
+#endif
+                    IntVect iv_in(AMREX_D_DECL(i,j,k));
+                    IntVect iv_out = iv_in + offset_n;
+                    if (mask(iv_out) == BndryData::not_covered) {
+                        IntVect civ_out = amrex::coarsen(iv_out, refratio);
+                        entries.emplace_back(flev, lidx, iv_in, clev, civ_out);
+
+                        if (mask(iv_out+offset_t1_r) == BndryData::not_covered) {
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out+offset_t1);
+                        }
+                        if (mask(iv_out-offset_t1_r) == BndryData::not_covered) {
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out-offset_t1);
+                        }
+#if (AMREX_SPACEDIM == 3)
+                        if (mask(iv_out+offset_t2_r) == BndryData::not_covered) {
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out+offset_t2);
+                        }
+                        if (mask(iv_out-offset_t2_r) == BndryData::not_covered) {
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out-offset_t2);
+                        }
+                        if (mask(iv_out-offset_t1_r-offset_t2_r) == BndryData::not_covered &&
+                            mask(iv_out+offset_t1_r-offset_t2_r) == BndryData::not_covered &&
+                            mask(iv_out-offset_t1_r+offset_t2_r) == BndryData::not_covered &&
+                            mask(iv_out+offset_t1_r+offset_t2_r) == BndryData::not_covered)
+                        {
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out-offset_t1-offset_t2);
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out+offset_t1-offset_t2);
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out-offset_t1+offset_t2);
+                            entries.emplace_back(flev, lidx, iv_in, clev, civ_out+offset_t1+offset_t2);
+                        }
+#endif
+                    }
+                });
+            }
+        }
+    }
+
+    // There are duplicates at corners.
+    // After this, entries is also sorted.
+    amrex::RemoveDuplicates(entries);
+
+    m_f2c_bno.resize(m_nlevels-1);
+    m_f2c_cell.resize(m_nlevels-1);
+    m_f2c_nentries.resize(m_nlevels-1);
+    m_f2c_offset.resize(m_nlevels-1);
+    m_f2c_values.resize(m_nlevels-1);
+
+    for (auto& entry : entries) {
+        auto const from_level = std::get<0>(entry);
+        auto const   to_level = std::get<3>(entry);
+
+        // HYPRE_Int might be a different type than int
+        auto from_iv = std::get<2>(entry);
+        auto   to_iv = std::get<4>(entry);
+        GpuArray<HYPRE_Int,AMREX_SPACEDIM> from_index{AMREX_D_DECL(from_iv[0],
+                                                                   from_iv[1],
+                                                                   from_iv[2])};
+        GpuArray<HYPRE_Int,AMREX_SPACEDIM> to_index{AMREX_D_DECL(to_iv[0],
+                                                                 to_iv[1],
+                                                                 to_iv[2])};
+        constexpr int ivar = 0;
+        HYPRE_SStructGraphAddEntries(m_ss_graph,
+                                     from_level, from_index.data(), ivar,
+                                       to_level,   to_index.data(), ivar);
+
+        if (from_level == to_level + 1) { // fine-to-coarse entry; the f2c tables are indexed by the coarse level
+            auto const bno = std::get<1>(entry);
+            if ((! m_f2c_bno[to_level].empty()) && // same fine cell as the previous entry (entries is sorted)
+                (m_f2c_bno[to_level].back() == bno) &&
+                (m_f2c_cell[to_level].back() == from_iv)) {
+                ++m_f2c_nentries[to_level].back();
+            } else {
+                m_f2c_bno[to_level].push_back(bno);
+                m_f2c_cell[to_level].push_back(from_iv);
+                m_f2c_nentries[to_level].push_back(1);
+            }
+        }
+    }
+
+    for (int clev = 0; clev < int(m_f2c_nentries.size()); ++clev) {
+        auto const& nentries = m_f2c_nentries[clev];
+        if (!nentries.empty()) {
+            auto& offset = m_f2c_offset[clev];
+            offset.resize(nentries.size());
+            std::exclusive_scan(nentries.begin(), nentries.end(), offset.begin(),
+                                std::size_t(0), std::plus<std::size_t>{});
+            auto nvalues = std::size_t(nentries.back()) + offset.back(); // last offset + last count = total # of values
+            m_f2c_values[clev].resize(nvalues,Real(0.0));
+        }
+    }
+}
+
+void HypreMLABecLap::setup (Real a_ascalar, Real a_bscalar,
+                            Vector<MultiFab const*> const& a_acoefs,
+                            Vector<Array<MultiFab const*,AMREX_SPACEDIM>> const& a_bcoefs,
+                            Array<LinOpBCType,AMREX_SPACEDIM> const& a_lobc,
+                            Array<LinOpBCType,AMREX_SPACEDIM> const& a_hibc,
+                            Vector<MultiFab const*> const& a_levelbcdata,
+                            std::pair<MultiFab const*, IntVect> const& a_coarse_bc)
+{
+    BL_PROFILE("HypreMLABecLap::setup");
+
+    constexpr int ncomp = 1;
+    constexpr HYPRE_Int ivar = 0;
+
+    m_ascalar = a_ascalar;
+    m_bscalar = a_bscalar;
+    m_lobc = a_lobc;
+    m_hibc = a_hibc;
+
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        if (m_geom[0].isPeriodic(idim)) {
+            AMREX_ALWAYS_ASSERT(a_lobc[idim] == LinOpBCType::Periodic &&
+                                a_hibc[idim] == LinOpBCType::Periodic);
+        } else {
+            AMREX_ALWAYS_ASSERT((a_lobc[idim] == LinOpBCType::Dirichlet ||
+                                 a_lobc[idim] == LinOpBCType::Neumann  ) &&
+                                (a_hibc[idim] == LinOpBCType::Dirichlet ||
+                                 a_hibc[idim] == LinOpBCType::Neumann  ));
+        }
+    }
+
+    MultiFab empty;
+
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        MultiFab const* levelbc;
+        if (ilev < a_levelbcdata.size() && a_levelbcdata[ilev]) {
+            levelbc = a_levelbcdata[ilev];
+        } else {
+            levelbc = &empty;
+        }
+
+        IntVect br_ref_ratio;
+
+        if (ilev == 0) {
+            if (m_grids[0].numPts() != m_geom[0].Domain().numPts()) {
+                // Need coarse data for bc
+                br_ref_ratio = a_coarse_bc.second;
+                AMREX_ALWAYS_ASSERT(br_ref_ratio.allGT(0));
+                int const in_rad = 0;
+                int const out_rad = 1;
+                int const extend_rad = 2;
+                BndryRegister crse_br(amrex::coarsen(m_grids[0],br_ref_ratio),
+                                      m_dmap[0], in_rad, out_rad, extend_rad, ncomp);
+                if (a_coarse_bc.first) {
+                    Box const& cbx = amrex::coarsen(m_geom[0].Domain(),br_ref_ratio);
+                    crse_br.copyFrom(*a_coarse_bc.first, 0, 0, 0, ncomp,
+                                      m_geom[0].periodicity(cbx));
+                } else {
+                    crse_br.setVal(Real(0.0));
+                }
+                m_bndry[0]->setBndryValues(crse_br, 0, *levelbc, 0, 0, ncomp, br_ref_ratio,
+                                           InterpBndryData::IBD_max_order_DEF, 1);
+            } else {
+                br_ref_ratio = IntVect(1);
+                m_bndry[0]->setPhysBndryValues(*levelbc, 0, 0, ncomp);
+            }
+        } else {
+            br_ref_ratio = m_ref_ratio[ilev-1];
+            m_bndry[ilev]->setPhysBndryValues(*levelbc, 0, 0, ncomp);
+        }
+
+        RealVect crse_bc_loc;
+        m_bndry[ilev]->setLOBndryConds({a_lobc}, {a_hibc}, br_ref_ratio, crse_bc_loc);
+    }
+
+    // load matrix
+
+    constexpr HYPRE_Int stencil_size = 2*AMREX_SPACEDIM + 1;
+    Array<HYPRE_Int,stencil_size> stencil_indices;
+    std::iota(stencil_indices.begin(), stencil_indices.end(), 0);
+
+    // big enough for 3d w/ refratio of 4. worst case: a coarse cell with
+    // fine neighbors on 3 faces.
+    Vector<HYPRE_Int> nonstencil_indices(4*4*3+3);
+    std::iota(nonstencil_indices.begin(), nonstencil_indices.end(), stencil_size);
+
+    BaseFab<GpuArray<Real,stencil_size>> matfab;
+    for (int ilev = m_nlevels-1; ilev >= 0; --ilev) {
+        auto dx = m_geom[ilev].CellSizeArray();
+
+        for (MFIter mfi(m_grids[ilev], m_dmap[ilev]); mfi.isValid(); ++mfi) {
+            Box const& vbx = mfi.validbox();
+            matfab.resize(vbx);
+
+            Array4<Real const> afab;
+            if (ilev < a_acoefs.size() && a_acoefs[ilev]) {
+                afab = a_acoefs[ilev]->const_array(mfi);
+            }
+
+            GpuArray<Array4<Real const>, AMREX_SPACEDIM> bfabs;
+            if (ilev < a_bcoefs.size()) {
+                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                    if (a_bcoefs[ilev][idim]) {
+                        bfabs[idim] = a_bcoefs[ilev][idim]->const_array(mfi);
+                    }
+                }
+            }
+
+            GpuArray<int,AMREX_SPACEDIM*2> bctype;
+            GpuArray<Real,AMREX_SPACEDIM*2> bcl;
+            GpuArray<Array4<int const>, AMREX_SPACEDIM*2> bcmsk;
+            GpuArray<Array4<Real const>, AMREX_SPACEDIM*2> bcval;
+            GpuArray<Array4<Real>, AMREX_SPACEDIM*2> bcrhs;
+            for (OrientationIter oit; oit; oit++) {
+                Orientation ori = oit();
+                int cdir(ori);
+                bctype[cdir] = m_bndry[ilev]->bndryConds(mfi)[cdir][0];
+                bcl[cdir] = m_bndry[ilev]->bndryLocs(mfi)[cdir];
+                bcmsk[cdir] = m_bndry[ilev]->bndryMasks(ori)[mfi].const_array();
+                bcval[cdir] = m_bndry[ilev]->bndryValues(ori)[mfi].const_array();
+                bcrhs[cdir] = (*m_bndry_rhs[ilev])[ori][mfi].array();
+            }
+
+            Real sa = a_ascalar;
+            Real sb = a_bscalar;
+            const auto boxlo = amrex::lbound(vbx);
+            const auto boxhi = amrex::ubound(vbx);
+            // Set up stencil part of the matrix
+            amrex::fill(matfab,
+            [=] AMREX_GPU_HOST_DEVICE (GpuArray<Real,stencil_size>& sten,
+                                       int i, int j, int k)
+            {
+                hypmlabeclap_mat(sten, i, j, k, boxlo, boxhi, sa, afab, sb, dx, bfabs,
+                                 bctype, bcl, bcmsk, bcval, bcrhs, ilev);
+            });
+
+            bool need_sync = true;
+
+            // This is always false for the first iteration (ilev == m_nlevels-1).
+            if (ilev < m_nlevels-1) {
+                // As a coarse level, the coarse cells near the coarse/fine
+                // interface have their transverse stencil parts modified
+                // because they have participated in the fine flux
+                // calculation. For example, suppose there is coarse cell
+                // (i,j) in 2d and its left neighbors are two fine cells
+                // (assuming the ref_ratio is 2). The computation of the two
+                // fine fluxes across the coarse/fine interface involves
+                // cells (i,j), (i,j-1) and (i,j+1), if the latter two are
+                // uncovered coarse cells. For 3D, this will also involve
+                // coarse corner cells in the transverse plane, which are
+                // not part of the stencil.
+                //
+                // So we need to do three things. (1) Add reflux
+                // modifications to the stencil entries. (2) Handle
+                // non-stencil entries involving fine cells. (3) For 3D, we
+                // need to handle the non-stencil coarse corner entries in the
+                // transverse plane.
+                //
+                // We have received B coeffs from the fine level in the
+                // previous iteration.
+
+                int c2f_total_from = m_c2f_total_from[ilev][mfi];
+                int c2f_total_to = m_c2f_total_to[ilev][mfi];
+                Gpu::DeviceVector<GpuArray<HYPRE_Int,AMREX_SPACEDIM>> civ(c2f_total_from);
+                Gpu::DeviceVector<HYPRE_Int> nentries(c2f_total_from);
+                Gpu::DeviceVector<int> entry_offset(c2f_total_from);
+                Gpu::DeviceVector<Real> entry_values(c2f_total_to,Real(0.0));
+                auto* p_civ = civ.data();
+                auto* p_nentries = nentries.data();
+                auto* p_entry_offset = entry_offset.data();
+                auto* p_entry_values = entry_values.data();
+                auto const& c2f_offset_from_a = m_c2f_offset_from[ilev].const_array(mfi);
+                auto const& c2f_nentries_a = m_c2f_nentries[ilev].const_array(mfi);
+                auto const& c2f_offset_to_a = m_c2f_offset_to[ilev].const_array(mfi);
+                auto const& mat_a = matfab.array();
+                auto const& fine_mask = m_fine_masks[ilev].const_array(mfi);
+                AMREX_D_TERM(auto offset_bx_a = m_offset_cf_bcoefs[ilev][0].isDefined()
+                                              ? m_offset_cf_bcoefs[ilev][0].const_array(mfi)
+                                              : Array4<int const>{};,
+                             auto offset_by_a = m_offset_cf_bcoefs[ilev][1].isDefined()
+                                              ? m_offset_cf_bcoefs[ilev][1].const_array(mfi)
+                                              : Array4<int const>{};,
+                             auto offset_bz_a = m_offset_cf_bcoefs[ilev][2].isDefined()
+                                              ? m_offset_cf_bcoefs[ilev][2].const_array(mfi)
+                                              : Array4<int const>{});
+                AMREX_D_TERM(Real const* p_bx = (!m_cf_bcoefs[ilev][0].empty())
+                                                ? m_cf_bcoefs[ilev][0][mfi]->data()
+                                                : nullptr;,
+                             Real const* p_by = (!m_cf_bcoefs[ilev][1].empty())
+                                                ? m_cf_bcoefs[ilev][1][mfi]->data()
+                                                : nullptr;,
+                             Real const* p_bz = (!m_cf_bcoefs[ilev][2].empty())
+                                                ? m_cf_bcoefs[ilev][2][mfi]->data()
+                                                : nullptr);
+                auto rr = m_ref_ratio[ilev];
+                amrex::ParallelFor(vbx,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k)
+                {
+                    hypmlabeclap_c2f(i,j,k,mat_a,
+                                     p_civ,p_nentries,p_entry_offset,p_entry_values,
+                                     c2f_offset_from_a, c2f_nentries_a,
+                                     c2f_offset_to_a, dx, sb,
+                                     AMREX_D_DECL(offset_bx_a,offset_by_a,offset_bz_a),
+                                     AMREX_D_DECL(p_bx, p_by, p_bz),
+                                     fine_mask,rr);
+                });
+                if (c2f_total_from > 0) {
+#ifdef AMREX_USE_GPU
+                    Gpu::PinnedVector<Real> h_entry_values(entry_values.size());
+                    Gpu::PinnedVector<int> h_entry_offset(entry_offset.size());
+                    Gpu::PinnedVector<GpuArray<HYPRE_Int,AMREX_SPACEDIM>> h_civ(civ.size());
+                    Gpu::PinnedVector<HYPRE_Int> h_nentries(nentries.size());
+                    Gpu::copyAsync(Gpu::deviceToHost,
+                                   entry_values.begin(),
+                                   entry_values.end(),
+                                   h_entry_values.begin());
+                    Gpu::copyAsync(Gpu::deviceToHost,
+                                   entry_offset.begin(),
+                                   entry_offset.end(),
+                                   h_entry_offset.begin());
+                    Gpu::copyAsync(Gpu::deviceToHost,
+                                   civ.begin(),
+                                   civ.end(),
+                                   h_civ.begin());
+                    Gpu::copyAsync(Gpu::deviceToHost,
+                                   nentries.begin(),
+                                   nentries.end(),
+                                   h_nentries.begin());
+                    Gpu::streamSynchronize();
+                    need_sync = false;
+#else
+                    auto& h_entry_values = entry_values;
+                    auto const& h_entry_offset = entry_offset;
+                    auto& h_civ = civ;
+                    auto& h_nentries = nentries;
+#endif
+                    {
+                        BL_PROFILE("HYPRE_SStructMatrixSetValues");
+                        for (int ientry = 0; ientry < c2f_total_from; ++ientry) {
+                            HYPRE_SStructMatrixSetValues(m_ss_A, ilev, h_civ[ientry].data(),
+                                                         ivar, h_nentries[ientry],
+                                                         nonstencil_indices.data(),
+                                                         h_entry_values.data()
+                                                         + h_entry_offset[ientry]);
+                        }
+                    }
+                }
+            }
+
+            if (need_sync) { Gpu::streamSynchronize(); }
+
+            HYPRE_Int vbxlo[] = {AMREX_D_DECL(vbx.smallEnd(0), vbx.smallEnd(1), vbx.smallEnd(2))};
+            HYPRE_Int vbxhi[] = {AMREX_D_DECL(vbx.bigEnd(0), vbx.bigEnd(1), vbx.bigEnd(2))};
+            {
+                BL_PROFILE("HYPRE_SStructMatrixSetBoxValues");
+                HYPRE_SStructMatrixSetBoxValues(m_ss_A, ilev, vbxlo, vbxhi, ivar, stencil_size,
+                                                stencil_indices.data(), (Real*)matfab.dataPtr());
+                Gpu::hypreSynchronize();
+            }
+        }
+
+        if (ilev > 0) {
+            // As a fine level, we have non-stencil entries involving coarse
+            // cells.
+
+            int const flev = ilev;
+            int const clev = ilev - 1;
+
+            // Update m_f2c_values
+            auto const num_f2c_cell = int(m_f2c_cell[clev].size());
+            if (num_f2c_cell > 0) {
+#ifdef AMREX_USE_GPU
+                Gpu::DeviceVector<int> d_f2c_bno(m_f2c_bno[clev].size());
+                Gpu::DeviceVector<IntVect> d_f2c_cell(m_f2c_cell[clev].size());
+                Gpu::DeviceVector<std::size_t> d_f2c_offset(m_f2c_offset[clev].size());
+                Gpu::DeviceVector<Real> d_f2c_values(m_f2c_values[clev].size());
+                Gpu::copyAsync(Gpu::hostToDevice,
+                               m_f2c_bno[clev].begin(),
+                               m_f2c_bno[clev].end(),
+                               d_f2c_bno.begin());
+                Gpu::copyAsync(Gpu::hostToDevice,
+                               m_f2c_cell[clev].begin(),
+                               m_f2c_cell[clev].end(),
+                               d_f2c_cell.begin());
+                Gpu::copyAsync(Gpu::hostToDevice,
+                               m_f2c_offset[clev].begin(),
+                               m_f2c_offset[clev].end(),
+                               d_f2c_offset.begin());
+                auto const* p_f2c_bno = d_f2c_bno.data();
+                auto const* p_f2c_cell = d_f2c_cell.data();
+                auto const* p_f2c_offset = d_f2c_offset.data();
+                auto* p_f2c_values = d_f2c_values.data();
+#else
+                auto const* p_f2c_bno = m_f2c_bno[clev].data();
+                auto const* p_f2c_cell = m_f2c_cell[clev].data();
+                auto const* p_f2c_offset = m_f2c_offset[clev].data();
+                auto* p_f2c_values = m_f2c_values[clev].data();
+#endif
+                AMREX_D_TERM(MultiArray4<Real const> bcx;,
+                             MultiArray4<Real const> bcy;,
+                             MultiArray4<Real const> bcz);
+                if (flev < a_bcoefs.size() && (a_bcoefs[flev][0] != nullptr)) {
+                    AMREX_D_TERM(bcx = a_bcoefs[flev][0]->const_arrays();,
+                                 bcy = a_bcoefs[flev][1]->const_arrays();,
+                                 bcz = a_bcoefs[flev][2]->const_arrays());
+                }
+                GpuArray<MultiArray4<int const>,AMREX_SPACEDIM*2> bmasks;
+                for (OrientationIter ori; ori; ++ori) {
+                    auto const face = ori();
+                    bmasks[face] = m_bndry[flev]->bndryMasks(face).const_arrays();
+                }
+                auto rr = m_ref_ratio[clev];
+                int not_covered = BndryData::not_covered;
+                amrex::ParallelFor(num_f2c_cell, [=] AMREX_GPU_DEVICE (int icell)
+                {
+                    int const bno = p_f2c_bno[icell];
+                    // bcoefs may not exist
+                    hypmlabeclap_f2c_set_values
+                        (p_f2c_cell[icell],
+                         p_f2c_values + p_f2c_offset[icell],
+                         dx, a_bscalar,
+                         {AMREX_D_DECL(bcx ? bcx[bno] : Array4<Real const>{},
+                                       bcy ? bcy[bno] : Array4<Real const>{},
+                                       bcz ? bcz[bno] : Array4<Real const>{})},
+                         {AMREX_D_DECL(bmasks[0][bno],
+                                       bmasks[1][bno],
+                                       bmasks[2][bno]),
+                          AMREX_D_DECL(bmasks[AMREX_SPACEDIM][bno],
+                                       bmasks[AMREX_SPACEDIM+1][bno],
+                                       bmasks[AMREX_SPACEDIM+2][bno])},
+                         rr, not_covered);
+                });
+#ifdef AMREX_USE_GPU
+                Gpu::copyAsync(Gpu::deviceToHost,
+                               d_f2c_values.begin(),
+                               d_f2c_values.end(),
+                               m_f2c_values[clev].begin());
+                Gpu::streamSynchronize();
+#endif
+
+                // This sets non-stencil part for fine cells adjacent to
+                // coarse/fine interface.
+                for (int i = 0; i < num_f2c_cell; ++i) {
+                    auto const& iv = m_f2c_cell[clev][i];
+                    HYPRE_Int index[] = {AMREX_D_DECL(iv[0],iv[1],iv[2])};
+                    auto const nentries = m_f2c_nentries[clev][i];
+                    auto const offset = m_f2c_offset[clev][i];
+                    auto* values = m_f2c_values[clev].data() + offset;
+                    HYPRE_SStructMatrixSetValues(m_ss_A, flev, index, ivar, nentries,
+                                                 nonstencil_indices.data(), values);
+                }
+            }
+
+            if (ilev < a_bcoefs.size() && a_bcoefs[ilev][0]) {
+                commBCoefs(ilev, a_bcoefs[ilev]);
+            }
+        }
+    }
+
+    {
+        BL_PROFILE("HYPRE_SStructMatrixAssemble");
+        HYPRE_SStructMatrixAssemble(m_ss_A);
+        // HYPRE_SStructMatrixPrint("mat", m_ss_A, 0);
+    }
+
+#ifdef AMREX_FEATURE_HYPRE_SSAMG
+    if (m_hypre_solver_id == HypreSolverID::SSAMG)
+    {
+        BL_PROFILE("HYPRE_SSAMG_setup");
+
+        AMREX_ALWAYS_ASSERT(m_solver == nullptr);
+
+        HYPRE_SStructSSAMGCreate(m_comm, &m_ss_solver);
+
+        HYPRE_SStructSSAMGSetNumPreRelax(m_ss_solver, 4);
+        HYPRE_SStructSSAMGSetNumPostRelax(m_ss_solver, 4);
+        HYPRE_SStructSSAMGSetNumCoarseRelax(m_ss_solver, 4);
+
+        HYPRE_SStructSSAMGSetLogging(m_ss_solver, m_verbose);
+        HYPRE_SStructSSAMGSetPrintLevel(m_ss_solver, m_verbose);
+
+//        HYPRE_SStructSSAMGSetup(m_ss_solver, A, b, x);
+
+        HYPRE_SStructSSAMGSetMaxIter(m_ss_solver, m_maxiter);
+    } else
+#endif
+    {
+        BL_PROFILE("HYPRE_BoomerAMG_setup");
+
+        AMREX_ALWAYS_ASSERT(m_solver == nullptr);
+
+        HYPRE_BoomerAMGCreate(&m_solver);
+
+        HYPRE_BoomerAMGSetOldDefault(m_solver); // Falgout coarsening with modified classical interpolation
+        HYPRE_BoomerAMGSetStrongThreshold(m_solver, (AMREX_SPACEDIM == 3) ? 0.6 : 0.25); // default is 0.25
+        HYPRE_BoomerAMGSetRelaxOrder(m_solver, 1);   /* 0: default, natural order, 1: C/F relaxation order */
+        HYPRE_BoomerAMGSetNumSweeps(m_solver, 2);   /* Sweeps on fine levels */
+        // HYPRE_BoomerAMGSetFCycle(m_solver, 1); // default is 0
+        // HYPRE_BoomerAMGSetCoarsenType(m_solver, 6);
+        // HYPRE_BoomerAMGSetRelaxType(m_solver, 6);   /* G-S/Jacobi hybrid relaxation */
+
+        HYPRE_BoomerAMGSetLogging(m_solver, m_verbose);
+        HYPRE_BoomerAMGSetPrintLevel(m_solver, m_verbose);
+
+        HYPRE_ParCSRMatrix par_A;
+        HYPRE_SStructMatrixGetObject(m_ss_A, (void**) &par_A);
+        HYPRE_BoomerAMGSetup(m_solver, par_A, nullptr, nullptr);
+
+        HYPRE_BoomerAMGSetMinIter(m_solver, 1);
+        HYPRE_BoomerAMGSetMaxIter(m_solver, m_maxiter);
+    }
+}
+
+void HypreMLABecLap::solve (Vector<MultiFab*> const& a_sol, Vector<MultiFab const*> const& a_rhs,
+                            Real a_reltol, Real a_abstol)
+{
+    BL_PROFILE("HypreMLABecLap::solve()");
+    // Purpose: load a_sol and a_rhs into hypre vectors x and b, run the solver, copy x back to a_sol.
+    constexpr int ncomp = 1;
+    constexpr HYPRE_Int ivar = 0;
+    // a_reltol is handed to the solver's SetTol call; a_abstol > 0 is unsupported and aborts (below).
+    // Load vectors: create m_ss_x and m_ss_b, then fill them box by box for levels 0..m_nlevels-1.
+    {
+        BL_PROFILE("HypreMLABecLap::load_vector");
+        // Both vectors are created on every call and destroyed again at the end of this function.
+        // Do we still have to do this repeatedly to avoid a hypre bug?
+        HYPRE_SStructVectorCreate(m_comm, m_ss_grid, &m_ss_x);
+        HYPRE_SStructVectorSetObjectType(m_ss_x, m_hypre_object_type);
+        HYPRE_SStructVectorInitialize(m_ss_x);
+        //
+        HYPRE_SStructVectorCreate(m_comm, m_ss_grid, &m_ss_b);
+        HYPRE_SStructVectorSetObjectType(m_ss_b, m_hypre_object_type);
+        HYPRE_SStructVectorInitialize(m_ss_b);
+
+        for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+            FArrayBox tmp; // scratch: packed copy of a_sol (ghost-cell case), then the rhs sent to hypre
+            bool has_ghostcells = (a_sol[ilev]->nGrowVect() != 0);
+            AMREX_ALWAYS_ASSERT(a_rhs[ilev]->nGrowVect() == 0);
+            for (MFIter mfi(*a_rhs[ilev]); mfi.isValid(); ++mfi) {
+                Box const& vbx = mfi.validbox();
+                HYPRE_Int vbxlo[] = {AMREX_D_DECL(vbx.smallEnd(0), vbx.smallEnd(1), vbx.smallEnd(2))};
+                HYPRE_Int vbxhi[] = {AMREX_D_DECL(vbx.bigEnd(0), vbx.bigEnd(1), vbx.bigEnd(2))};
+                // x: with ghost cells, copy the vbx part of a_sol into tmp; otherwise use the fab's pointer.
+                FArrayBox& solsrc = (*a_sol[ilev])[mfi];
+                Real* psol;
+                if (has_ghostcells) {
+                    tmp.resize(vbx, ncomp);
+                    psol = tmp.dataPtr();
+                    solsrc.template copyToMem<RunOn::Device>(vbx, 0, ncomp, psol);
+                    Gpu::streamSynchronize();
+                } else {
+                    psol = solsrc.dataPtr();
+                }
+                HYPRE_SStructVectorSetBoxValues(m_ss_x, ilev, vbxlo, vbxhi, ivar, psol);
+                Gpu::hypreSynchronize();
+                // b: hypmlabeclap_rhs gets tmp (rhs1), a_rhs (rhs0), boundary masks and m_bndry_rhs.
+                tmp.resize(vbx, ncomp);
+                auto const& rhs1 = tmp.array();
+                auto const& rhs0 = a_rhs[ilev]->const_array(mfi);
+                auto* prhs = tmp.dataPtr();
+                GpuArray<Array4<int const>, AMREX_SPACEDIM*2> bcmsk;
+                GpuArray<Array4<Real const>, AMREX_SPACEDIM*2> bcrhs;
+                for (OrientationIter oit; oit; oit++) {
+                    Orientation ori = oit();
+                    int cdir(ori);
+                    bcmsk[cdir] = m_bndry[ilev]->bndryMasks(ori)[mfi].const_array();
+                    bcrhs[cdir] = (*m_bndry_rhs[ilev])[ori][mfi].const_array();
+                }
+                const auto boxlo = amrex::lbound(vbx);
+                const auto boxhi = amrex::ubound(vbx);
+                amrex::ParallelFor(vbx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+                {
+                    hypmlabeclap_rhs(i, j, k, boxlo, boxhi, rhs1, rhs0, bcmsk, bcrhs);
+                });
+                Gpu::streamSynchronize(); // wait for the kernel before prhs (tmp) is handed to hypre
+                HYPRE_SStructVectorSetBoxValues(m_ss_b, ilev, vbxlo, vbxhi, ivar, prhs);
+                Gpu::hypreSynchronize();
+            }
+        }
+
+        HYPRE_SStructVectorAssemble(m_ss_x);
+        HYPRE_SStructVectorAssemble(m_ss_b);
+
+//        HYPRE_SStructVectorPrint("x", m_ss_x, 0);
+//        HYPRE_SStructVectorPrint("b", m_ss_b, 0);
+    }
+
+    // Solve: SSAMG on the sstruct objects (if compiled in and selected), otherwise BoomerAMG.
+    {
+        BL_PROFILE("HypreMLABecLap::actual_solve");
+
+        auto reltol = a_reltol;
+        if (a_abstol > Real(0.0)) {
+            amrex::Abort("HypreMLABecLap::solve: TODO abstol > 0");
+        }
+
+#ifdef AMREX_FEATURE_HYPRE_SSAMG
+        if (m_hypre_solver_id == HypreSolverID::SSAMG)
+        {
+            HYPRE_SStructSSAMGSetTol(m_ss_solver, reltol);
+            // Setup is called here on each solve, with the vectors loaded above.
+            HYPRE_SStructSSAMGSetup(m_ss_solver, m_ss_A, m_ss_b, m_ss_x);
+
+            HYPRE_SStructSSAMGSolve(m_ss_solver, m_ss_A, m_ss_b, m_ss_x);
+
+            if (m_verbose) {
+                HYPRE_Int num_iterations;
+                Real res;
+                HYPRE_SStructSSAMGGetNumIterations(m_ss_solver, &num_iterations);
+                HYPRE_SStructSSAMGGetFinalRelativeResidualNorm(m_ss_solver, &res);
+
+                amrex::Print() << "\n" << num_iterations
+                               << " Hypre SSAMG Iterations, Relative Residual "
+                               << res << '\n';
+            }
+        } else
+#endif
+        {
+            HYPRE_BoomerAMGSetTol(m_solver, reltol);
+
+            HYPRE_ParCSRMatrix par_A;
+            HYPRE_ParVector par_b;
+            HYPRE_ParVector par_x;
+            // BoomerAMG is given the ParCSR matrix and ParVector objects fetched via GetObject.
+            HYPRE_SStructMatrixGetObject(m_ss_A, (void**) &par_A);
+            HYPRE_SStructVectorGetObject(m_ss_b, (void**) &par_b);
+            HYPRE_SStructVectorGetObject(m_ss_x, (void**) &par_x);
+
+            HYPRE_BoomerAMGSolve(m_solver, par_A, par_b, par_x);
+
+            if (m_verbose) {
+                HYPRE_Int num_iterations;
+                Real res;
+                HYPRE_BoomerAMGGetNumIterations(m_solver, &num_iterations);
+                HYPRE_BoomerAMGGetFinalRelativeResidualNorm(m_solver, &res);
+
+                amrex::Print() << "\n" << num_iterations
+                               << " Hypre SS BoomerAMG Iterations, Relative Residual "
+                               << res << '\n';
+            }
+        }
+    }
+
+    // Get solution: copy m_ss_x back into a_sol box by box, then destroy the per-solve vectors.
+    {
+        BL_PROFILE("HypreMLABecLap::get_solution");
+
+        HYPRE_SStructVectorGather(m_ss_x);
+
+        for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+            FArrayBox sol;
+            bool has_ghostcells = (a_sol[ilev]->nGrowVect() != 0);
+            for (MFIter mfi(*a_rhs[ilev]); mfi.isValid(); ++mfi) {
+                Box const& vbx = mfi.validbox();
+                HYPRE_Int vbxlo[] = {AMREX_D_DECL(vbx.smallEnd(0), vbx.smallEnd(1), vbx.smallEnd(2))};
+                HYPRE_Int vbxhi[] = {AMREX_D_DECL(vbx.bigEnd(0), vbx.bigEnd(1), vbx.bigEnd(2))};
+                FArrayBox& dest = (*a_sol[ilev])[mfi];
+                Real* p;
+                if (has_ghostcells) {
+                    sol.resize(vbx, ncomp);
+                    p = sol.dataPtr();
+                } else {
+                    p = dest.dataPtr();
+                }
+
+                HYPRE_SStructVectorGetBoxValues(m_ss_x, ilev, vbxlo, vbxhi, ivar, p);
+                Gpu::hypreSynchronize();
+                // In the ghost-cell case p is the vbx-sized scratch fab; copy it into dest over vbx.
+                if (has_ghostcells) {
+                    dest.template copyFromMem<RunOn::Device>(vbx, 0, ncomp, p);
+                    Gpu::streamSynchronize();
+                }
+            }
+        }
+        // Destroy the vectors created in the load step and reset the handles.
+        HYPRE_SStructVectorDestroy(m_ss_x);
+        HYPRE_SStructVectorDestroy(m_ss_b);
+        m_ss_x = nullptr;
+        m_ss_b = nullptr;
+    }
+    // Call amrex::average_down from each finer level onto the next coarser one, finest pair first.
+    for (int ilev = m_nlevels-2; ilev >= 0; --ilev) {
+        amrex::average_down(*a_sol[ilev+1], *a_sol[ilev], 0, ncomp, m_ref_ratio[ilev]);
+    }
+
+    // TODO: abort if convergence is not reached; no convergence check is performed here.
+}
+
+#ifdef AMREX_USE_GPU
+namespace {
+    // Work item for unpack_bc; box() is presumably what the tag-based amrex::ParallelFor iterates over.
+    struct BCCommTag {
+        Array4<Real const> fsrc;   // source array, read at refined indices
+        Array4<int const> offset;  // per-cell start position in pdst; negative means skip the cell
+        Real* pdst;                // destination buffer
+        Box cbx;                   // box reported by box()
+        IntVect d2s;               // shift added to the refined (i,j,k) before reading fsrc
+        int idir;                  // direction whose refinement ratio is replaced by 1
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        Box const& box () const noexcept { return cbx; }
+    };
+
+    // For each (i,j,k) with a non-negative offset, copy the fine values it maps to into pdst, x fastest.
+    void unpack_bc (Vector<BCCommTag> const& tags, IntVect const& rr)
+    {
+        amrex::ParallelFor(tags,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k, BCCommTag const& tag)
+        {
+            if (tag.offset(i,j,k) < 0) { return; }
+            IntVect nfine = rr;
+            nfine[tag.idir] = 1;
+            IntVect const fine_lo(AMREX_D_DECL(i*rr[0]+tag.d2s[0],
+                                               j*rr[1]+tag.d2s[1],
+                                               k*rr[2]+tag.d2s[2]));
+            Real* dst = tag.pdst + tag.offset(i,j,k);
+#if (AMREX_SPACEDIM == 3)
+            for (int kk = 0; kk < nfine[2]; ++kk) {
+#endif
+            for (int jj = 0; jj < nfine[1]; ++jj) {
+            for (int ii = 0; ii < nfine[0]; ++ii) {
+                *dst++ = tag.fsrc(fine_lo + IntVect(AMREX_D_DECL(ii,jj,kk)));
+            }}
+#if (AMREX_SPACEDIM == 3)
+            }
+#endif
+        });
+    }
+}
+#endif
+
+void HypreMLABecLap::commBCoefs (int flev, Array<MultiFab const*,AMREX_SPACEDIM> const& a_bcoefs)
+{
+    AMREX_ASSERT(AMREX_D_TERM(a_bcoefs[0], && a_bcoefs[1], && a_bcoefs[2]));
+
+    int const ncomp = 1;
+    int const clev = flev-1;
+    IntVect const& rr = m_ref_ratio[clev];
+
+    BoxArray const& ba_dst =                amrex::convert(m_grids[clev],IntVect(1));
+    BoxArray const& ba_src = amrex::coarsen(amrex::convert(m_grids[flev],IntVect(1)),rr);
+    DistributionMapping const& dm_dst = m_dmap[clev];
+    DistributionMapping const& dm_src = m_dmap[flev];
+    auto const& cperiod = m_geom[clev].periodicity();
+
+    auto & offset_bcoefs = m_offset_cf_bcoefs[clev];
+    auto & cf_bcoefs = m_cf_bcoefs[clev];
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        auto const& ba = amrex::convert(ba_dst,IntVect::TheDimensionVector(idim));
+        offset_bcoefs[idim].define(ba, dm_dst, 1, 0);
+        cf_bcoefs[idim].define(ba, dm_dst);
+
+        IntVect rrface = rr;
+        rrface[idim] = 1;
+        int const nfaces = AMREX_D_TERM(rrface[0],*rrface[1],*rrface[2]);
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+        for (MFIter mfi(offset_bcoefs[idim]); mfi.isValid(); ++mfi) {
+            auto const& offset_a = offset_bcoefs[idim].array(mfi);
+            auto const& fmask_a = m_fine_masks[clev].const_array(mfi);
+            BoxIndexer box_indexer(mfi.validbox());
+#ifdef AMREX_USE_GPU
+            auto tot = Scan::PrefixSum<int>
+                (std::uint64_t(mfi.validbox().numPts()),
+                 [=] AMREX_GPU_DEVICE (std::uint64_t index) -> int
+                 {
+                     auto ijk = box_indexer(index);
+                     IntVect ivm(ijk);
+                     ivm[idim] -= 1;
+                     return int(fmask_a(ijk) != fmask_a(ivm));
+                 },
+                 [=] AMREX_GPU_DEVICE (Long index, int psum)
+                 {
+                     auto ijk = box_indexer(index);
+                     IntVect ivm(ijk);
+                     ivm[idim] -= 1;
+                     if (fmask_a(ijk) == fmask_a(ivm)) {
+                         offset_a(ijk) = -1; // not a coarse/fine face
+                     } else {
+                         offset_a(ijk) = psum*nfaces;
+                     }
+                 },
+                 Scan::Type::exclusive, Scan::retSum);
+#else
+            int tot = 0;
+            amrex::Loop(mfi.validbox(), [&] (int i, int j, int k)
+            {
+                IntVect ivm(AMREX_D_DECL(i,j,k));
+                ivm[idim] -= 1;
+                int is_cf = (fmask_a(i,j,k) != fmask_a(ivm));
+                int psum = tot;
+                tot += int(is_cf);
+                offset_a(i,j,k) = is_cf ? psum*nfaces : -1;
+            });
+#endif
+            tot *= nfaces;
+            cf_bcoefs[idim][mfi] = std::make_unique<Gpu::DeviceVector<Real>>(tot);
+        }
+    }
+
+    using Tag = FabArrayBase::CopyComTag;
+    Vector<Tag> loc_tags;
+    FabArrayBase::MapOfCopyComTagContainers send_tags;
+    FabArrayBase::MapOfCopyComTagContainers recv_tags;
+
+    auto const& imap_src = a_bcoefs[0]->IndexArray();
+    auto const& imap_dst = m_fine_masks[clev].IndexArray();
+
+    int const myproc = ParallelDescriptor::MyProc();
+    AMREX_ALWAYS_ASSERT(ParallelDescriptor::TeamSize() == 1);
+
+    if (!(imap_dst.empty() && imap_src.empty())) {
+        auto const nlocal_src = static_cast<int>(imap_src.size());
+        auto const nlocal_dst = static_cast<int>(imap_dst.size());
+
+        std::vector<std::pair<int,Box>> isects;
+        auto const& cpshifts = cperiod.shiftIntVect();
+
+        if (ParallelContext::NProcsSub() > 1) {
+            for (int i = 0; i < nlocal_src; ++i) {
+                int const k_src = imap_src[i];
+                Box const& bx_src = ba_src[k_src];
+                for (auto const& pit : cpshifts) {
+                    Box const& bx_src_shifted = bx_src + pit;
+                    ba_dst.intersections(bx_src_shifted, isects);
+                    for (auto const& is : isects) {
+                        int const k_dst = is.first;
+                        int const dst_owner = dm_dst[k_dst];
+                        if (myproc != dst_owner) { // local copy will be dealt with later
+                            Box const& bx_dst = ba_dst[k_dst];
+                            for (OrientationIter ori; ori; ++ori) {
+                                auto const face = ori();
+                                int const idir = face.coordDir();
+                                auto const ixtype = IntVect::TheDimensionVector(idir);
+                                Box const& face_bx_src_shifted = amrex::convert(bx_src_shifted, ixtype);
+                                Box const& face_bx_dst = amrex::convert(bx_dst, ixtype);
+                                Box const& b = amrex::bdryNode(face_bx_src_shifted, face) & face_bx_dst;
+                                if (b.ok()) {
+                                    send_tags[dst_owner].emplace_back
+                                        (amrex::refine(b,rr), amrex::refine(b-pit,rr), k_dst, k_src);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        for (int i = 0; i < nlocal_dst; ++i) {
+            int const k_dst = imap_dst[i];
+            Box const& bx_dst = ba_dst[k_dst];
+            for (auto const& pit : cpshifts) {
+                Box const& bx_dst_shifted = bx_dst + pit;
+                ba_src.intersections(bx_dst_shifted, isects);
+                for (auto const& is : isects) {
+                    int const k_src = is.first;
+                    int const src_owner = dm_src[k_src];
+                    Box const& bx_src = ba_src[k_src];
+                    auto& tagv = (myproc == src_owner) ? loc_tags : recv_tags[src_owner];
+                    for (OrientationIter ori; ori; ++ori) {
+                        auto const face = ori();
+                        int const idir = face.coordDir();
+                        auto const ixtype = IntVect::TheDimensionVector(idir);
+                        Box const& face_bx_dst_shifted = amrex::convert(bx_dst_shifted, ixtype);
+                        Box const& face_bx_src = amrex::convert(bx_src, ixtype);
+                        Box const& b = amrex::bdryNode(face_bx_src, face) & face_bx_dst_shifted;
+                        if (b.ok()) {
+                            tagv.emplace_back(amrex::refine(b-pit,rr),
+                                              amrex::refine(b,rr), k_dst, k_src);
+                        }
+                    }
+                }
+            }
+        }
+
+        // We need to fix the order so that the send and recv processes match.
+        auto f = [] (Tag const& a, Tag const& b) {
+                     return (a.sbox.ixType() < b.sbox.ixType() )
+                         || ((a.sbox.ixType() == b.sbox.ixType()) && (a < b));
+                 };
+        for (auto& [k, v] : send_tags) {
+            std::sort(v.begin(), v.end(), f);
+        }
+        for (auto& [k, v] : recv_tags) {
+            std::sort(v.begin(), v.end(), f);
+        }
+    }
+
+    if (ParallelContext::NProcsSub() == 1) {
+        commBCoefs_local(flev, a_bcoefs, loc_tags);
+        return;
+    }
+
+#ifdef AMREX_USE_MPI
+    int const mpi_tag = ParallelDescriptor::SeqNum();
+
+    auto const N_snds = int(send_tags.size());
+    auto const N_rcvs = int(recv_tags.size());
+    auto const N_locs = int(loc_tags.size());
+
+    if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) { return; }
+
+    using FA = FabArray<FArrayBox>;
+
+    CommHandler handler{};
+    handler.mpi_tag = mpi_tag;
+    if (N_rcvs > 0) {
+        auto& comm_data = handler.recv;
+        comm_data.the_data = FA::PostRcvs(recv_tags, comm_data.data, comm_data.size, comm_data.rank,
+                                          comm_data.request, ncomp, handler.mpi_tag);
+    }
+
+    if (N_snds > 0) {
+        auto& comm_data = handler.send;
+        comm_data.the_data = FA::PrepareSendBuffers(send_tags, comm_data.data, comm_data.size,
+                                                    comm_data.rank, comm_data.request, comm_data.cctc,
+                                                    ncomp);
+        // pack send buffers
+#ifdef AMREX_USE_GPU
+        Vector<Array4PairTag<Real>> bc_send_tags;
+#endif
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+#pragma omp parallel for
+#endif
+        for (int isend = 0; isend < N_snds; ++isend) {
+            char* dptr = comm_data.data[isend];
+            auto const& cctc = *comm_data.cctc[isend];
+            for (auto const& t : cctc) {
+                Box const& bx = t.sbox;
+                auto const type = bx.ixType();
+#if (AMREX_SPACEDIM == 2)
+                int idir = type.nodeCentered(0) ? 0 : 1;
+#else
+                int idir = type.nodeCentered(0) ? 0 : (type.nodeCentered(1) ? 1 : 2);
+#endif
+                auto const& sfab = (*a_bcoefs[idir])[t.srcIndex];
+#ifdef AMREX_USE_GPU
+                bc_send_tags.emplace_back(Array4PairTag<Real>
+                    {makeArray4<Real>((Real*)dptr, bx, ncomp), sfab.const_array(), bx});
+                std::size_t nbytes = bx.numPts()*sizeof(Real)*ncomp;
+#else
+                auto nbytes = sfab.copyToMem(bx, 0, ncomp, dptr);
+#endif
+                dptr += nbytes;
+            }
+            AMREX_ASSERT(dptr <= comm_data.data[isend] + comm_data.size[isend] && ncomp == 1);
+        }
+
+#ifdef AMREX_USE_GPU
+        amrex::ParallelFor(bc_send_tags,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4PairTag<Real> const& tag)
+        {
+            tag.dfab(i,j,k) = tag.sfab(i,j,k);
+        });
+#endif
+
+        FA::PostSnds(comm_data.data, comm_data.size, comm_data.rank, comm_data.request,
+                     handler.mpi_tag);
+    }
+
+    if (N_locs > 0) {
+        commBCoefs_local(flev, a_bcoefs, loc_tags);
+    }
+
+    if (N_rcvs > 0) {
+        auto& comm_data = handler.recv;
+        comm_data.stats.resize(comm_data.request.size());
+        ParallelDescriptor::Waitall(comm_data.request, comm_data.stats);
+
+        // unpack recv buffers
+#ifdef AMREX_USE_GPU
+        Vector<BCCommTag> bc_recv_tags;
+#endif
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+#pragma omp parallel for
+#endif
+        for (int irecv = 0; irecv < N_rcvs; ++irecv) {
+            char const* dptr = comm_data.data[irecv];
+            auto const& cctc = recv_tags.at(comm_data.rank[irecv]);
+            for (auto const& t : cctc) {
+                Box const& bx = t.dbox;
+                Box const& cbx = amrex::coarsen(bx,rr);
+
+                auto const type = cbx.ixType();
+#if (AMREX_SPACEDIM == 2)
+                int idir = type.nodeCentered(0) ? 0 : 1;
+#else
+                int idir = type.nodeCentered(0) ? 0 : (type.nodeCentered(1) ? 1 : 2);
+#endif
+
+                auto const& fsrc = amrex::makeArray4((Real const*)dptr, bx, ncomp);
+                auto const& offset = offset_bcoefs[idir].const_array(t.dstIndex);
+                auto* pdst = cf_bcoefs[idir][t.dstIndex]->data();
+
+#ifdef AMREX_USE_GPU
+                bc_recv_tags.emplace_back(BCCommTag{fsrc, offset, pdst, cbx, IntVect(0), idir});
+#else
+                IntVect rrface = rr;
+                rrface[idir] = 1;
+                Dim3 rrdim3 = rr.dim3();
+                amrex::LoopOnCpu(cbx, [&] (int i, int j, int k)
+                {
+                    if (offset(i,j,k) >= 0) {
+                        int ii = i*rrdim3.x;
+                        int jj = j*rrdim3.y;
+                        int kk = k*rrdim3.z;
+                        auto* p = pdst + offset(i,j,k);
+#if (AMREX_SPACEDIM == 3)
+                        for (int irz = 0; irz < rrface[2]; ++irz) {
+#else
+                        constexpr int irz = 0;
+#endif
+                        for (int iry = 0; iry < rrface[1]; ++iry) {
+                        for (int irx = 0; irx < rrface[0]; ++irx) {
+                            *p++ = fsrc(ii+irx,jj+iry,kk+irz);
+                        }}
+#if (AMREX_SPACEDIM == 3)
+                        }
+#endif
+                    }
+                });
+#endif
+                dptr += bx.numPts() * ncomp * sizeof(Real);
+            }
+            AMREX_ASSERT(dptr <= comm_data.data[irecv] + comm_data.size[irecv] && ncomp == 1);
+        }
+
+#ifdef AMREX_USE_GPU
+        unpack_bc(bc_recv_tags, rr);
+#endif
+    }
+
+    if (N_snds > 0) {
+        auto& comm_data = handler.send;
+        comm_data.stats.resize(comm_data.request.size());
+        ParallelDescriptor::Waitall(comm_data.request, comm_data.stats);
+    }
+#endif
+}
+
+void HypreMLABecLap::commBCoefs_local (int flev, // fine level; results are stored for level flev-1
+                                       Array<MultiFab const*,AMREX_SPACEDIM> const& a_bcoefs, // source face coefficients, one MultiFab per direction, read at fine indices
+                                       Vector<FabArrayBase::CopyComTag> const& tags) // same-process copies; dbox (dst) and sbox (src) are refined face boxes
+{
+    if (tags.empty()) { return; }
+
+    int const clev = flev-1;
+    auto const& rr = m_ref_ratio[clev]; // ratio used to coarsen dbox and to refine coarse indices below
+
+    auto const& offset_cf_bcoefs = m_offset_cf_bcoefs[clev]; // per coarse face: start index into the flat buffer; negative entries are skipped
+    auto const& cf_bcoefs = m_cf_bcoefs[clev]; // flat destination buffers, indexed [direction][fab index]
+
+#ifdef AMREX_USE_GPU
+    Vector<BCCommTag> bc_local_tags; // GPU build: tags are gathered here and handed to unpack_bc after the loop
+#endif
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+#pragma omp parallel for
+#endif
+    for (int itag = 0; itag < int(tags.size()); ++itag) { // NOLINT(modernize-loop-convert)
+        auto const& tag = tags[itag];
+        Box const& cbx = amrex::coarsen(tag.dbox,rr); // coarse faces covered by this tag, in destination index space
+        IntVect d2s = tag.sbox.smallEnd() - tag.dbox.smallEnd(); // dst-to-src shift (src = dst + d2s); nonzero only for shifted (e.g. periodic) copies
+
+        auto const type = cbx.ixType(); // the single nodal direction of the face box identifies idir
+#if (AMREX_SPACEDIM == 2)
+        int idir = type.nodeCentered(0) ? 0 : 1;
+#else
+        int idir = type.nodeCentered(0) ? 0 : (type.nodeCentered(1) ? 1 : 2);
+#endif
+
+        auto const& fsrc = a_bcoefs[idir]->const_array(tag.srcIndex);
+        auto const& offset = offset_cf_bcoefs[idir].const_array(tag.dstIndex);
+        auto* pdst = cf_bcoefs[idir][tag.dstIndex]->data();
+
+#ifdef AMREX_USE_GPU
+        bc_local_tags.emplace_back(BCCommTag{fsrc, offset, pdst, cbx, d2s, idir}); // NOTE(review): assumes unpack_bc adds d2s to the refined dst index like the CPU loop below - confirm
+#else
+        IntVect rrface = rr;
+        rrface[idir] = 1; // only one layer of fine faces in the normal direction
+        amrex::LoopOnCpu(cbx, [&] (int i, int j, int k)
+        {
+            if (offset(i,j,k) >= 0) { // negative offset: nothing is stored for this coarse face
+                IntVect fiv(AMREX_D_DECL(i*rr[0]+d2s[0],
+                                         j*rr[1]+d2s[1],
+                                         k*rr[2]+d2s[2]));
+                auto* p = pdst + offset(i,j,k); // fine-face values of one coarse face are written contiguously, x fastest
+#if (AMREX_SPACEDIM == 3)
+                for (int irz = 0; irz < rrface[2]; ++irz) {
+#endif
+                for (int iry = 0; iry < rrface[1]; ++iry) {
+                for (int irx = 0; irx < rrface[0]; ++irx) {
+                   *p++ = fsrc(fiv+IntVect(AMREX_D_DECL(irx,iry,irz)));
+                }}
+#if (AMREX_SPACEDIM == 3)
+                }
+#endif
+            }
+        });
+#endif
+    }
+
+#ifdef AMREX_USE_GPU
+    unpack_bc(bc_local_tags, rr); // performs the copies recorded above
+#endif
+}
+
+}
diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H
new file mode 100644
index 00000000000..57a37f19bf3
--- /dev/null
+++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_2D_K.H
@@ -0,0 +1,308 @@
+#ifndef AMREX_HYPRE_ML_ABECLAP_2D_K_H_
+#define AMREX_HYPRE_ML_ABECLAP_2D_K_H_
+
+#include <AMReX_Array.H>
+#include <AMReX_Orientation.H>
+
+namespace amrex {
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void hypmlabeclap_f2c_set_values (IntVect const& cell, Real* values, // 2D: add to values[] the couplings of fine `cell` to coarse cells across its faces
+                                  GpuArray<Real,AMREX_SPACEDIM> const& dx, Real sb, // dx: cell size used in sb/dx^2; sb: scalar multiplying the b term
+                                  GpuArray<Array4<Real const>,AMREX_SPACEDIM> const& b, // face coefficients per direction; a null array means b = 1
+                                  GpuArray<Array4<int const>,AMREX_SPACEDIM*2> const& bmask, // one mask per face, indexed by Orientation
+                                  IntVect const& refratio, int not_covered) // not_covered: mask value marking a cell that is a coarse neighbor
+{
+    Array2D<Real,-1,1,-1,1> tmp; // coefficient accumulator over the 3x3 block of offsets in [-1,1]^2
+    for (auto& x : tmp) { x = Real(0.0); }
+
+    Array2D<int,-1,1,-1,1> used; // number of contributions per offset; nonzero marks an entry to emit
+    for (auto& x : used) { x = 0; }
+
+    for (OrientationIter ori; ori; ++ori) {
+        auto const face = ori();
+        int const idir = face.coordDir();
+        int const idir1 = 1-idir; // the transverse direction
+        IntVect offset(0);
+        offset[idir] = face.isLow() ? -1 : +1; // one cell outward through this face
+        IntVect const cell_out = cell + offset;
+        auto const& msk = bmask[face];
+        if (msk.contains(cell_out) && msk(cell_out) == not_covered) {
+            // There is a coarse cell on the other side of the face. There
+            // are three cases for the coarse cells involved. (1)
+            // Interpolation using 3 coarse cells. (2) Upward biased
+            // interpolation using 2 coarse cells. (3) Downward biased
+            // interpolation using 2 coarse cells. (Here up and down means
+            // the y-direction, if we assume idir is the x-direction.)
+            IntVect offset_t(0);
+            offset_t[idir1] = refratio[idir1]; // transverse step of refratio cells, used to probe the neighboring coarse cells in the mask
+            Real bcoeff = b[idir] ? b[idir](face.isLow() ? cell : cell_out) : Real(1.0); // b is on face
+            Real poly_coef[3];
+            {
+                Real xx[3] = {Real(-0.5)*Real(refratio[idir]), Real(0.5), Real(1.5)}; // nodes in fine-cell units: coarse-side point, then two fine cell centers
+                poly_interp_coeff<3>(Real(-0.5), xx, poly_coef); // presumably interpolation weights at x = -0.5 for nodes xx - confirm against poly_interp_coeff
+            }
+            Real fac = -(sb / (dx[idir]*dx[idir])) * bcoeff * poly_coef[0]; // poly_coef[0] pairs with the coarse-side node xx[0]
+            int const rr1 = refratio[idir1];
+            int const i1 = cell[idir1];
+            int const i1c = amrex::coarsen(i1, rr1);
+            Real xInt = Real(-0.5) + (i1-i1c*rr1+Real(0.5))/Real(rr1); // transverse position of the fine cell center, in coarse-cell units relative to the coarse cell center
+            Real xc[] = {Real(-1.0), Real(0.0), Real(1.0)}; // transverse coarse nodes: lower, center, upper
+            Real c[] = {Real(0.0), Real(0.0), Real(0.0)}; // transverse weights; entries left at zero are not used
+            int cc[] = {0, 0, 0}; // 1 for each transverse coarse cell that takes part
+            if (msk(cell_out-offset_t) == not_covered &&
+                msk(cell_out+offset_t) == not_covered)
+            {
+                poly_interp_coeff<3>(xInt, xc, c); // case (1): all three coarse cells
+                cc[0] = cc[1] = cc[2] = 1;
+            } else if (msk(cell_out+offset_t) == not_covered) {
+                poly_interp_coeff<2>(xInt, &(xc[1]), &(c[1])); // case (2): center and upper cells
+                cc[1] = cc[2] = 1;
+            } else {
+                poly_interp_coeff<2>(xInt, xc, c); // case (3): lower and center cells
+                cc[0] = cc[1] = 1;
+            }
+            if (face == Orientation(0, Orientation::low)) { // scatter into the row/column of tmp on this face's side
+                for (int m = 0; m < 3; ++m) {
+                    tmp(-1,m-1) += c[m] * fac;
+                    used(-1,m-1) += cc[m];
+                }
+            } else if (face == Orientation(0, Orientation::high)) {
+                for (int m = 0; m < 3; ++m) {
+                    tmp(1,m-1) += c[m] * fac;
+                    used(1,m-1) += cc[m];
+                }
+            } else if (face == Orientation(1, Orientation::low)) {
+                for (int m = 0; m < 3; ++m) {
+                    tmp(m-1,-1) += c[m] * fac;
+                    used(m-1,-1) += cc[m];
+                }
+            } else if (face == Orientation(1, Orientation::high)) {
+                for (int m = 0; m < 3; ++m) {
+                    tmp(m-1,1) += c[m] * fac;
+                    used(m-1,1) += cc[m];
+                }
+            }
+        }
+    }
+
+    auto const* ptmp = tmp.begin(); // walk both 3x3 arrays in their storage order
+    auto const* pused = used.begin();
+    for (int m = 0; m < 9; ++m) {
+        if (pused[m]) {
+            (*values) += ptmp[m]; // accumulates; presumably values[] is laid out in the same order by the caller - confirm
+            ++values; // only used entries consume a slot in values[]
+        }
+    }
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void hypmlabeclap_c2f (int i, int j, int k, // 2D: for coarse cell (i,j,k), adjust its stencil and fill its non-stencil entries to fine cells
+                       Array4<GpuArray<Real,2*AMREX_SPACEDIM+1>> const& stencil, // as used below: [0] diagonal, [1] i-1, [2] i+1, [3] j-1, [4] j+1
+                       GpuArray<HYPRE_Int,AMREX_SPACEDIM>* civ, HYPRE_Int* nentries, // outputs per coupled coarse cell: its index and entry count
+                       int* entry_offset, Real* entry_values, // outputs: start of the cell's entries and the entry values themselves
+                       Array4<int const> const& offset_from, // slot of this cell in civ/nentries/entry_offset
+                       Array4<int const> const& nentries_to, // number of non-stencil entries of this cell; 0 means no fine neighbor coupling
+                       Array4<int const> const& offset_to, // first index of this cell's entries in entry_values
+                       GpuArray<Real,AMREX_SPACEDIM> const& dx, Real sb, // dx: coarse cell size; sb: scalar multiplying the b term
+                       Array4<int const> const& offset_bx, // start index into bx for the fine faces on an x-face of a coarse cell
+                       Array4<int const> const& offset_by, // start index into by for the fine faces on a y-face of a coarse cell
+                       Real const* bx, Real const* by, // flat fine-face coefficients; a null pointer means b = 1
+                       Array4<int const> const& fine_mask, // nonzero where the coarse cell is covered by fine cells
+                       IntVect const& rr) // refinement ratio
+{
+    if (fine_mask(i,j,k)) {
+        // Let's set off-diagonal elements to zero
+        for (int m = 1; m < 2*AMREX_SPACEDIM+1; ++m) {
+            stencil(i,j,k)[m] = Real(0.0);
+        }
+    } else if (nentries_to(i,j,k) > 0) {
+        int const fromoff = offset_from(i,j,k);
+        civ[fromoff][0] = i;
+        civ[fromoff][1] = j;
+        nentries[fromoff] = nentries_to(i,j,k);
+        int foff = offset_to(i,j,k); // running write position in entry_values
+        entry_offset[fromoff] = foff;
+
+        // We must iterate the faces in the lexicographical order of fine
+        // neighbor cells, because that's the order when non-stencil entries
+        // were added to Hypre's graph. Also note that a coarse cell will
+        // not have entries to fine cells at both ends of a direction. Thus
+        // we do not have to worry about the order between fine cells at the
+        // small and big ends of the same direction.
+
+        if (fine_mask(i,j-1,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[3]; // undo the coarse-coarse coupling across this face
+            stencil(i,j,k)[3] = Real(0.0);
+            // Reflux: sb/h^2*by(i,j,k)*(phi(i,j,k)-phi(i,j-1,k)) is replaced by
+            // sb/h*sum_{fine faces}(dphi/dy*by)/n_fine_faces
+            Real dyf = dx[1] / Real(rr[1]);
+            Real dycinv = Real(1.0) / dx[1];
+            Real dyfinv = Real(1.0) / dyf;
+            Real cc[3]; // weights for {coarse cell, nearest fine cell, next fine cell}
+            Real yy[3] = {dx[1]*Real(-0.5), dyf*Real(0.5), dyf*Real(1.5)}; // node distances from the coarse/fine face
+            poly_interp_coeff<3>(dyf*Real(-0.5), yy, cc);
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real bym = by ? by[offset_by(i,j,k)+irx] : Real(1.0);
+                Real fac = sb*dycinv*dyfinv*bym/Real(rr[0]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1]
+                // dphi/dy = (phi_interp - phi_fine(jj-1)) / dy_fine
+                //         = (phi_coarse*cc[0] + phi_fine(jj-1)*(cc[1]-1)
+                //                             + phi_fine(jj-2)* cc[2]) / dy_fine
+                // So the entry for fine cell (jj-1) is (cc[1]-1)*fac
+                //                  fine cell (jj-2) is  cc[2]   *fac
+                entry_values[foff+irx      ] += fac* cc[2]; // NOTE(review): y-faces use += while x-faces use =; equivalent only if entry_values starts zeroed - confirm
+                entry_values[foff+irx+rr[0]] += fac*(cc[1]-Real(1.0)); // row jj-1 follows row jj-2, each rr[0] entries long
+
+                // The coarse cell's stencils need updates too.
+                Real xInt = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]); // transverse position of the fine face center in coarse-cell units
+                Real xc[3] = {Real(-1.0), Real(0.0), Real(1.0)};
+                Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)};
+                if (fine_mask(i-1,j,k)) {
+                    poly_interp_coeff<2>(xInt, &(xc[1]), &(ct[1])); // i-1 is covered: use cells i and i+1 only
+                } else if (fine_mask(i+1,j,k)) {
+                    poly_interp_coeff<2>(xInt, xc, ct); // i+1 is covered: use cells i-1 and i only
+                } else {
+                    poly_interp_coeff<3>(xInt, xc, ct);
+                }
+                // phi_coarse = ct[0]*phi(i-1) + ct[1]*phi(i) + ct[2]*phi(i+1)
+                stencil(i,j,k)[0] += (fac*cc[0])*ct[1];
+                stencil(i,j,k)[1] += (fac*cc[0])*ct[0];
+                stencil(i,j,k)[2] += (fac*cc[0])*ct[2];
+            }
+            foff += 2*rr[0]; // step past the two rows of rr[0] entries written above
+        }
+
+        if (fine_mask(i-1,j,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[1]; // undo the coarse-coarse coupling across this face
+            stencil(i,j,k)[1] = Real(0.0);
+            // Reflux: sb/h^2*bx(i,j,k)*(phi(i,j,k)-phi(i-1,j,k)) is replaced by
+            // sb/h*sum_{fine faces}(dphi/dx*bx).
+            Real dxf = dx[0] / Real(rr[0]);
+            Real dxcinv = Real(1.0) / dx[0];
+            Real dxfinv = Real(1.0) / dxf;
+            Real cc[3]; // weights for {coarse cell, nearest fine cell, next fine cell}
+            Real xx[3] = {dx[0]*Real(-0.5), dxf*Real(0.5), dxf*Real(1.5)};
+            poly_interp_coeff<3>(dxf*Real(-0.5), xx, cc);
+            for (int iry = 0; iry < rr[1]; ++iry) {
+                Real bxm = bx ? bx[offset_bx(i,j,k)+iry] : Real(1.0);
+                Real fac = sb*dxcinv*dxfinv*bxm/Real(rr[1]);
+                // int ii = i*rr[0]
+                // int jj = j*rr[1] + iry
+                // dphi/dx = (phi_interp - phi_fine(ii-1)) / dx_fine
+                //         = (phi_coarse*cc[0] + phi_fine(ii-1)*(cc[1]-1)
+                //                             + phi_fine(ii-2)* cc[2]) / dx_fine
+                // So the entry for fine cell(ii-1) is (cc[1]-1)*fac
+                //                  fine cell(ii-2) is  cc[2]   *fac
+                entry_values[foff++] = fac* cc[2]; // farther cell (ii-2) first, then ii-1
+                entry_values[foff++] = fac*(cc[1] - Real(1.0));
+
+                // The coarse cell's stencils need updates too.
+                Real yInt = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real yc[3] = {Real(-1.0), Real(0.0), Real(1.0)};
+                Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)};
+                if (fine_mask(i,j-1,k)) {
+                    poly_interp_coeff<2>(yInt, &(yc[1]), &(ct[1])); // j-1 is covered: use cells j and j+1 only
+                } else if (fine_mask(i,j+1,k)) {
+                    poly_interp_coeff<2>(yInt, yc, ct); // j+1 is covered: use cells j-1 and j only
+                } else {
+                    poly_interp_coeff<3>(yInt, yc, ct);
+                }
+                // phi_coarse = ct[0]*phi(j-1) + ct[1]*phi(j) + ct[2]*phi(j+1)
+                stencil(i,j,k)[0] += (fac*cc[0])*ct[1];
+                stencil(i,j,k)[3] += (fac*cc[0])*ct[0];
+                stencil(i,j,k)[4] += (fac*cc[0])*ct[2];
+            }
+        }
+
+        if (fine_mask(i+1,j,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[2]; // undo the coarse-coarse coupling across this face
+            stencil(i,j,k)[2] = Real(0.0);
+            // Reflux: sb/h^2*bx(i+1,j,k)*(phi(i,j,k)-phi(i+1,j,k)) is replaced by
+            // sb/h*sum_{fine faces}(-dphi/dx*bx).
+            Real dxf = dx[0] / Real(rr[0]);
+            Real dxcinv = Real(1.0) / dx[0];
+            Real dxfinv = Real(1.0) / dxf;
+            Real cc[3]; // weights for {coarse cell, nearest fine cell, next fine cell}
+            Real xx[3] = {dx[0]*Real(-0.5), dxf*Real(0.5), dxf*Real(1.5)};
+            poly_interp_coeff<3>(dxf*Real(-0.5), xx, cc);
+            for (int iry = 0; iry < rr[1]; ++iry) {
+                Real bxp = bx ? bx[offset_bx(i+1,j,k)+iry] : Real(1.0); // the high x-face of cell i is indexed as face i+1
+                Real fac = sb*dxcinv*dxfinv*bxp/Real(rr[1]);
+                // int ii = i*rr[0] + (rr[0]-1)
+                // int jj = j*rr[1] + iry
+                // -dphi/dx = (phi_interp - phi_fine(ii+1)) / dx_fine
+                //          = (phi_coarse*cc[0] + phi_fine(ii+1)*(cc[1]-1)
+                //                              + phi_fine(ii+2)* cc[2]) / dx_fine
+                // So the entry for fine cell(ii+1) is (cc[1]-1)*fac
+                //                  fine cell(ii+2) is  cc[2]   *fac
+                entry_values[foff++] = fac*(cc[1] - Real(1.0)); // nearer cell (ii+1) first, then ii+2
+                entry_values[foff++] = fac* cc[2];
+
+                // The coarse cell's stencils need updates too.
+                Real yInt = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real yc[3] = {Real(-1.0), Real(0.0), Real(1.0)};
+                Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)};
+                if (fine_mask(i,j-1,k)) {
+                    poly_interp_coeff<2>(yInt, &(yc[1]), &(ct[1])); // j-1 is covered: use cells j and j+1 only
+                } else if (fine_mask(i,j+1,k)) {
+                    poly_interp_coeff<2>(yInt, yc, ct); // j+1 is covered: use cells j-1 and j only
+                } else {
+                    poly_interp_coeff<3>(yInt, yc, ct);
+                }
+                // phi_coarse = ct[0]*phi(j-1) + ct[1]*phi(j) + ct[2]*phi(j+1)
+                stencil(i,j,k)[0] += (fac*cc[0])*ct[1];
+                stencil(i,j,k)[3] += (fac*cc[0])*ct[0];
+                stencil(i,j,k)[4] += (fac*cc[0])*ct[2];
+            }
+        }
+
+        if (fine_mask(i,j+1,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[4]; // undo the coarse-coarse coupling across this face
+            stencil(i,j,k)[4] = Real(0.0);
+            // Reflux: sb/h^2*by(i,j+1,k)*(phi(i,j,k)-phi(i,j+1,k)) is replaced by
+            // sb/h*sum_{fine faces}(-dphi/dy*by)
+            Real dyf = dx[1] / Real(rr[1]);
+            Real dycinv = Real(1.0) / dx[1];
+            Real dyfinv = Real(1.0) / dyf;
+            Real cc[3]; // weights for {coarse cell, nearest fine cell, next fine cell}
+            Real yy[3] = {dx[1]*Real(-0.5), dyf*Real(0.5), dyf*Real(1.5)};
+            poly_interp_coeff<3>(dyf*Real(-0.5), yy, cc);
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real byp = by ? by[offset_by(i,j+1,k)+irx] : Real(1.0); // the high y-face of cell j is indexed as face j+1
+                Real fac = sb*dycinv*dyfinv*byp/Real(rr[0]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1] + (rr[1]-1)
+                // -dphi/dy = (phi_interp - phi_fine(jj+1)) / dy_fine
+                //          = (phi_coarse*cc[0] + phi_fine(jj+1)*(cc[1]-1)
+                //                              + phi_fine(jj+2)* cc[2]) / dy_fine
+                // So the entry for fine cell (jj+1) is (cc[1]-1)*fac
+                //                  fine cell (jj+2) is  cc[2]   *fac
+                entry_values[foff+irx      ] += fac*(cc[1]-Real(1.0)); // row jj+1 comes first, row jj+2 follows rr[0] entries later
+                entry_values[foff+irx+rr[0]] += fac* cc[2];
+
+                // The coarse cell's stencils need updates too.
+                Real xInt = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]);
+                Real xc[3] = {Real(-1.0), Real(0.0), Real(1.0)};
+                Real ct[3] = {Real(0.0), Real(0.0), Real(0.0)};
+                if (fine_mask(i-1,j,k)) {
+                    poly_interp_coeff<2>(xInt, &(xc[1]), &(ct[1])); // i-1 is covered: use cells i and i+1 only
+                } else if (fine_mask(i+1,j,k)) {
+                    poly_interp_coeff<2>(xInt, xc, ct); // i+1 is covered: use cells i-1 and i only
+                } else {
+                    poly_interp_coeff<3>(xInt, xc, ct);
+                }
+                // phi_coarse = ct[0]*phi(i-1) + ct[1]*phi(i) + ct[2]*phi(i+1)
+                stencil(i,j,k)[0] += (fac*cc[0])*ct[1];
+                stencil(i,j,k)[1] += (fac*cc[0])*ct[0];
+                stencil(i,j,k)[2] += (fac*cc[0])*ct[2];
+            }
+            // not needed: foff += 2*rr[0];
+        }
+    }
+}
+
+}
+
+#endif
diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H
new file mode 100644
index 00000000000..8e6e1a39b14
--- /dev/null
+++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_3D_K.H
@@ -0,0 +1,649 @@
+#ifndef AMREX_HYPRE_ML_ABECLAP_3D_K_H_
+#define AMREX_HYPRE_ML_ABECLAP_3D_K_H_
+
+namespace amrex {
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void hypmlabeclap_f2c_set_values (IntVect const& cell, Real* values, // 3D: add to values[] the couplings of fine `cell` to coarse cells across its faces
+                                  GpuArray<Real,AMREX_SPACEDIM> const& dx, Real sb, // dx: cell size used in sb/dx^2; sb: scalar multiplying the b term
+                                  GpuArray<Array4<Real const>,AMREX_SPACEDIM> const& b, // face coefficients per direction; a null array means b = 1
+                                  GpuArray<Array4<int const>,AMREX_SPACEDIM*2> const& bmask, // one mask per face, indexed by Orientation
+                                  IntVect const& refratio, int not_covered) // not_covered: mask value marking a cell that is a coarse neighbor
+{
+    Array3D<Real,-1,1,-1,1,-1,1> tmp; // coefficient accumulator over the 3x3x3 block of offsets in [-1,1]^3
+    for (auto& x : tmp) { x = Real(0.0); }
+
+    Array3D<bool,-1,1,-1,1,-1,1> used; // true for every offset that receives a contribution
+    for (auto& x : used) { x = false; }
+
+    for (OrientationIter ori; ori; ++ori) {
+        auto const face = ori();
+        int const idir = face.coordDir();
+        int const idir1 = (idir+1 < AMREX_SPACEDIM) ? idir+1 : idir+1-AMREX_SPACEDIM; // first transverse direction (cyclic)
+        int const idir2 = (idir+2 < AMREX_SPACEDIM) ? idir+2 : idir+2-AMREX_SPACEDIM; // second transverse direction (cyclic)
+        IntVect offset(0);
+        offset[idir] = face.isLow() ? -1 : +1; // one cell outward through this face
+        IntVect const cell_out = cell + offset;
+        auto const& msk = bmask[face];
+        if (msk.contains(cell_out) && msk(cell_out) == not_covered) {
+            // There is a coarse cell on the other side of the face.
+            int const rr1 = refratio[idir1];
+            int const rr2 = refratio[idir2];
+            IntVect offset_t1(0); // unit transverse steps index into tmp/used
+            IntVect offset_t2(0);
+            IntVect offset_tr1(0); // refratio-sized transverse steps probe the mask
+            IntVect offset_tr2(0);
+            offset_t1 [idir1] = 1;
+            offset_t2 [idir2] = 1;
+            offset_tr1[idir1] = rr1;
+            offset_tr2[idir2] = rr2;
+
+            Real bcoeff = b[idir] ? b[idir](face.isLow() ? cell : cell_out) : Real(1.0); // b is on face
+            Real poly_coef[3];
+            {
+                Real xx[3] = {Real(-0.5)*Real(refratio[idir]), Real(0.5), Real(1.5)}; // nodes in fine-cell units: coarse-side point, then two fine cell centers
+                poly_interp_coeff<3>(Real(-0.5), xx, poly_coef); // presumably interpolation weights at x = -0.5 for nodes xx - confirm against poly_interp_coeff
+            }
+            Real fac = -(sb / (dx[idir]*dx[idir])) * bcoeff * poly_coef[0]; // poly_coef[0] pairs with the coarse-side node xx[0]
+
+            used(offset[0],offset[1],offset[2]) = true; // the coarse cell directly across the face always takes part
+            tmp (offset[0],offset[1],offset[2]) = fac; // starts with full weight; transverse corrections below adjust it
+
+            int it  = cell[idir1];
+            int itc = amrex::coarsen(it, rr1);
+            Real const xt1 = Real(-0.5) + (it-itc*rr1+Real(0.5))/Real(rr1); // position along idir1 in coarse-cell units relative to the coarse cell center
+            it = cell[idir2];
+            itc = amrex::coarsen(it, rr2);
+            Real const xt2 = Real(-0.5) + (it-itc*rr2+Real(0.5))/Real(rr2); // same along idir2
+
+            if (msk(cell_out-offset_tr1) == not_covered &&
+                msk(cell_out+offset_tr1) == not_covered)
+            { // both idir1 neighbors available: three-point weights for nodes -1, 0, +1
+                IntVect iv = offset - offset_t1;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*(Real(-0.5)*xt1 + Real(0.5)*xt1*xt1);
+
+                iv = offset + offset_t1;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*(Real(0.5)*xt1 + Real(0.5)*xt1*xt1);
+
+                tmp(offset[0],offset[1],offset[2]) -= fac*(xt1*xt1);
+            }
+            else if (msk(cell_out+offset_tr1) == not_covered)
+            { // only the upper idir1 neighbor: two-point weights between center and upper
+                IntVect iv = offset + offset_t1;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*xt1;
+
+                tmp(offset[0],offset[1],offset[2]) -= fac*xt1;
+            }
+            else
+            { // otherwise: two-point weights between lower and center
+                IntVect iv = offset - offset_t1;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) -= fac*xt1;
+
+                tmp(offset[0],offset[1],offset[2]) += fac*xt1;
+            }
+
+            if (msk(cell_out-offset_tr2) == not_covered &&
+                msk(cell_out+offset_tr2) == not_covered)
+            { // same three cases, now along idir2
+                IntVect iv = offset - offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*(Real(-0.5)*xt2 + Real(0.5)*xt2*xt2);
+
+                iv = offset + offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*(Real(0.5)*xt2 + Real(0.5)*xt2*xt2);
+
+                tmp(offset[0],offset[1],offset[2]) -= fac*(xt2*xt2);
+            }
+            else if (msk(cell_out+offset_tr2) == not_covered)
+            {
+                IntVect iv = offset + offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += fac*xt2;
+
+                tmp(offset[0],offset[1],offset[2]) -= fac*xt2;
+            }
+            else
+            {
+                IntVect iv = offset - offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) -= fac*xt2;
+
+                tmp(offset[0],offset[1],offset[2]) += fac*xt2;
+            }
+
+            if (msk(cell_out-offset_tr1-offset_tr2) == not_covered &&
+                msk(cell_out+offset_tr1-offset_tr2) == not_covered &&
+                msk(cell_out-offset_tr1+offset_tr2) == not_covered &&
+                msk(cell_out+offset_tr1+offset_tr2) == not_covered)
+            { // cross term xt1*xt2, added only when all four diagonal coarse cells are available
+                Real tmp2 = fac*xt1*xt2*Real(0.25);
+
+                IntVect iv = offset - offset_t1 - offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += tmp2;
+
+                iv = offset + offset_t1 + offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) += tmp2;
+
+                iv = offset - offset_t1 + offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) -= tmp2;
+
+                iv = offset + offset_t1 - offset_t2;
+                used(iv[0],iv[1],iv[2]) = true;
+                tmp(iv[0],iv[1],iv[2]) -= tmp2;
+            }
+        }
+    }
+
+    auto const* ptmp = tmp.begin(); // walk both 3x3x3 arrays in their storage order
+    auto const* pused = used.begin();
+    for (int m = 0; m < 27; ++m) {
+        if (pused[m]) {
+            (*values) += ptmp[m]; // accumulates; presumably values[] is laid out in the same order by the caller - confirm
+            ++values; // only used entries consume a slot in values[]
+        }
+    }
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void hypmlabeclap_c2f (int i, int j, int k,
+                       Array4<GpuArray<Real,2*AMREX_SPACEDIM+1>> const& stencil,
+                       GpuArray<HYPRE_Int,AMREX_SPACEDIM>* civ, HYPRE_Int* nentries,
+                       int* entry_offset, Real* entry_values,
+                       Array4<int const> const& offset_from,
+                       Array4<int const> const& nentries_to,
+                       Array4<int const> const& offset_to,
+                       GpuArray<Real,AMREX_SPACEDIM> const& dx, Real sb,
+                       Array4<int const> const& offset_bx,
+                       Array4<int const> const& offset_by,
+                       Array4<int const> const& offset_bz,
+                       Real const* bx, Real const* by, Real const* bz,
+                       Array4<int const> const& fine_mask,
+                       IntVect const& rr)
+{
+    if (fine_mask(i,j,k)) {
+        // Let's set off-diagonal elements to zero
+        for (int m = 1; m < 2*AMREX_SPACEDIM+1; ++m) {
+            stencil(i,j,k)[m] = Real(0.0);
+        }
+    } else if (nentries_to(i,j,k) > 0) {
+        int const fromoff = offset_from(i,j,k);
+        civ[fromoff][0] = i;
+        civ[fromoff][1] = j;
+        civ[fromoff][2] = k;
+        nentries[fromoff] = nentries_to(i,j,k);
+        int const tooff = offset_to(i,j,k);
+        entry_offset[fromoff] = tooff;
+
+        // First, we need to figure out how many corner coarse cells are
+        // involved. The coarse cell entries must be stored ahead of fine
+        // cell entries because that's how we sorted the entries when
+        // building the hypre graph.
+        bool corner[3] = {false, false, false};
+        if ((fine_mask(i-1,j,k) || fine_mask(i+1,j,k)) &&
+            (! fine_mask(i,j-1,k-1)) &&
+            (! fine_mask(i,j+1,k-1)) &&
+            (! fine_mask(i,j-1,k+1)) &&
+            (! fine_mask(i,j+1,k+1)))
+        {
+            corner[0] = true;
+        }
+        if((fine_mask(i,j-1,k) || fine_mask(i,j+1,k)) &&
+            (! fine_mask(i-1,j,k-1)) &&
+            (! fine_mask(i+1,j,k-1)) &&
+            (! fine_mask(i-1,j,k+1)) &&
+            (! fine_mask(i+1,j,k+1)))
+        {
+            corner[1] = true;
+        }
+        if((fine_mask(i,j,k-1) || fine_mask(i,j,k+1)) &&
+            (! fine_mask(i-1,j-1,k)) &&
+            (! fine_mask(i+1,j-1,k)) &&
+            (! fine_mask(i-1,j+1,k)) &&
+            (! fine_mask(i+1,j+1,k)))
+        {
+            corner[2] = true;
+        }
+        int nentries_c = 4 * (int(corner[0]) + int(corner[1]) + int(corner[2]));
+        int foff = tooff + nentries_c;
+
+        // We must iterate the faces in the lexicographical order of fine
+        // neighbor cells, because that's the order when non-stencil entries
+        // were added to Hypre's graph. Also note that a coarse cell will
+        // not have entries to fine cells at both ends of a direction. Thus
+        // we do not have to worry about the order between fine cells at the
+        // small and big ends of the same direction.
+
+        if (fine_mask(i,j,k-1)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[5];
+            stencil(i,j,k)[5] = Real(0.0);
+            // Reflux: sb/h^2*bz(i,j,k)*(phi(i,j,k)-phi(i,j,k-1)) is replaced by
+            // sb/h*sum_{fine_faces}(dphi/dz*bz)/n_fine_faces
+            Real dzf = dx[2] / Real(rr[2]);
+            Real dzcinv = Real(1.0) / dx[2];
+            Real dzfinv = Real(1.0) / dzf;
+            Real cc[3];
+            Real zz[3] = {dx[2]*Real(-0.5), dzf*Real(0.5), dzf*Real(1.5)};
+            poly_interp_coeff<3>(dzf*Real(-0.5), zz, cc);
+            for (int iry = 0; iry < rr[1]; ++iry) {
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real bzm = bz ? bz[offset_bz(i,j,k)+irx+iry*rr[0]] : Real(1.0);
+                Real fac = sb*dzcinv*dzfinv*bzm/Real(rr[0]*rr[1]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1] + iry
+                // int kk = k*rr[2]
+                // dphi/dz = (phi_interp - phi_fine(kk-1))/dz_fine
+                //         = (phi_coarse*cc[0] + phi_fine(kk-1)*(cc[1]-1)
+                //                             + phi_fine(kk-2)* cc[2]) / dz_fine
+                // So the entry for fine cell (kk-1) is (cc[1]-1)*fac
+                //                  fine cell (kk-2) is  cc[2]   *fac
+                entry_values[foff+irx+iry*rr[0]            ] += fac* cc[2];
+                entry_values[foff+irx+iry*rr[0]+rr[0]*rr[1]] += fac*(cc[1]-Real(1.0));
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real x = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]);
+                Real y = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) {
+                    s0 -= x*x;
+                    stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0));
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0));
+                } else if (!fine_mask(i-1,j,k)) {
+                    s0 += Real(0.5)*x;
+                    stencil(i,j,k)[1] += fac0*Real(-0.5)*x;
+                } else {
+                    s0 += Real(-0.5)*x;
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x;
+                }
+
+                if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) {
+                    s0 -= y*y;
+                    stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0));
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0));
+                } else if (!fine_mask(i,j-1,k)) {
+                    s0 += Real(0.5)*y;
+                    stencil(i,j,k)[3] += fac0*Real(-0.5)*y;
+                } else {
+                    s0 += Real(-0.5)*y;
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[2]) {
+                    int offset = tooff + (corner[0] ? 2 : 0) + (corner[1] ? 2 : 0);
+                    entry_values[offset++] += fac0*Real( 0.25)*x*y;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*y;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*y;
+                    entry_values[offset  ] += fac0*Real( 0.25)*x*y;
+                }
+            }}
+            foff += 2*rr[0]*rr[1];
+        }
+
+        if (fine_mask(i,j-1,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[3];
+            stencil(i,j,k)[3] = Real(0.0);
+            // Reflux: sb/h^2*by(i,j,k)*(phi(i,j,k)-phi(i,j-1,k)) is replaced by
+            // sb/h*sum_{fine faces}(dphi/dy*by)/n_fine_faces
+            Real dyf = dx[1] / Real(rr[1]);
+            Real dycinv = Real(1.0) / dx[1];
+            Real dyfinv = Real(1.0) / dyf;
+            Real cc[3];
+            Real yy[3] = {dx[1]*Real(-0.5), dyf*Real(0.5), dyf*Real(1.5)};
+            poly_interp_coeff<3>(dyf*Real(-0.5), yy, cc);
+            for (int irz = 0; irz < rr[2]; ++irz) {
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real bym = by ? by[offset_by(i,j,k)+irx+irz*rr[0]] : Real(1.0);
+                Real fac = sb*dycinv*dyfinv*bym/Real(rr[0]*rr[2]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1]
+                // int kk = k*rr[2] + irz
+                // dphi/dy = (phi_interp - phi_fine(jj-1)) / dy_fine
+                //         = (phi_coarse*cc[0] + phi_fine(jj-1)*(cc[1]-1)
+                //                             + phi_fine(jj-2)* cc[2]) / dy_fine
+                // So the entry for fine cell (jj-1) is (cc[1]-1)*fac
+                //                  fine cell (jj-2) is  cc[2]   *fac
+                entry_values[foff+irx      +irz*rr[0]*2] += fac* cc[2];
+                entry_values[foff+irx+rr[0]+irz*rr[0]*2] += fac*(cc[1]-Real(1.0));
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real x = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]);
+                Real z = Real(-0.5) + (irz+Real(0.5))/Real(rr[2]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) {
+                    s0 -= x*x;
+                    stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0));
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0));
+                } else if (!fine_mask(i-1,j,k)) {
+                    s0 += Real(0.5)*x;
+                    stencil(i,j,k)[1] += fac0*Real(-0.5)*x;
+                } else {
+                    s0 += Real(-0.5)*x;
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x;
+                }
+
+                if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) {
+                    s0 -= z*z;
+                    stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0));
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0));
+                } else if (!fine_mask(i,j,k-1)) {
+                    s0 += Real(0.5)*z;
+                    stencil(i,j,k)[5] += fac0*Real(-0.5)*z;
+                } else {
+                    s0 += Real(-0.5)*z;
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[1]) {
+                    int offset = tooff + (corner[0] ? 1 : 0);
+                    entry_values[offset++] += fac0*Real( 0.25)*x*z;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*z;
+                    if (corner[0]) { offset += 2; }
+                    if (corner[2]) { offset += 4; }
+                    entry_values[offset++] += fac0*Real(-0.25)*x*z;
+                    entry_values[offset  ] += fac0*Real( 0.25)*x*z;
+                }
+            }}
+            foff += 2*rr[0]*rr[2];
+        }
+
+        if (fine_mask(i-1,j,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[1];
+            stencil(i,j,k)[1] = Real(0.0);
+            // Reflux: sb/h^2*bx(i,j,k)*(phi(i,j,k)-phi(i-1,j,k)) is replaced by
+            // sb/h*sum_{fine faces}(dphi/dx*bx).
+            Real dxf = dx[0] / Real(rr[0]);
+            Real dxcinv = Real(1.0) / dx[0];
+            Real dxfinv = Real(1.0) / dxf;
+            Real cc[3];
+            Real xx[3] = {dx[0]*Real(-0.5), dxf*Real(0.5), dxf*Real(1.5)};
+            poly_interp_coeff<3>(dxf*Real(-0.5), xx, cc);
+            for (int irz = 0; irz < rr[2]; ++irz) {
+            for (int iry = 0; iry < rr[1]; ++iry) {
+                Real bxm = bx ? bx[offset_bx(i,j,k)+iry+irz*rr[1]] : Real(1.0);
+                Real fac = sb*dxcinv*dxfinv*bxm/Real(rr[1]*rr[2]);
+                // int ii = i*rr[0]
+                // int jj = j*rr[1] + iry
+                // int kk = k*rr[2]
+                // dphi/dx = (phi_interp - phi_fine(ii-1)) / dx_fine
+                //         = (phi_coarse*cc[0] + phi_fine(ii-1)*(cc[1]-1)
+                //                             + phi_fine(ii-2)* cc[2]) / dx_fine
+                // So the entry for fine cell(ii-1) is (cc[1]-1)*fac
+                //                  fine cell(ii-2) is  cc[2]   *fac
+                entry_values[foff++] = fac* cc[2];
+                entry_values[foff++] = fac*(cc[1] - Real(1.0));
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real y = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real z = Real(-0.5) + (irz+Real(0.5))/Real(rr[2]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) {
+                    s0 -= y*y;
+                    stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0));
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0));
+                } else if (!fine_mask(i,j-1,k)) {
+                    s0 += Real(0.5)*y;
+                    stencil(i,j,k)[3] += fac0*Real(-0.5)*y;
+                } else {
+                    s0 += Real(-0.5)*y;
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y;
+                }
+
+                if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) {
+                    s0 -= z*z;
+                    stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0));
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0));
+                } else if (!fine_mask(i,j,k-1)) {
+                    s0 += Real(0.5)*z;
+                    stencil(i,j,k)[5] += fac0*Real(-0.5)*z;
+                } else {
+                    s0 += Real(-0.5)*z;
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[0]) {
+                    int offset = tooff;
+                    entry_values[offset++] += fac0*Real( 0.25)*y*z;
+                    if (corner[1]) { offset += 2; }
+                    entry_values[offset++] += fac0*Real(-0.25)*y*z;
+                    if (corner[2]) { offset += 4; }
+                    entry_values[offset++] += fac0*Real(-0.25)*y*z;
+                    if (corner[1]) { offset += 2; }
+                    entry_values[offset  ] += fac0*Real( 0.25)*y*z;
+                }
+            }}
+        }
+
+        if (fine_mask(i+1,j,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[2];
+            stencil(i,j,k)[2] = Real(0.0);
+            // Reflux: sb/h^2*bx(i+1,j,k)*(phi(i,j,k)-phi(i+1,j,k)) is replaced by
+            // sb/h*sum_{fine faces}(-dphi/dx*bx).
+            Real dxf = dx[0] / Real(rr[0]);
+            Real dxcinv = Real(1.0) / dx[0];
+            Real dxfinv = Real(1.0) / dxf;
+            Real cc[3];
+            Real xx[3] = {dx[0]*Real(-0.5), dxf*Real(0.5), dxf*Real(1.5)};
+            poly_interp_coeff<3>(dxf*Real(-0.5), xx, cc);
+            for (int irz = 0; irz < rr[2]; ++irz) {
+            for (int iry = 0; iry < rr[1]; ++iry) {
+                Real bxp = bx ? bx[offset_bx(i+1,j,k)+iry+irz*rr[1]] : Real(1.0);
+                Real fac = sb*dxcinv*dxfinv*bxp/Real(rr[1]*rr[2]);
+                // int ii = i*rr[0] + (rr[0]-1)
+                // int jj = j*rr[1] + iry
+                // -dphi/dx = (phi_interp - phi_fine(ii+1)) / dx_fine
+                //          = (phi_coarse*cc[0] + phi_fine(ii+1)*(cc[1]-1)
+                //                              + phi_fine(ii+2)* cc[2]) / dx_fine
+                // So the entry for fine cell(ii+1) is (cc[1]-1)*fac
+                //                  fine cell(ii+2) is  cc[2]   *fac
+                entry_values[foff++] = fac*(cc[1] - Real(1.0));
+                entry_values[foff++] = fac* cc[2];
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real y = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real z = Real(-0.5) + (irz+Real(0.5))/Real(rr[2]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) {
+                    s0 -= y*y;
+                    stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0));
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0));
+                } else if (!fine_mask(i,j-1,k)) {
+                    s0 += Real(0.5)*y;
+                    stencil(i,j,k)[3] += fac0*Real(-0.5)*y;
+                } else {
+                    s0 += Real(-0.5)*y;
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y;
+                }
+
+                if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) {
+                    s0 -= z*z;
+                    stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0));
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0));
+                } else if (!fine_mask(i,j,k-1)) {
+                    s0 += Real(0.5)*z;
+                    stencil(i,j,k)[5] += fac0*Real(-0.5)*z;
+                } else {
+                    s0 += Real(-0.5)*z;
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[0]) {
+                    int offset = tooff;
+                    entry_values[offset++] += fac0*Real( 0.25)*y*z;
+                    if (corner[1]) { offset += 2; }
+                    entry_values[offset++] += fac0*Real(-0.25)*y*z;
+                    if (corner[2]) { offset += 4; }
+                    entry_values[offset++] += fac0*Real(-0.25)*y*z;
+                    if (corner[1]) { offset += 2; }
+                    entry_values[offset  ] += fac0*Real( 0.25)*y*z;
+                }
+            }}
+        }
+
+        if (fine_mask(i,j+1,k)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[4];
+            stencil(i,j,k)[4] = Real(0.0);
+            // Reflux: sb/h^2*by(i,j+1,k)*(phi(i,j,k)-phi(i,j+1,k)) is replaced by
+            // sb/h*sum_{fine faces}(-dphi/dy*by)
+            Real dyf = dx[1] / Real(rr[1]);
+            Real dycinv = Real(1.0) / dx[1];
+            Real dyfinv = Real(1.0) / dyf;
+            Real cc[3];
+            Real yy[3] = {dx[1]*Real(-0.5), dyf*Real(0.5), dyf*Real(1.5)};
+            poly_interp_coeff<3>(dyf*Real(-0.5), yy, cc);
+            for (int irz = 0; irz < rr[2]; ++irz) {
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real byp = by ? by[offset_by(i,j+1,k)+irx+irz*rr[0]] : Real(1.0);
+                Real fac = sb*dycinv*dyfinv*byp/Real(rr[0]*rr[2]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1] + (rr[1]-1)
+                // int kk = k*rr[2] + irz;
+                // -dphi/dy = (phi_interp - phi_fine(jj+1)) / dy_fine
+                //          = (phi_coarse*cc[0] + phi_fine(jj+1)*(cc[1]-1)
+                //                              + phi_fine(jj+2)* cc[2]) / dy_fine
+                // So the entry for fine cell (jj+1) is (cc[1]-1)*fac
+                //                  fine cell (jj+2) is  cc[2]   *fac
+                entry_values[foff+irx      +irz*rr[0]*2] += fac*(cc[1]-Real(1.0));
+                entry_values[foff+irx+rr[0]+irz*rr[0]*2] += fac* cc[2];
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real x = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]);
+                Real z = Real(-0.5) + (irz+Real(0.5))/Real(rr[2]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) {
+                    s0 -= x*x;
+                    stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0));
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0));
+                } else if (!fine_mask(i-1,j,k)) {
+                    s0 += Real(0.5)*x;
+                    stencil(i,j,k)[1] += fac0*Real(-0.5)*x;
+                } else {
+                    s0 += Real(-0.5)*x;
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x;
+                }
+
+                if (!fine_mask(i,j,k-1) && !fine_mask(i,j,k+1)) {
+                    s0 -= z*z;
+                    stencil(i,j,k)[5] += fac0*Real(0.5)*z*(z-Real(1.0));
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z*(z+Real(1.0));
+                } else if (!fine_mask(i,j,k-1)) {
+                    s0 += Real(0.5)*z;
+                    stencil(i,j,k)[5] += fac0*Real(-0.5)*z;
+                } else {
+                    s0 += Real(-0.5)*z;
+                    stencil(i,j,k)[6] += fac0*Real(0.5)*z;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[1]) {
+                    int offset = tooff + (corner[0] ? 1 : 0);
+                    entry_values[offset++] += fac0*Real( 0.25)*x*z;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*z;
+                    if (corner[0]) { offset += 2; }
+                    if (corner[2]) { offset += 4; }
+                    entry_values[offset++] += fac0*Real(-0.25)*x*z;
+                    entry_values[offset  ] += fac0*Real( 0.25)*x*z;
+                }
+            }}
+            foff += 2*rr[0]*rr[2];
+        }
+
+        if (fine_mask(i,j,k+1)) {
+            stencil(i,j,k)[0] += stencil(i,j,k)[6];
+            stencil(i,j,k)[6] = Real(0.0);
+            // Reflux: sb/h^2*bz(i,j,k+1)*(phi(i,j,k)-phi(i,j,k+1)) is replaced by
+            // sb/h*sum_{fine_faces}(-dphi/dz*bz)/n_fine_faces
+            Real dzf = dx[2] / Real(rr[2]);
+            Real dzcinv = Real(1.0) / dx[2];
+            Real dzfinv = Real(1.0) / dzf;
+            Real cc[3];
+            Real zz[3] = {dx[2]*Real(-0.5), dzf*Real(0.5), dzf*Real(1.5)};
+            poly_interp_coeff<3>(dzf*Real(-0.5), zz, cc);
+            for (int iry = 0; iry < rr[1]; ++iry) {
+            for (int irx = 0; irx < rr[0]; ++irx) {
+                Real bzp = bz ? bz[offset_bz(i,j,k+1)+irx+iry*rr[0]] : Real(1.0);
+                Real fac = sb*dzcinv*dzfinv*bzp/Real(rr[0]*rr[1]);
+                // int ii = i*rr[0] + irx
+                // int jj = j*rr[1] + iry
+                // int kk = k*rr[2] + (rr[2]-1)
+                // -dphi/dz = (phi_interp - phi_fine(kk+1))/dz_fine
+                //          = (phi_coarse*cc[0] + phi_fine(kk+1)*(cc[1]-1)
+                //                              + phi_fine(kk+2)* cc[2]) / dz_fine
+                // So the entry for fine cell (kk+1) is (cc[1]-1)*fac
+                //                  fine cell (kk+2) is  cc[2]   *fac
+                entry_values[foff+irx+iry*rr[0]            ] += fac*(cc[1]-Real(1.0));
+                entry_values[foff+irx+iry*rr[0]+rr[0]*rr[1]] += fac* cc[2];
+
+                // The stencil and non-stencil coarse cells need updates too.
+                Real x = Real(-0.5) + (irx+Real(0.5))/Real(rr[0]);
+                Real y = Real(-0.5) + (iry+Real(0.5))/Real(rr[1]);
+                Real fac0 = fac*cc[0];
+                Real s0 = Real(1.0);
+
+                if (!fine_mask(i-1,j,k) && !fine_mask(i+1,j,k)) {
+                    s0 -= x*x;
+                    stencil(i,j,k)[1] += fac0*Real(0.5)*x*(x-Real(1.0));
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x*(x+Real(1.0));
+                } else if (!fine_mask(i-1,j,k)) {
+                    s0 += Real(0.5)*x;
+                    stencil(i,j,k)[1] += fac0*Real(-0.5)*x;
+                } else {
+                    s0 += Real(-0.5)*x;
+                    stencil(i,j,k)[2] += fac0*Real(0.5)*x;
+                }
+
+                if (!fine_mask(i,j-1,k) && !fine_mask(i,j+1,k)) {
+                    s0 -= y*y;
+                    stencil(i,j,k)[3] += fac0*Real(0.5)*y*(y-Real(1.0));
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y*(y+Real(1.0));
+                } else if (!fine_mask(i,j-1,k)) {
+                    s0 += Real(0.5)*y;
+                    stencil(i,j,k)[3] += fac0*Real(-0.5)*y;
+                } else {
+                    s0 += Real(-0.5)*y;
+                    stencil(i,j,k)[4] += fac0*Real(0.5)*y;
+                }
+
+                stencil(i,j,k)[0] += fac0*s0;
+
+                if (corner[2]) {
+                    int offset = tooff + (corner[0] ? 2 : 0) + (corner[1] ? 2 : 0);
+                    entry_values[offset++] += fac0*Real( 0.25)*x*y;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*y;
+                    entry_values[offset++] += fac0*Real(-0.25)*x*y;
+                    entry_values[offset  ] += fac0*Real( 0.25)*x*y;
+                }
+            }}
+            // no need to foff += 2*rr[0]*rr[1];
+        }
+    }
+}
+
+}
+
+#endif
diff --git a/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H
new file mode 100644
index 00000000000..ea38bf5037f
--- /dev/null
+++ b/Src/Extern/HYPRE/AMReX_HypreMLABecLap_K.H
@@ -0,0 +1,298 @@
+#ifndef AMREX_HYPRE_ML_ABECLAP_K_H_
+#define AMREX_HYPRE_ML_ABECLAP_K_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Array4.H>
+#include <AMReX_LO_BCTYPES.H>
+#include <AMReX_LOUtil_K.H>
+#include <AMReX_REAL.H>
+
+#include <HYPRE_utilities.h>
+
+namespace amrex {
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // Fills sten with the stencil of cell (i,j,k), folding box-boundary conditions into sten and bcrhs
+void hypmlabeclap_mat (GpuArray<Real,2*AMREX_SPACEDIM+1>& sten, int i, int j, int k,
+                       Dim3 const& boxlo, Dim3 const& boxhi,
+                       Real sa, Array4<Real const> const& a,
+                       Real sb, GpuArray<Real,AMREX_SPACEDIM> const& dx,
+                       GpuArray<Array4<Real const>, AMREX_SPACEDIM> const& b,
+                       GpuArray<int,AMREX_SPACEDIM*2> const& bctype,
+                       GpuArray<Real,AMREX_SPACEDIM*2> const& bcl,
+                       GpuArray<Array4<int const>, AMREX_SPACEDIM*2> const& bcmsk,
+                       GpuArray<Array4<Real const>, AMREX_SPACEDIM*2> const& bcval,
+                       GpuArray<Array4<Real>, AMREX_SPACEDIM*2> const& bcrhs,
+                       int level)
+{
+    Real bxm = b[0] ? b[0](i  ,j  ,k  ) : Real(1.0); // face coefficients fall back to 1 when b[d] evaluates to false
+    Real bxp = b[0] ? b[0](i+1,j  ,k  ) : Real(1.0);
+    Real bym = b[1] ? b[1](i  ,j  ,k  ) : Real(1.0);
+    Real byp = b[1] ? b[1](i  ,j+1,k  ) : Real(1.0);
+#if (AMREX_SPACEDIM > 2)
+    Real bzm = b[2] ? b[2](i  ,j  ,k  ) : Real(1.0);
+    Real bzp = b[2] ? b[2](i  ,j  ,k+1) : Real(1.0);
+#endif
+    Real ac = a ? a(i,j,k) : Real(0.0); // the a term drops out when a evaluates to false
+
+    sten[1] = -(sb / (dx[0]*dx[0])) * bxm; // off-diagonals: odd index = low face, even index = high face
+    sten[2] = -(sb / (dx[0]*dx[0])) * bxp;
+    sten[3] = -(sb / (dx[1]*dx[1])) * bym;
+    sten[4] = -(sb / (dx[1]*dx[1])) * byp;
+#if (AMREX_SPACEDIM == 2)
+    sten[0] = -(sten[1] + sten[2] + sten[3] + sten[4]) + sa*ac; // diagonal
+#else
+    sten[5] = -(sb / (dx[2]*dx[2])) * bzm;
+    sten[6] = -(sb / (dx[2]*dx[2])) * bzp;
+    sten[0] = -(sten[1] + sten[2] + sten[3] + sten[4] + sten[5] + sten[6]) + sa*ac; // diagonal
+#endif
+
+    // xlo (the other five faces below follow the same pattern)
+    if (i == boxlo.x) {
+        int const cdir = Orientation(Direction::x, Orientation::low);
+        int const bcmk = bcmsk[cdir](i-1,j,k);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3]; // ghost value = cc[0]*bcval + cc[1]*phi(i) + cc[2]*phi(i+1)
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[0]*Real(0.5), dx[0]*Real(1.5)};
+                poly_interp_coeff<3>(dx[0]*Real(-0.5), xx, cc);
+            } else { // Neumann (every non-Dirichlet bctype takes this branch): ghost value = phi(i)
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[0]*dx[0])) * bxm;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i-1,j,k);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[1] = Real(0.0); // the ghost cell is eliminated from the stencil
+            sten[2] -= fac * cc[2];
+        }
+    }
+
+    // xhi
+    if (i == boxhi.x) {
+        int const cdir = Orientation(Direction::x, Orientation::high);
+        int const bcmk = bcmsk[cdir](i+1,j,k);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3];
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[0]*Real(0.5), dx[0]*Real(1.5)};
+                poly_interp_coeff<3>(dx[0]*Real(-0.5), xx, cc);
+            } else { // Neumann
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[0]*dx[0])) * bxp;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i+1,j,k);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[1] -= fac * cc[2];
+            sten[2] = Real(0.0);
+        }
+    }
+
+    // ylo
+    if (j == boxlo.y) {
+        int const cdir = Orientation(Direction::y, Orientation::low);
+        int const bcmk = bcmsk[cdir](i,j-1,k);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3];
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[1]*Real(0.5), dx[1]*Real(1.5)};
+                poly_interp_coeff<3>(dx[1]*Real(-0.5), xx, cc);
+            } else { // Neumann
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[1]*dx[1])) * bym;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i,j-1,k);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[3] = Real(0.0);
+            sten[4] -= fac * cc[2];
+        }
+    }
+
+    // yhi
+    if (j == boxhi.y) {
+        int const cdir = Orientation(Direction::y, Orientation::high);
+        int const bcmk = bcmsk[cdir](i,j+1,k);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3];
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[1]*Real(0.5), dx[1]*Real(1.5)};
+                poly_interp_coeff<3>(dx[1]*Real(-0.5), xx, cc);
+            } else { // Neumann
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[1]*dx[1])) * byp;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i,j+1,k);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[3] -= fac * cc[2];
+            sten[4] = Real(0.0);
+        }
+    }
+
+#if (AMREX_SPACEDIM > 2)
+
+    // zlo
+    if (k == boxlo.z) {
+        int const cdir = Orientation(Direction::z, Orientation::low);
+        int const bcmk = bcmsk[cdir](i,j,k-1);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3];
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[2]*Real(0.5), dx[2]*Real(1.5)};
+                poly_interp_coeff<3>(dx[2]*Real(-0.5), xx, cc);
+            } else { // Neumann
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[2]*dx[2])) * bzm;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i,j,k-1);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[5] = Real(0.0);
+            sten[6] -= fac * cc[2];
+        }
+    }
+
+    // zhi
+    if (k == boxhi.z) {
+        int const cdir = Orientation(Direction::z, Orientation::high);
+        int const bcmk = bcmsk[cdir](i,j,k+1);
+        if (bcmk > 0) {
+            int const bct = bctype[cdir];
+            Real cc[3];
+            if (bct == AMREX_LO_DIRICHLET) {
+                Real xx[3] = {-bcl[cdir], dx[2]*Real(0.5), dx[2]*Real(1.5)};
+                poly_interp_coeff<3>(dx[2]*Real(-0.5), xx, cc);
+            } else { // Neumann
+                cc[0] = Real(0.0);
+                cc[1] = Real(1.0);
+                cc[2] = Real(0.0);
+            }
+            Real fac = (sb / (dx[2]*dx[2])) * bzp;
+            if (bct == AMREX_LO_DIRICHLET && (level == 0 || bcmk == 2)) {
+                // bcmk == 2 means outside the domain.
+                // We need to modify RHS at external Dirichlet boundaries.
+                bcrhs[cdir](i,j,k) = fac * cc[0] * bcval[cdir](i,j,k+1);
+            } else {
+                bcrhs[cdir](i,j,k) = Real(0.0);
+            }
+            sten[0] -= fac * cc[1];
+            sten[5] -= fac * cc[2];
+            sten[6] = Real(0.0);
+        }
+    }
+
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE // Sets rhs1(i,j,k) to rhs0(i,j,k) plus bcrhs on every box face where the outside bcmsk is > 0
+void hypmlabeclap_rhs (int i, int j, int k, Dim3 const& boxlo, Dim3 const& boxhi,
+                       Array4<Real> const& rhs1,
+                       Array4<Real const> const& rhs0,
+                       GpuArray<Array4<int const>, AMREX_SPACEDIM*2> const& bcmsk,
+                       GpuArray<Array4<Real const>, AMREX_SPACEDIM*2> const& bcrhs)
+{
+    rhs1(i,j,k) = rhs0(i,j,k); // start from the unmodified right-hand side
+
+    // xlo: same face test and mask condition as in hypmlabeclap_mat
+    if (i == boxlo.x) {
+        int cdir = Orientation(Direction::x, Orientation::low);
+        if (bcmsk[cdir](i-1,j,k) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k); // presumably the value stored by hypmlabeclap_mat; confirm at the call site
+        }
+    }
+
+    // xhi
+    if (i == boxhi.x) {
+        int cdir = Orientation(Direction::x, Orientation::high);
+        if (bcmsk[cdir](i+1,j,k) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k);
+        }
+    }
+
+    // ylo
+    if (j == boxlo.y) {
+        int cdir = Orientation(Direction::y, Orientation::low);
+        if (bcmsk[cdir](i,j-1,k) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k);
+        }
+    }
+
+    // yhi
+    if (j == boxhi.y) {
+        int cdir = Orientation(Direction::y, Orientation::high);
+        if (bcmsk[cdir](i,j+1,k) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k);
+        }
+    }
+
+#if (AMREX_SPACEDIM > 2)
+
+    // zlo
+    if (k == boxlo.z) {
+        int cdir = Orientation(Direction::z, Orientation::low);
+        if (bcmsk[cdir](i,j,k-1) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k);
+        }
+    }
+
+    // zhi
+    if (k == boxhi.z) {
+        int cdir = Orientation(Direction::z, Orientation::high);
+        if (bcmsk[cdir](i,j,k+1) > 0) {
+            rhs1(i,j,k) += bcrhs[cdir](i,j,k);
+        }
+    }
+
+#endif
+}
+
+}
+
+#if (AMREX_SPACEDIM == 2)
+#include <AMReX_HypreMLABecLap_2D_K.H>
+#else
+#include <AMReX_HypreMLABecLap_3D_K.H>
+#endif
+
+#endif
diff --git a/Src/Extern/HYPRE/CMakeLists.txt b/Src/Extern/HYPRE/CMakeLists.txt
index 9a7df5f8877..d4eabe8548d 100644
--- a/Src/Extern/HYPRE/CMakeLists.txt
+++ b/Src/Extern/HYPRE/CMakeLists.txt
@@ -42,6 +42,10 @@ foreach(D IN LISTS AMReX_SPACEDIM)
           AMReX_HypreIJIface.cpp
           AMReX_HypreIJIface.H
           AMReX_HypreSolver.H
+          AMReX_HypreMLABecLap.H
+          AMReX_HypreMLABecLap.cpp
+          AMReX_HypreMLABecLap_K.H
+          AMReX_HypreMLABecLap_${D}D_K.H
           )
 
     endif ()
diff --git a/Src/Extern/HYPRE/Make.package b/Src/Extern/HYPRE/Make.package
index 15795059ef8..bc14d32d5c7 100644
--- a/Src/Extern/HYPRE/Make.package
+++ b/Src/Extern/HYPRE/Make.package
@@ -1,9 +1,11 @@
 
 ifneq ($(DIM), 1)
 
-CEXE_sources += AMReX_HypreABecLap.cpp AMReX_HypreABecLap2.cpp AMReX_HypreABecLap3.cpp AMReX_Hypre.cpp
+CEXE_sources += AMReX_HypreABecLap.cpp AMReX_HypreABecLap2.cpp AMReX_HypreABecLap3.cpp AMReX_Hypre.cpp AMReX_HypreMLABecLap.cpp
 
-CEXE_headers += AMReX_HypreABecLap.H AMReX_HypreABecLap2.H AMReX_HypreABecLap3.H AMReX_Hypre.H
+CEXE_headers += AMReX_HypreABecLap.H AMReX_HypreABecLap2.H AMReX_HypreABecLap3.H AMReX_Hypre.H AMReX_HypreMLABecLap.H
+
+CEXE_headers += AMReX_HypreMLABecLap_K.H AMReX_HypreMLABecLap_$(DIM)D_K.H
 
 CEXE_headers += AMReX_Habec_$(DIM)D_K.H
 CEXE_headers += AMReX_Habec_K.H
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/GNUmakefile b/Tests/LinearSolvers/ABecLaplacian_C/GNUmakefile
index e26200bd94f..70892ebfa42 100644
--- a/Tests/LinearSolvers/ABecLaplacian_C/GNUmakefile
+++ b/Tests/LinearSolvers/ABecLaplacian_C/GNUmakefile
@@ -6,6 +6,11 @@ USE_OMP  = FALSE
 USE_HYPRE = FALSE
 USE_PETSC = FALSE
 
+USE_HYPRE_SSAMG = FALSE
+ifeq ($(USE_HYPRE_SSAMG),TRUE)
+  DEFINES += -DAMREX_FEATURE_HYPRE_SSAMG
+endif
+
 COMP = gnu
 
 DIM = 3
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H
index 97c928a88da..5834e9dc624 100644
--- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H
+++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H
@@ -32,6 +32,10 @@ private:
     void solveNodeABecLaplacian ();
     void solveABecLaplacianGMRES ();
 
+#ifdef AMREX_USE_HYPRE
+    void solveMLHypre ();
+#endif
+
     int max_level = 1;
     int ref_ratio = 2;
     int n_cell = 128;
@@ -43,6 +47,9 @@ private:
     // 4. NodeABecLaplacian
     int prob_type = 1;
 
+    bool use_mlhypre = false; // use multi-level hypre solver
+    bool use_hypre_ssamg = false; // this option is for mlhypre only
+
     // For MLMG solver
     int verbose = 2;
     int bottom_verbose = 0;
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
index 9999fe7d121..27cd0b7a4b4 100644
--- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
+++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
@@ -8,6 +8,10 @@
 #include <AMReX_ParmParse.H>
 #include <AMReX_MultiFabUtil.H>
 
+#ifdef AMREX_USE_HYPRE
+#include <AMReX_HypreMLABecLap.H>
+#endif
+
 using namespace amrex;
 
 MyTest::MyTest ()
@@ -19,6 +23,13 @@ MyTest::MyTest ()
 void
 MyTest::solve ()
 {
+#ifdef AMREX_USE_HYPRE
+    if (use_mlhypre) {
+        solveMLHypre();
+        return;
+    }
+#endif
+
     if (prob_type == 1) {
         solvePoisson();
     } else if (prob_type == 2) {
@@ -520,6 +531,9 @@ MyTest::readParameters ()
 
     pp.query("composite_solve", composite_solve);
 
+    pp.query("use_mlhypre", use_mlhypre);
+    pp.query("use_hypre_ssamg", use_hypre_ssamg);
+
     pp.query("prob_type", prob_type);
 
     pp.query("verbose", verbose);
@@ -620,3 +634,120 @@ MyTest::initData ()
         amrex::Abort("Unknown prob_type "+std::to_string(prob_type));
     }
 }
+
+#ifdef AMREX_USE_HYPRE
+void
+MyTest::solveMLHypre ()
+{ // Solves prob_type 1 or 2 with HypreMLABecLap, as one composite solve or level by level; aborts on any other prob_type.
+    const auto tol_rel = Real(1.e-10); // relative tolerance passed to every solve() call below
+    const auto tol_abs = Real(0.0); // absolute tolerance passed to every solve() call below
+
+    const auto nlevels = static_cast<int>(geom.size()); // level count taken from the number of entries in geom
+
+#ifdef AMREX_FEATURE_HYPRE_SSAMG
+    auto hypre_solver_id = use_hypre_ssamg ? HypreSolverID::SSAMG // SSAMG needs both this compile-time feature and the runtime flag
+                                           : HypreSolverID::BoomerAMG;
+#else
+    auto hypre_solver_id = HypreSolverID::BoomerAMG; // use_hypre_ssamg has no effect in this build
+#endif // AMREX_FEATURE_HYPRE_SSAMG
+
+    if (prob_type == 1) { // Poisson: scalars (0, -1); the two {} fill the slots that take acoef and face_bcoef in the prob_type 2 branch
+        if (composite_solve) {
+            HypreMLABecLap hypre_mlabeclap(geom, grids, dmap, hypre_solver_id); // a single solver object built over all levels
+            hypre_mlabeclap.setVerbose(verbose);
+
+            hypre_mlabeclap.setup(Real(0.0), Real(-1.0), {}, {},
+                                  {AMREX_D_DECL(LinOpBCType::Dirichlet, // NOTE(review): presumably the low-side BC per dimension - confirm against HypreMLABecLap::setup
+                                                LinOpBCType::Dirichlet,
+                                                LinOpBCType::Dirichlet)},
+                                  {AMREX_D_DECL(LinOpBCType::Dirichlet, // NOTE(review): presumably the high-side BC per dimension - confirm
+                                                LinOpBCType::Dirichlet,
+                                                LinOpBCType::Dirichlet)},
+                                  GetVecOfConstPtrs(solution)); // NOTE(review): solution is an input to setup here; presumably it supplies boundary values - confirm
+
+            hypre_mlabeclap.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs),
+                                  tol_rel, tol_abs);
+        } else {
+            for (int ilev = 0; ilev < nlevels; ++ilev) { // ascending order, so solution[ilev-1] has been solved before it is used as coarse_bc
+                HypreMLABecLap hypre_mlabeclap({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, hypre_solver_id); // fresh single-level solver per level
+                hypre_mlabeclap.setVerbose(verbose);
+
+                std::pair<MultiFab const*, IntVect> coarse_bc{nullptr,IntVect(0)}; // level 0 keeps this null/zero pair
+                if (ilev > 0) {
+                    coarse_bc.first = &solution[ilev-1]; // next-coarser level's solution
+                    coarse_bc.second = IntVect(ref_ratio); // same ratio in every direction
+                }
+
+                hypre_mlabeclap.setup(Real(0.0), Real(-1.0), {}, {},
+                                      {AMREX_D_DECL(LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Dirichlet)},
+                                      {AMREX_D_DECL(LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Dirichlet)},
+                                      {&solution[ilev]},
+                                      coarse_bc);
+
+                hypre_mlabeclap.solve({&solution[ilev]}, {&rhs[ilev]}, tol_rel, tol_abs);
+            }
+        }
+    } else if (prob_type == 2) { // ABecLaplacian: uses ascalar/bscalar, acoef and face-based b coefficients derived from bcoef
+        Vector<Array<MultiFab,AMREX_SPACEDIM>> face_bcoef(nlevels); // per level, one MultiFab per dimension
+        for (int ilev = 0; ilev < nlevels; ++ilev) {
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+                const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(), // bcoef's BoxArray converted with the idim dimension vector
+                                                    IntVect::TheDimensionVector(idim));
+                face_bcoef[ilev][idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0); // NOTE(review): trailing 1, 0 are presumably ncomp and nghost - confirm
+            }
+            amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef[ilev]), // fills face_bcoef[ilev] from bcoef[ilev]
+                                              bcoef[ilev], geom[ilev]);
+        }
+
+        if (composite_solve) {
+            HypreMLABecLap hypre_mlabeclap(geom, grids, dmap, hypre_solver_id); // a single solver object built over all levels
+            hypre_mlabeclap.setVerbose(verbose);
+
+            hypre_mlabeclap.setup(ascalar, bscalar,
+                                  GetVecOfConstPtrs(acoef),
+                                  GetVecOfArrOfConstPtrs(face_bcoef),
+                                  {AMREX_D_DECL(LinOpBCType::Dirichlet, // NOTE(review): presumably the low-side BC per dimension - confirm against HypreMLABecLap::setup
+                                                LinOpBCType::Neumann,
+                                                LinOpBCType::Neumann)},
+                                  {AMREX_D_DECL(LinOpBCType::Neumann, // NOTE(review): presumably the high-side BC per dimension - confirm
+                                                LinOpBCType::Dirichlet,
+                                                LinOpBCType::Neumann)},
+                                  GetVecOfConstPtrs(solution));
+
+            hypre_mlabeclap.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs),
+                                  tol_rel, tol_abs);
+        } else {
+            for (int ilev = 0; ilev < nlevels; ++ilev) { // ascending order, so solution[ilev-1] has been solved before it is used as coarse_bc
+                HypreMLABecLap hypre_mlabeclap({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, hypre_solver_id); // fresh single-level solver per level
+                hypre_mlabeclap.setVerbose(verbose);
+
+                std::pair<MultiFab const*, IntVect> coarse_bc{nullptr,IntVect(0)}; // level 0 keeps this null/zero pair
+                if (ilev > 0) {
+                    coarse_bc.first = &solution[ilev-1]; // next-coarser level's solution
+                    coarse_bc.second = IntVect(ref_ratio); // same ratio in every direction
+                }
+
+                hypre_mlabeclap.setup(ascalar, bscalar,
+                                      {&acoef[ilev]},
+                                      {GetArrOfConstPtrs(face_bcoef[ilev])},
+                                      {AMREX_D_DECL(LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Neumann,
+                                                    LinOpBCType::Neumann)},
+                                      {AMREX_D_DECL(LinOpBCType::Neumann,
+                                                    LinOpBCType::Dirichlet,
+                                                    LinOpBCType::Neumann)},
+                                      {&solution[ilev]},
+                                      coarse_bc);
+
+                hypre_mlabeclap.solve({&solution[ilev]}, {&rhs[ilev]}, tol_rel, tol_abs);
+            }
+        }
+    } else {
+        amrex::Abort("Unsupported prob_type: " + std::to_string(prob_type)); // only prob_type 1 and 2 are handled by this solver path
+    }
+}
+#endif
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/inputs-mlhypre b/Tests/LinearSolvers/ABecLaplacian_C/inputs-mlhypre
new file mode 100644
index 00000000000..70489076387
--- /dev/null
+++ b/Tests/LinearSolvers/ABecLaplacian_C/inputs-mlhypre
@@ -0,0 +1,22 @@
+
+max_level = 1
+ref_ratio = 2
+n_cell = 8
+max_grid_size = 32
+
+linop_maxorder = 3 # mlhypre is hardwired to use maxorder=3
+
+composite_solve = 1
+
+# In this tutorial, we set up two examples.
+prob_type = 1
+#prob_type = 2
+
+amrex.fpe_trap_invalid = 1
+amrex.fpe_trap_overflow = 1
+amrex.fpe_trap_zero = 1
+
+use_mlhypre = 1
+
+verbose = 2
+