IntelPython
diff --git a/‎.pre-commit-config.yaml
+3-3 b/‎.pre-commit-config.yaml
+3-3
diff --git a/‎CMakeLists.txt
+1 b/‎CMakeLists.txt
+1
diff --git a/‎examples/transpose.py
+184 b/‎examples/transpose.py
+184
diff --git a/‎imex_version.txt
+1-1 b/‎imex_version.txt
+1-1
diff --git a/‎setup.py
+5-1 b/‎setup.py
+5-1
diff --git a/‎sharpy/__init__.py
+4 b/‎sharpy/__init__.py
+4
diff --git a/‎sharpy/array_api.py
+1 b/‎sharpy/array_api.py
+1
diff --git a/‎src/EWBinOp.cpp
+1-1 b/‎src/EWBinOp.cpp
+1-1
diff --git a/‎src/EWUnyOp.cpp
+1-1 b/‎src/EWUnyOp.cpp
+1-1
diff --git a/‎src/IEWBinOp.cpp
+1-1 b/‎src/IEWBinOp.cpp
+1-1
@@ -19,13 +19,13 @@ repos:
         -   id: trailing-whitespace
             exclude: '.*\.patch'
 -   repo: https://github.com/psf/black
-    rev: 24.3.0
+    rev: 24.8.0
     hooks:
     - id: black
       args: ["--line-length", "80"]
       language_version: python3
 -   repo: https://github.com/PyCQA/bandit
-    rev: '1.7.8'
+    rev: '1.7.9'
     hooks:
     -   id: bandit
         args: ["-c", ".bandit.yml"]
@@ -35,7 +35,7 @@ repos:
     -   id: isort
         name: isort (python)
 -   repo: https://github.com/pycqa/flake8
-    rev: 7.0.0
+    rev: 7.1.1
     hooks:
     -   id: flake8
 -   repo: https://github.com/pocc/pre-commit-hooks
 
@@ -149,6 +149,7 @@ include_directories(
   ${PROJECT_SOURCE_DIR}/third_party/bitsery/include
   ${MPI_INCLUDE_PATH}
   ${pybind11_INCLUDE_DIRS}
+  ${LLVM_INCLUDE_DIRS}
   ${MLIR_INCLUDE_DIRS}
   ${IMEX_INCLUDE_DIRS})
 
 
@@ -0,0 +1,184 @@
+"""
+Transpose benchmark
+
+    Matrix transpose benchmark for sharpy and numpy backends.
+
+Examples:
+
+    # Run 1000 iterations of 1000*1000 matrix on sharpy backend
+    python transpose.py -r 1000 -c 1000 -b sharpy -i 1000
+
+    # MPI parallel run
+    mpiexec -n 3 python transpose.py -r 1000 -c 1000 -b sharpy -i 1000
+
+"""
+
+import argparse
+import time as time_mod
+
+import numpy
+
+import sharpy
+
+try:
+    import mpi4py
+
+    mpi4py.rc.finalize = False
+    from mpi4py import MPI
+
+    comm_rank = MPI.COMM_WORLD.Get_rank()
+    comm = MPI.COMM_WORLD
+except ImportError:
+    comm_rank = 0
+    comm = None
+
+
+def info(s):
+    if comm_rank == 0:
+        print(s)
+
+
+def sp_transpose(arr):
+    brr = sharpy.permute_dims(arr, [1, 0])
+    return brr
+
+
+def np_transpose(arr):
+    brr = arr.transpose()
+    return brr.copy()
+
+
+def initialize(np, row, col, dtype):
+    arr = np.arange(0, row * col, 1, dtype=dtype)
+    return np.reshape(arr, (row, col))
+
+
+def run(row, col, backend, iterations, datatype):
+    if backend == "sharpy":
+        import sharpy as np
+        from sharpy import fini, init, sync
+
+        transpose = sp_transpose
+
+        init(False)
+    elif backend == "numpy":
+        import numpy as np
+
+        if comm is not None:
+            assert (
+                comm.Get_size() == 1
+            ), "Numpy backend only supports serial execution."
+
+        fini = sync = lambda x=None: None
+        transpose = np_transpose
+    else:
+        raise ValueError(f'Unknown backend: "{backend}"')
+
+    dtype = {
+        "f32": np.float32,
+        "f64": np.float64,
+    }[datatype]
+
+    info(f"Using backend: {backend}")
+    info(f"Number of row: {row}")
+    info(f"Number of column: {col}")
+    info(f"Datatype: {datatype}")
+
+    arr = initialize(np, row, col, dtype)
+    sync()
+
+    # verify
+    if backend == "sharpy":
+        brr = sp_transpose(arr)
+        crr = np_transpose(sharpy.to_numpy(arr))
+        assert numpy.allclose(sharpy.to_numpy(brr), crr)
+
+    def eval():
+        tic = time_mod.perf_counter()
+        transpose(arr)
+        sync()
+        toc = time_mod.perf_counter()
+        return toc - tic
+
+    # warm-up run
+    t_warm = eval()
+
+    # evaluate
+    info(f"Running {iterations} iterations")
+    time_list = []
+    for i in range(iterations):
+        time_list.append(eval())
+
+    # get max time over mpi ranks
+    if comm is not None:
+        t_warm = comm.allreduce(t_warm, MPI.MAX)
+        time_list = comm.allreduce(time_list, MPI.MAX)
+
+    t_min = numpy.min(time_list)
+    t_max = numpy.max(time_list)
+    t_med = numpy.median(time_list)
+    init_overhead = t_warm - t_med
+    if backend == "sharpy":
+        info(f"Estimated initialization overhead: {init_overhead:.5f} s")
+    info(f"Min.   duration: {t_min:.5f} s")
+    info(f"Max.   duration: {t_max:.5f} s")
+    info(f"Median duration: {t_med:.5f} s")
+
+    fini()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Run transpose benchmark",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "-r",
+        "--row",
+        type=int,
+        default=10000,
+        help="Number of row.",
+    )
+    parser.add_argument(
+        "-c",
+        "--column",
+        type=int,
+        default=10000,
+        help="Number of column.",
+    )
+
+    parser.add_argument(
+        "-b",
+        "--backend",
+        type=str,
+        default="sharpy",
+        choices=["sharpy", "numpy"],
+        help="Backend to use.",
+    )
+
+    parser.add_argument(
+        "-i",
+        "--iterations",
+        type=int,
+        default=10,
+        help="Number of iterations to run.",
+    )
+
+    parser.add_argument(
+        "-d",
+        "--datatype",
+        type=str,
+        default="f64",
+        choices=["f32", "f64"],
+        help="Datatype for model state variables",
+    )
+
+    args = parser.parse_args()
+    run(
+        args.row,
+        args.column,
+        args.backend,
+        args.iterations,
+        args.datatype,
+    )
@@ -1 +1 @@
-5a7bb80ede5fe4fa8d56ee0dd77c4e5c1327fe09
+8ae485bbfb1303a414b375e25130fcaa4c02127a
@@ -1,3 +1,4 @@
+import multiprocessing
 import os
 import pathlib
 
@@ -44,7 +45,10 @@ def build_cmake(self, ext):
         os.chdir(str(build_temp))
         self.spawn(["cmake", str(cwd)] + cmake_args)
         if not self.dry_run:
-            self.spawn(["cmake", "--build", ".", "-j5"] + build_args)
+            self.spawn(
+                ["cmake", "--build", ".", f"-j{multiprocessing.cpu_count()}"]
+                + build_args
+            )
         # Troubleshooting: if fail on line above then delete all possible
         # temporary CMake files including "CMakeCache.txt" in top level dir.
         os.chdir(str(cwd))
 
@@ -130,6 +130,10 @@ def _validate_device(device):
         exec(
             f"{func} = lambda this, shape, cp=None: ndarray(_csp.ManipOp.reshape(this._t, shape, cp))"
         )
+    elif func == "permute_dims":
+        exec(
+            f"{func} = lambda this, axes: ndarray(_csp.ManipOp.permute_dims(this._t, axes))"
+        )
 
 for func in api.api_categories["ReduceOp"]:
     FUNC = func.upper()
 
@@ -179,6 +179,7 @@
             "roll",  # (x, /, shift, *, axis=None)
             "squeeze",  # (x, /, axis)
             "stack",  # (arrays, /, *, axis=0)
+            "permute_dims",  # (x: array, /, axes: Tuple[int, ...]) → array
         ],
         "LinAlgOp": [
             "matmul",  # (x1, x2, /)
 
@@ -120,7 +120,7 @@ struct DeferredEWBinOp : public Deferred {
     auto av = dm.getDependent(builder, Registry::get(_a));
     auto bv = dm.getDependent(builder, Registry::get(_b));
 
-    auto aTyp = av.getType().cast<::imex::ndarray::NDArrayType>();
+    auto aTyp = ::mlir::cast<::imex::ndarray::NDArrayType>(av.getType());
     auto outElemType =
         ::imex::ndarray::toMLIR(builder, SHARPY::jit::getPTDType(_dtype));
     auto outTyp = aTyp.cloneWith(shape(), outElemType);
 
@@ -105,7 +105,7 @@ struct DeferredEWUnyOp : public Deferred {
                      jit::DepManager &dm) override {
     auto av = dm.getDependent(builder, Registry::get(_a));
 
-    auto aTyp = av.getType().cast<::imex::ndarray::NDArrayType>();
+    auto aTyp = ::mlir::cast<::imex::ndarray::NDArrayType>(av.getType());
     auto outTyp = aTyp.cloneWith(shape(), aTyp.getElementType());
 
     auto ndOpId = sharpy(_op);
 
@@ -71,7 +71,7 @@ struct DeferredIEWBinOp : public Deferred {
     auto av = dm.getDependent(builder, Registry::get(_a));
     auto bv = dm.getDependent(builder, Registry::get(_b));
 
-    auto aTyp = av.getType().cast<::imex::ndarray::NDArrayType>();
+    auto aTyp = ::mlir::cast<::imex::ndarray::NDArrayType>(av.getType());
     auto outTyp = aTyp.cloneWith(shape(), aTyp.getElementType());
 
     auto binop = builder.create<::imex::ndarray::EWBinOp>(
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-5a7bb80ede5fe4fa8d56ee0dd77c4e5c1327fe09`
	`1`	`+8ae485bbfb1303a414b375e25130fcaa4c02127a`