diff --git a/CMakeLists.txt b/CMakeLists.txt index a7eaff700..98b18fbc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,12 +154,32 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) endif() find_program(LLD_PROGRAM lld) -if(LLD_PROGRAM) +if(LLD_PROGRAM AND NOT APPLE) message(STATUS "Using lld") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fuse-ld=lld") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=lld") endif() +# On macOS, LLVM/MLIR's cmake config creates partial zstd IMPORTED targets +# (shared + static) but not the zstd::libzstd alias. Homebrew's zstd cmake +# config then fails when Arrow tries to re-find zstd because some targets +# already exist. Work around by telling Arrow that zstd is already found. +# Also fix the INTERFACE_INCLUDE_DIRECTORIES: LLVM's Findzstd.cmake sets it +# to /opt/homebrew/include (all of Homebrew), which pulls in wrong versions +# of headers (e.g. fmt 11.2 vs our thirdparty fmt 11.0). Narrow it to the +# zstd-specific include path. +if(APPLE AND TARGET zstd::libzstd_shared) + set(zstdAlt_FOUND TRUE) + find_path(_ZSTD_SPECIFIC_INCLUDE zstd.h PATHS /opt/homebrew/opt/zstd/include NO_DEFAULT_PATH) + if(_ZSTD_SPECIFIC_INCLUDE) + set_target_properties(zstd::libzstd_shared PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_ZSTD_SPECIFIC_INCLUDE}") + if(TARGET zstd::libzstd_static) + set_target_properties(zstd::libzstd_static PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_ZSTD_SPECIFIC_INCLUDE}") + endif() + endif() +endif() find_package(Arrow REQUIRED) find_package(Parquet REQUIRED) @@ -205,7 +225,7 @@ if(USE_HDFS) endif() include_directories(${PROJECT_SOURCE_DIR} ${HDFS_LIB_LOCATION}) - find_library(hdfs3 NAMES libhdfs3.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) + find_library(hdfs3 NAMES hdfs3 HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) message(STATUS "HDFS_LIB_FOUND: ${HDFS_LIB_FOUND}") if(DEFINED HDFS_LIB_FOUND) add_definitions(-DUSE_HDFS) diff --git a/build.sh b/build.sh index d9596ba92..38e6683f9 100755 --- a/build.sh +++ 
b/build.sh @@ -26,6 +26,32 @@ # Stop immediately if any command fails. set -e +#****************************************************************************** +# Platform detection and portable helpers +#****************************************************************************** +IS_DARWIN=0 +if [ "$(uname -s)" == "Darwin" ]; then + IS_DARWIN=1 +fi + +# Portable replacement for nproc (not available on macOS) +get_nproc() { + if [ "$IS_DARWIN" == "1" ]; then + sysctl -n hw.ncpu + else + nproc + fi +} + +# Portable replacement for sed -i (different syntax on macOS) +sed_inplace() { + if [ "$IS_DARWIN" == "1" ]; then + sed -i '' "$@" + else + sed -i "$@" + fi +} + build_ts_begin=$(date +%s%N) #****************************************************************************** @@ -454,6 +480,10 @@ BUILD_MPI="-DUSE_MPI=OFF" BUILD_HDFS="-DUSE_HDFS=OFF" BUILD_IO_URING="-DUSE_IO_URING=OFF" BUILD_PAPI="-DUSE_PAPI=ON" +# Default PAPI to OFF on macOS (not well supported) +if [ "$IS_DARWIN" == "1" ]; then + BUILD_PAPI="-DUSE_PAPI=OFF" +fi WITH_DEPS=1 WITH_SUBMODULE_UPDATE=1 @@ -572,12 +602,26 @@ fi #****************************************************************************** # #8 Download and install third-party dependencies if requested (default is yes, omit with --no-deps)) #****************************************************************************** +# On macOS, third-party dependencies and DAPHNE itself are all built with GCC-15 +# (exported below) for ABI compatibility; boringssl is patched for GCC further down. + if [ $WITH_DEPS -gt 0 ]; then + # CMake 4.x removed compatibility with cmake_minimum_required < 3.5. + # Many third-party deps use older versions; this global setting allows them to configure. + export CMAKE_POLICY_VERSION_MINIMUM=3.5 + + # On macOS, use GCC-15 for all builds (deps + DAPHNE) for ABI compatibility. + # The boringssl patches below handle GCC-specific issues. 
+ if [ "$IS_DARWIN" == "1" ]; then + export CC=gcc-15 + export CXX=g++-15 + fi + LLVM_ARCH=X86 # optimizes for multiple x86_64 architectures PAPI_OBLAS_ARCH=NEHALEM # Determine CPU architecture to compile for - if [ $(arch) == 'armv*' ] || [ $(arch) == 'aarch64' ]; then + if [ $(arch) == 'armv*' ] || [ $(arch) == 'aarch64' ] || [ $(arch) == 'arm64' ]; then echo "Building for ARMv8 architecture" LLVM_ARCH=AArch64 PAPI_OBLAS_ARCH=ARMV8 @@ -625,7 +669,7 @@ if [ $WITH_DEPS -gt 0 ]; then --with-components="coretemp infiniband io lustre net powercap rapl sde stealtime" \ - CFLAGS="-fPIC -DPIC" make -j"$(nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" + CFLAGS="-fPIC -DPIC" make -j"$(get_nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" make install cd - > /dev/null dependency_install_success "papi_v${papiVersion}" @@ -650,7 +694,7 @@ if [ $WITH_DEPS -gt 0 ]; then if ! is_dependency_installed "hwloc_v${hwlocVersion}"; then cd "$sourcePrefix/$hwlocDirName/" ./configure --prefix="$hwlocInstDirName" - make -j"$(nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" + make -j"$(get_nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" make install cd - > /dev/null dependency_install_success "hwloc_v${hwlocVersion}" @@ -691,7 +735,11 @@ if [ $WITH_DEPS -gt 0 ]; then -d "$sourcePrefix/$antlrCppRuntimeDirName" # Github disabled the unauthenticated git:// protocol, patch antlr4 to use https:// # until we upgrade to antlr4-4.9.3+ - sed -i 's#git://github.com#https://github.com#' "$sourcePrefix/$antlrCppRuntimeDirName/runtime/CMakeLists.txt" + sed_inplace 's#git://github.com#https://github.com#' "$sourcePrefix/$antlrCppRuntimeDirName/runtime/CMakeLists.txt" + + # CMake 4.x no longer supports setting deprecated policies to OLD. Remove them. 
+ sed_inplace '/CMAKE_POLICY(SET CMP00[2-5][0-9] OLD)/d' \ + "$sourcePrefix/$antlrCppRuntimeDirName/CMakeLists.txt" daphne_msg "Build Antlr v${antlrVersion}" cmake -S "$sourcePrefix/$antlrCppRuntimeDirName" -B "${buildPrefix}/${antlrCppRuntimeDirName}" \ @@ -753,7 +801,15 @@ if [ $WITH_DEPS -gt 0 ]; then if ! is_dependency_installed "${dep_openBlas[@]}"; then cd "$sourcePrefix/$openBlasDirName" make clean - make -j"$(nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" + # GCC 15+ treats -Wincompatible-pointer-types as error; OpenBLAS 0.3.23 triggers it + # macOS doesn't support DYNAMIC_ARCH (clang assembler lacks many -mtune targets) + if [ "$IS_DARWIN" == "1" ]; then + make -j"$(get_nproc)" TARGET="$PAPI_OBLAS_ARCH" \ + CFLAGS="-Wno-error=incompatible-pointer-types" \ + COMMON_OPT="-Wno-error=incompatible-pointer-types" + else + make -j"$(get_nproc)" DYNAMIC_ARCH=1 TARGET="$PAPI_OBLAS_ARCH" + fi make PREFIX="$openBlasInstDirName" install cd - >/dev/null dependency_install_success "${dep_openBlas[@]}" @@ -782,7 +838,7 @@ if [ $WITH_DEPS -gt 0 ]; then # abseil (compiled separately to apply a patch) #------------------------------------------------------------------------------ abslPath=$sourcePrefix/abseil-cpp - if [ $(arch) == 'armv64' ] || [ $(arch) == 'aarch64' ]; then + if [ $(arch) == 'armv64' ] || [ $(arch) == 'aarch64' ] || [ $(arch) == 'arm64' ]; then abslVersion=20211102.0 fi dep_absl=("absl_v${abslVersion}" "v1") @@ -791,7 +847,7 @@ if [ $WITH_DEPS -gt 0 ]; then daphne_msg "Get abseil version ${abslVersion}" rm -rf "$abslPath" git clone --depth 1 --branch "$abslVersion" https://github.com/abseil/abseil-cpp.git "$abslPath" - if [ $(arch) == 'armv*' ] || [ $(arch) == 'aarch64' ]; then + if [ $(arch) == 'armv*' ] || [ $(arch) == 'aarch64' ] || [ $(arch) == 'arm64' ]; then daphne_msg "Applying 0002-absl-stdmax-params.patch" patch -Np1 -i "${patchDir}/0002-absl-stdmax-params.patch" -d "$abslPath" fi @@ -809,26 +865,28 @@ if [ $WITH_DEPS -gt 0 ]; then 
#------------------------------------------------------------------------------ # MPI (Default is MPI library is OpenMPI but cut can be any) #------------------------------------------------------------------------------ - MPIZipName=openmpi-$openMPIVersion.tar.gz - MPIInstDirName=$installPrefix - dep_mpi=("openmpi_v${openMPIVersion}" "v1") - - if ! is_dependency_downloaded "${dep_mpi[@]}"; then - daphne_msg "Get openmpi version ${openMPIVersion}" - wget "https://download.open-mpi.org/release/open-mpi/v4.1/$MPIZipName" -qO "${cacheDir}/${MPIZipName}" - tar -xf "$cacheDir/$MPIZipName" --directory "$sourcePrefix" - dependency_download_success "${dep_mpi[@]}" - mkdir -p "$MPIInstDirName" - fi - if ! is_dependency_installed "${dep_mpi[@]}"; then - cd "$sourcePrefix/openmpi-$openMPIVersion" - ./configure --prefix="$MPIInstDirName" - make -j"$(nproc)" all - make install - cd - - dependency_install_success "${dep_mpi[@]}" - else - daphne_msg "No need to build OpenMPI again" + if [ "$BUILD_MPI" == "-DUSE_MPI=ON" ]; then + MPIZipName=openmpi-$openMPIVersion.tar.gz + MPIInstDirName=$installPrefix + dep_mpi=("openmpi_v${openMPIVersion}" "v1") + + if ! is_dependency_downloaded "${dep_mpi[@]}"; then + daphne_msg "Get openmpi version ${openMPIVersion}" + wget "https://download.open-mpi.org/release/open-mpi/v4.1/$MPIZipName" -qO "${cacheDir}/${MPIZipName}" + tar -xf "$cacheDir/$MPIZipName" --directory "$sourcePrefix" + dependency_download_success "${dep_mpi[@]}" + mkdir -p "$MPIInstDirName" + fi + if ! is_dependency_installed "${dep_mpi[@]}"; then + cd "$sourcePrefix/openmpi-$openMPIVersion" + ./configure --prefix="$MPIInstDirName" + make -j"$(get_nproc)" all + make install + cd - + dependency_install_success "${dep_mpi[@]}" + else + daphne_msg "No need to build OpenMPI again" + fi fi #------------------------------------------------------------------------------ # gRPC @@ -855,8 +913,21 @@ if [ $WITH_DEPS -gt 0 ]; then dependency_download_success "${dep_grpc[@]}" fi if ! 
is_dependency_installed "${dep_grpc[@]}"; then + # Patch boringssl for GCC compatibility on macOS: + # 1) __builtin_available() is a Clang-only builtin; on macOS 10.12+ + # getentropy() is always available so the check can be removed. + # 2) -stdlib=libc++ is a Clang-only flag; GCC uses libstdc++ natively. + if [ "$IS_DARWIN" == "1" ]; then + sed_inplace 's/if (__builtin_available(macos 10\.12, \*))/if (1)/g' \ + "$sourcePrefix/$grpcDirName/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rand/urandom.c" + sed_inplace '/stdlib=libc++/d' \ + "$sourcePrefix/$grpcDirName/third_party/boringssl-with-bazel/CMakeLists.txt" + sed_inplace '/stdlib=libc++/d' \ + "$sourcePrefix/$grpcDirName/third_party/boringssl-with-bazel/src/CMakeLists.txt" + fi cmake -G Ninja -S "$sourcePrefix/$grpcDirName" -B "$buildPrefix/$grpcDirName" \ -DCMAKE_INSTALL_PREFIX="$grpcInstDir" \ + -DCMAKE_PREFIX_PATH="$installPrefix" \ -DCMAKE_BUILD_TYPE=Release \ -DgRPC_INSTALL=ON \ -DgRPC_BUILD_TESTS=OFF \ @@ -886,8 +957,8 @@ if [ $WITH_DEPS -gt 0 ]; then fi # this works around a build error that occurs on Ubuntu with Boost installed - if [ $(lsb_release -is) == "Ubuntu" ]; then - if [ $(dpkg -l | grep libboost | wc -l) == "" ]; then + if [ "$IS_DARWIN" != "1" ] && [ "$(lsb_release -is 2>/dev/null)" == "Ubuntu" ]; then + if [ "$(dpkg -l | grep libboost | wc -l)" == "" ]; then daphne_msg "Setting BOOST_ROOT=/usr on Ubuntu Linux with libboost installed" sleep 5 export BOOST_ROOT=/usr @@ -895,6 +966,14 @@ if [ $WITH_DEPS -gt 0 ]; then fi if ! is_dependency_installed "${dep_arrow[@]}"; then + # GCC-15 is stricter about missing includes; Arrow 13 uses std::find + # without #include in filesystem/util_internal.cc. + if [ "$IS_DARWIN" == "1" ]; then + arrow_file="${sourcePrefix}/${arrowDirName}/cpp/src/arrow/filesystem/util_internal.cc" + if ! 
grep -q '#include ' "$arrow_file" 2>/dev/null; then + sed_inplace '1s/^/#include \n/' "$arrow_file" + fi + fi cmake -G Ninja -S "${sourcePrefix}/${arrowDirName}/cpp" -B "${buildPrefix}/${arrowDirName}" \ -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_PARQUET=ON \ -DARROW_WITH_BROTLI=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON \ @@ -940,7 +1019,8 @@ if [ $WITH_DEPS -gt 0 ]; then fi if ! is_dependency_installed "spdlog_v${spdlogVersion}"; then cmake -G Ninja -S "${sourcePrefix}/${spdlogDirName}" -B "${buildPrefix}/${spdlogDirName}" \ - -DSPDLOG_FMT_EXTERNAL=ON -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DSPDLOG_FMT_EXTERNAL=ON -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DCMAKE_PREFIX_PATH="${installPrefix}" \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON cmake --build "${buildPrefix}/${spdlogDirName}" --target install/strip dependency_install_success "spdlog_v${spdlogVersion}" else @@ -1036,13 +1116,23 @@ if [ $WITH_DEPS -gt 0 ]; then daphne_msg "Building LLVM/MLIR from ${llvmCommit}" cd "${thirdpartyPath}/${llvmName}" echo "Need to build MLIR/LLVM." + # On macOS, use GCC-15 for LLVM/MLIR build to ensure ABI compatibility + # with the rest of the project (all using libstdc++). LLD is disabled + # because it is a Clang/LLVM-specific linker. 
+ if [ "$IS_DARWIN" == "1" ]; then + LLVM_COMPILER_FLAGS="-DCMAKE_C_COMPILER=gcc-15 -DCMAKE_CXX_COMPILER=g++-15" + LLVM_LLD_FLAG="" + else + LLVM_COMPILER_FLAGS="-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++" + LLVM_LLD_FLAG="-DLLVM_ENABLE_LLD=ON" + fi cmake -G Ninja -S llvm -B "$buildPrefix/$llvmName" \ -DLLVM_ENABLE_PROJECTS=mlir \ -DLLVM_BUILD_EXAMPLES=OFF \ -DLLVM_TARGETS_TO_BUILD="$LLVM_ARCH" \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=ON \ - -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLLVM_ENABLE_LLD=ON \ + $LLVM_COMPILER_FLAGS $LLVM_LLD_FLAG \ -DLLVM_ENABLE_RTTI=ON \ -DCMAKE_INSTALL_PREFIX="$installPrefix" cmake --build "$buildPrefix/$llvmName" --target check-mlir @@ -1074,7 +1164,7 @@ if [ $WITH_DEPS -gt 0 ]; then if ! is_dependency_installed "liburing_v${liburingVersion}"; then cd "$sourcePrefix/$liburingDirName" ./configure --cc="$liburing_cc" --cxx="$liburing_cxx" --prefix="$liburingInstDirName" - make -j"$(nproc)" + make -j"$(get_nproc)" cp ./src/liburing.a "$installPrefix/lib/" cp -r ./src/include/* "$installPrefix/include" cd - > /dev/null @@ -1107,9 +1197,15 @@ fi daphne_msg "Build Daphne" +DAPHNE_CMAKE_EXTRA="" +if [ "$IS_DARWIN" == "1" ]; then + # CC/CXX are exported only inside the dependency step above (skipped by --no-deps), so pass GCC-15 to cmake explicitly. 
+ DAPHNE_CMAKE_EXTRA="-DCMAKE_C_COMPILER=gcc-15 -DCMAKE_CXX_COMPILER=g++-15" +fi + cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja -DANTLR_VERSION="$antlrVersion" \ -DCMAKE_PREFIX_PATH="$installPrefix" \ - $BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG $BUILD_MPI $BUILD_HDFS $BUILD_PAPI + $BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG $BUILD_MPI $BUILD_HDFS $BUILD_PAPI $DAPHNE_CMAKE_EXTRA cmake --build "$daphneBuildDir" --target "$target" diff --git a/doc/development/BuildingDaphne.md b/doc/development/BuildingDaphne.md index 5722410ad..adb76e6db 100644 --- a/doc/development/BuildingDaphne.md +++ b/doc/development/BuildingDaphne.md @@ -132,6 +132,37 @@ All possible options for the build script: --- +## Building on macOS (Experimental) + +DAPHNE can be built natively on macOS with Apple Silicon (ARM64). This support is experimental. + +### Prerequisites + +Install [Homebrew](https://brew.sh/), then install the required packages: + +```bash +brew install gcc@15 cmake ninja wget zstd libomp +``` + +GCC is required because DAPHNE and all its dependencies must be built with the +same C++ standard library (libstdc++). Apple Clang uses libc++, which is +ABI-incompatible and causes link failures. + +### Building + +```bash +./build.sh --no-papi +``` + +The build script automatically detects macOS and configures GCC-15 as the compiler. PAPI is not supported on macOS; the build script already turns it off by default there, so `--no-papi` is optional and merely makes this explicit. + +### Known Limitations + +- CPU pinning (thread affinity) is disabled on macOS +- PAPI-based profiling is not available +- CUDA and FPGA options are not supported on macOS +- Only Apple Silicon (ARM64) has been tested + ## Building on WSL When using Windows Subsystems for Linux (WSL), the default memory limit for WSL is 50% of the total memory of the underlying Windows host. This can lead to build fails due to SIGKILL for DAPHNE builds. [Advanced settings configuration in WSL](https://learn.microsoft.com/en-us/windows/wsl/wsl-config) describes how the memory limit can be configured. 
diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index 4b7e326e2..d09ee0889 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -24,6 +24,11 @@ #include class DaphneLogger; +#ifdef __APPLE__ +#include +#include +#endif + #include #include #include @@ -148,8 +153,14 @@ struct DaphneUserConfig { void resolveLibDir() { const std::string exedirPlaceholder = "{exedir}/"; if (libdir.substr(0, exedirPlaceholder.size()) == exedirPlaceholder) { - // This next line adds to our Linux platform lock-in. +#ifdef __APPLE__ + char pathBuf[PATH_MAX]; + uint32_t size = sizeof(pathBuf); + _NSGetExecutablePath(pathBuf, &size); + std::filesystem::path daphneExeDir(std::filesystem::canonical(pathBuf).parent_path()); +#else std::filesystem::path daphneExeDir(std::filesystem::canonical("/proc/self/exe").parent_path()); +#endif libdir = daphneExeDir / libdir.substr(exedirPlaceholder.size()); } } diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index d55f06259..f6a0fd0b1 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -37,7 +37,7 @@ add_mlir_dialect_library(MLIRDaphneTransforms LINK_COMPONENTS Core ) -find_library(HWLOC_LIB NAMES libhwloc.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) +find_library(HWLOC_LIB NAMES hwloc HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) target_link_libraries(MLIRDaphneTransforms PUBLIC CompilerUtils @@ -52,6 +52,7 @@ target_link_libraries(MLIRDaphneTransforms PUBLIC MLIRFuncTransforms SQLParser ${HWLOC_LIB} + fmt::fmt ) diff --git a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp index 292a0e30a..fb38e6583 100644 --- a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp +++ b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp @@ -498,8 +498,8 @@ class DistributedPipelineKernelReplacement : public OpConversionPattern(loc, cvpInputs, 
op.getInputs()[i], rewriter.getI64IntegerAttr(i)); // Constants for #inputs. - auto coNumInputs = rewriter.create(loc, numInputs); - [[maybe_unused]] auto coNumOutputs = rewriter.create(loc, numOutputs); + auto coNumInputs = rewriter.create(loc, static_cast(numInputs)); + [[maybe_unused]] auto coNumOutputs = rewriter.create(loc, static_cast(numOutputs)); // Variadic pack for out_rows. auto cvpOutRows = rewriter.create(loc, vptSize, rewriter.getI64IntegerAttr(numOutputs)); diff --git a/src/compiler/utils/CompilerUtils.cpp b/src/compiler/utils/CompilerUtils.cpp index 662f8bdd0..f863f4735 100644 --- a/src/compiler/utils/CompilerUtils.cpp +++ b/src/compiler/utils/CompilerUtils.cpp @@ -93,6 +93,20 @@ template <> std::pair CompilerUtils::isConstant(mlir::Value v) return isConstantHelper(v, [](mlir::BoolAttr attr) { return attr.getValue(); }); } +#if defined(__APPLE__) && defined(__aarch64__) +// On macOS ARM64, long/unsigned long are distinct types from int64_t/uint64_t +// (which are long long/unsigned long long there), so we need separate specializations. 
+template <> std::pair CompilerUtils::isConstant(mlir::Value v) { + return isConstantHelper( + v, [](mlir::IntegerAttr attr) { return static_cast(attr.getValue().getLimitedValue()); }); +} + +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { + return isConstantHelper( + v, [](mlir::IntegerAttr attr) { return static_cast(attr.getValue().getLimitedValue()); }); +} +#endif + // ************************************************************************************************** // Specializations of constantOrThrow for various types // ************************************************************************************************** @@ -160,6 +174,18 @@ template <> bool CompilerUtils::constantOrDefault(mlir::Value v, bool d) { return constantOrDefaultHelper(v, d, [](mlir::BoolAttr attr) { return attr.getValue(); }); } +#if defined(__APPLE__) && defined(__aarch64__) +template <> long CompilerUtils::constantOrDefault(mlir::Value v, long d) { + return constantOrDefaultHelper( + v, d, [](mlir::IntegerAttr attr) { return static_cast(attr.getValue().getLimitedValue()); }); +} + +template <> unsigned long CompilerUtils::constantOrDefault(mlir::Value v, unsigned long d) { + return constantOrDefaultHelper( + v, d, [](mlir::IntegerAttr attr) { return static_cast(attr.getValue().getLimitedValue()); }); +} +#endif + // ************************************************************************************************** // Other // ************************************************************************************************** diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp index 1c3a9c024..23e278262 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp @@ -300,7 +300,7 @@ template std::vector getVectorSplits_AllAg } template std::vector> createOpsOutputSizes_AllAggOp(AllAggOp *op, OpBuilder &builder) { - auto cst1 = 
builder.create(op->getLoc(), size_t(1)); + auto cst1 = builder.create(op->getLoc(), static_cast(1)); return {{cst1, cst1}}; } diff --git a/src/parser/daphnedsl/DaphneDSLVisitor.cpp b/src/parser/daphnedsl/DaphneDSLVisitor.cpp index af56baf5d..ac4402274 100644 --- a/src/parser/daphnedsl/DaphneDSLVisitor.cpp +++ b/src/parser/daphnedsl/DaphneDSLVisitor.cpp @@ -1652,8 +1652,8 @@ antlrcpp::Any DaphneDSLVisitor::visitMatrixLiteralExpr(DaphneDSLGrammarParser::M // Missing dimensions are inferred (defaults to column matrix). if (!ctx->rows && !ctx->cols) { numMatElems = ctx->expr().size(); - cols = builder.create(loc, static_cast(1)); - rows = builder.create(loc, static_cast(ctx->expr().size())); + cols = builder.create(loc, static_cast(1)); + rows = builder.create(loc, static_cast(ctx->expr().size())); } else { numMatElems = (ctx->rows && ctx->cols) ? ctx->expr().size() - 2 : ctx->expr().size() - 1; if (ctx->cols && ctx->rows) { @@ -1663,12 +1663,12 @@ antlrcpp::Any DaphneDSLVisitor::visitMatrixLiteralExpr(DaphneDSLGrammarParser::M cols = valueOrErrorOnVisit(ctx->cols); rows = builder.create(loc, builder.getIntegerType(64, false), - builder.create(loc, numMatElems), cols); + builder.create(loc, static_cast(numMatElems)), cols); } else { rows = valueOrErrorOnVisit(ctx->rows); cols = builder.create(loc, builder.getIntegerType(64, false), - builder.create(loc, numMatElems), rows); + builder.create(loc, static_cast(numMatElems)), rows); } } cols = utils.castSizeIf(cols); @@ -1939,17 +1939,17 @@ antlrcpp::Any DaphneDSLVisitor::visitLiteral(DaphneDSLGrammarParser::LiteralCont litStr = std::regex_replace(litStr, std::regex("_|'"), ""); if (litStr.back() == 'u') - return static_cast(builder.create(loc, std::stoul(litStr))); + return static_cast(builder.create(loc, static_cast(std::stoul(litStr)))); else if ((litStr.length() > 2) && std::string_view(litStr).substr(litStr.length() - 3) == "ull") { // The suffix "ull" must be checked before the suffix "l", since "l" // is a suffix of 
"ull". return static_cast( builder.create(loc, static_cast(std::stoull(litStr)))); } else if (litStr.back() == 'l') - return static_cast(builder.create(loc, std::stol(litStr))); + return static_cast(builder.create(loc, static_cast(std::stol(litStr)))); else if (litStr.back() == 'z') { return static_cast( - builder.create(loc, static_cast(std::stoll(litStr)))); + builder.create(loc, static_cast(std::stoll(litStr)))); } else { // Note that a leading minus of a numeric literal is not parsed as // part of the literal itself, but handled separately as a unary diff --git a/src/runtime/distributed/worker/CMakeLists.txt b/src/runtime/distributed/worker/CMakeLists.txt index 4420cfc6e..7a729d578 100644 --- a/src/runtime/distributed/worker/CMakeLists.txt +++ b/src/runtime/distributed/worker/CMakeLists.txt @@ -31,7 +31,7 @@ get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) if(USE_HDFS) - find_library(LIBHDFS3 NAMES libhdfs3.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) + find_library(LIBHDFS3 NAMES hdfs3 HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) else() set(LIBHDFS3 "") endif() diff --git a/src/runtime/local/datastructures/CSRMatrix.cpp b/src/runtime/local/datastructures/CSRMatrix.cpp index e49f88ff0..04abee04c 100644 --- a/src/runtime/local/datastructures/CSRMatrix.cpp +++ b/src/runtime/local/datastructures/CSRMatrix.cpp @@ -31,3 +31,7 @@ template class CSRMatrix; template class CSRMatrix; template class CSRMatrix; template class CSRMatrix; +#if defined(__APPLE__) && defined(__aarch64__) +template class CSRMatrix; +template class CSRMatrix; +#endif diff --git a/src/runtime/local/datastructures/ChunkedTensor.cpp b/src/runtime/local/datastructures/ChunkedTensor.cpp index fb1bb08b2..a1a8a28ff 100644 --- a/src/runtime/local/datastructures/ChunkedTensor.cpp +++ b/src/runtime/local/datastructures/ChunkedTensor.cpp @@ -37,3 +37,7 @@ template class ChunkedTensor; template class 
ChunkedTensor; template class ChunkedTensor; template class ChunkedTensor; +#if defined(__APPLE__) && defined(__aarch64__) +template class ChunkedTensor; +template class ChunkedTensor; +#endif diff --git a/src/runtime/local/datastructures/ContiguousTensor.cpp b/src/runtime/local/datastructures/ContiguousTensor.cpp index c62019fef..bc3293291 100644 --- a/src/runtime/local/datastructures/ContiguousTensor.cpp +++ b/src/runtime/local/datastructures/ContiguousTensor.cpp @@ -37,3 +37,7 @@ template class ContiguousTensor; template class ContiguousTensor; template class ContiguousTensor; template class ContiguousTensor; +#if defined(__APPLE__) && defined(__aarch64__) +template class ContiguousTensor; +template class ContiguousTensor; +#endif diff --git a/src/runtime/local/datastructures/DenseMatrix.cpp b/src/runtime/local/datastructures/DenseMatrix.cpp index 463a5ca0c..a5f2b8bc0 100644 --- a/src/runtime/local/datastructures/DenseMatrix.cpp +++ b/src/runtime/local/datastructures/DenseMatrix.cpp @@ -341,6 +341,11 @@ template class DenseMatrix; template class DenseMatrix; template class DenseMatrix; template class DenseMatrix; +// On macOS ARM64, long long / unsigned long long are distinct from long / unsigned long. 
+#if defined(__APPLE__) && defined(__aarch64__) +template class DenseMatrix; +template class DenseMatrix; +#endif template class DenseMatrix; template class DenseMatrix; template class DenseMatrix; diff --git a/src/runtime/local/datastructures/ValueTypeUtils.cpp b/src/runtime/local/datastructures/ValueTypeUtils.cpp index 4c91b976f..fb3020c08 100644 --- a/src/runtime/local/datastructures/ValueTypeUtils.cpp +++ b/src/runtime/local/datastructures/ValueTypeUtils.cpp @@ -97,6 +97,12 @@ template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode:: template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode::F64; template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode::STR; template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode::FIXEDSTR16; +// On macOS ARM64, long/unsigned long are distinct from long long/unsigned long long +// (int64_t/uint64_t). Add explicit specializations so DenseMatrix etc. can compile. +#if defined(__APPLE__) && defined(__aarch64__) +template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode::SI64; +template <> const ValueTypeCode ValueTypeUtils::codeFor = ValueTypeCode::UI64; +#endif template <> const std::string ValueTypeUtils::cppNameFor = "int8_t"; template <> const std::string ValueTypeUtils::cppNameFor = "int32_t"; @@ -110,6 +116,10 @@ template <> const std::string ValueTypeUtils::cppNameFor = "bool"; template <> const std::string ValueTypeUtils::cppNameFor = "const char*"; template <> const std::string ValueTypeUtils::cppNameFor = "std::string"; template <> const std::string ValueTypeUtils::cppNameFor = "FixedStr"; +#if defined(__APPLE__) && defined(__aarch64__) +template <> const std::string ValueTypeUtils::cppNameFor = "int64_t"; +template <> const std::string ValueTypeUtils::cppNameFor = "uint64_t"; +#endif template <> const std::string ValueTypeUtils::irNameFor = "si8"; template <> const std::string ValueTypeUtils::irNameFor = "si32"; @@ -119,6 +129,10 @@ template <> 
const std::string ValueTypeUtils::irNameFor = "ui32"; template <> const std::string ValueTypeUtils::irNameFor = "ui64"; template <> const std::string ValueTypeUtils::irNameFor = "f32"; template <> const std::string ValueTypeUtils::irNameFor = "f64"; +#if defined(__APPLE__) && defined(__aarch64__) +template <> const std::string ValueTypeUtils::irNameFor = "si64"; +template <> const std::string ValueTypeUtils::irNameFor = "ui64"; +#endif template <> const int8_t ValueTypeUtils::defaultValue = 0; template <> const int32_t ValueTypeUtils::defaultValue = 0; @@ -132,6 +146,10 @@ template <> const bool ValueTypeUtils::defaultValue = false; template <> const char *ValueTypeUtils::defaultValue = ""; template <> const std::string ValueTypeUtils::defaultValue = std::string(""); template <> const FixedStr16 ValueTypeUtils::defaultValue = FixedStr16(); +#if defined(__APPLE__) && defined(__aarch64__) +template <> const long ValueTypeUtils::defaultValue = 0; +template <> const unsigned long ValueTypeUtils::defaultValue = 0; +#endif const std::string ValueTypeUtils::cppNameForCode(ValueTypeCode type) { switch (type) { diff --git a/src/runtime/local/kernels/BinaryOpCode.h b/src/runtime/local/kernels/BinaryOpCode.h index 6d23f322b..c439e91bb 100644 --- a/src/runtime/local/kernels/BinaryOpCode.h +++ b/src/runtime/local/kernels/BinaryOpCode.h @@ -89,7 +89,7 @@ static std::string_view binary_op_codes[] = { * @tparam op The binary operation. */ template -static constexpr bool supportsBinaryOp = false; +inline constexpr bool supportsBinaryOp = false; // Macros for concisely specifying which binary operations should be // supported on which value types. @@ -97,12 +97,12 @@ static constexpr bool supportsBinaryOp = false; // Generates code specifying that the binary operation `Op` should be supported // on the value type `VT` (for the result and the two arguments, for // simplicity). 
-#define SUPPORT(Op, VT) template <> constexpr bool supportsBinaryOp = true; +#define SUPPORT(Op, VT) template <> inline constexpr bool supportsBinaryOp = true; // Generates code specifying that the binary operation `Op` should be supported on // the value types `VTLhs` and `VTRhs` with result `VTRes`. #define SUPPORT_RLR(Op, VTRes, VTLhs, VTRhs) \ - template <> constexpr bool supportsBinaryOp = true; + template <> inline constexpr bool supportsBinaryOp = true; // Generates code specifying that all binary operations of a certain category // should be supported on the given value type `VT` (for the result and the two diff --git a/src/runtime/local/kernels/CMakeLists.txt b/src/runtime/local/kernels/CMakeLists.txt index 425b7c49a..4d9cf82b0 100644 --- a/src/runtime/local/kernels/CMakeLists.txt +++ b/src/runtime/local/kernels/CMakeLists.txt @@ -19,6 +19,18 @@ set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to") set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) find_package(Python3 REQUIRED COMPONENTS Interpreter) +# Apple Clang needs Homebrew's libomp for OpenMP support +if(APPLE) + execute_process(COMMAND brew --prefix libomp + OUTPUT_VARIABLE LIBOMP_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE) + if(LIBOMP_PREFIX) + set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include" CACHE STRING "" FORCE) + set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include" CACHE STRING "" FORCE) + set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE) + set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE) + set(OpenMP_omp_LIBRARY "${LIBOMP_PREFIX}/lib/libomp.dylib" CACHE FILEPATH "" FORCE) + endif() +endif() find_package(OpenMP REQUIRED) # The library of pre-compiled CUDA kernels @@ -129,14 +141,14 @@ list(APPEND LIBS DaphneMetaDataParser MLIRDaphne MLIRDaphneTransforms) list(APPEND LIBS Eigen3::Eigen Arrow::arrow_shared Parquet::parquet_shared) if(USE_PAPI) - find_library(PAPI_LIB NAMES libpapi.so HINTS 
${PROJECT_BINARY_DIR}/installed/lib REQUIRED) + find_library(PAPI_LIB NAMES papi HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) endif() -find_library(HWLOC_LIB NAMES libhwloc.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) +find_library(HWLOC_LIB NAMES hwloc HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) if(USE_HDFS) target_include_directories(AllKernels PUBLIC ${PROJECT_SOURCE_DIR}/thirdparty/installed/include/hdfs) - find_library(LIBHDFS3 NAMES libhdfs3.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) + find_library(LIBHDFS3 NAMES hdfs3 HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED) endif() target_link_libraries(KernelObjLib PUBLIC ${LIBS} ${MPI_LIBRARIES} ${PAPI_LIB} ${HWLOC_LIB} ${LIBHDFS3}) diff --git a/src/runtime/local/kernels/CTable.h b/src/runtime/local/kernels/CTable.h index 8b1344c0c..d25ab4901 100644 --- a/src/runtime/local/kernels/CTable.h +++ b/src/runtime/local/kernels/CTable.h @@ -133,7 +133,7 @@ struct CTable, DenseMatrix, DenseMatrix, V if (isResNumColsFromRhs) resNumCols = *std::max_element(rhsVals, &rhsVals[rhsNumRows]) + 1; res = DataObjectFactory::create>( - resNumRows, resNumCols, std::min(static_cast(lhsNumRows), resNumRows * resNumCols), true); + resNumRows, resNumCols, std::min(static_cast(lhsNumRows), resNumRows * resNumCols), true); } if (isResNumRowsFromLhs && isResNumColsFromRhs) { diff --git a/src/runtime/local/kernels/UnaryOpCode.h b/src/runtime/local/kernels/UnaryOpCode.h index 45a9287af..0b0b1cfb4 100644 --- a/src/runtime/local/kernels/UnaryOpCode.h +++ b/src/runtime/local/kernels/UnaryOpCode.h @@ -85,7 +85,7 @@ static std::string_view unary_op_codes[] = { * @tparam VTRes The result value type. * @tparam VTArg The argument value type. */ -template static constexpr bool supportsUnaryOp = false; +template inline constexpr bool supportsUnaryOp = false; // Macros for concisely specifying which unary operations should be // supported on which value types. 
@@ -93,7 +93,7 @@ template static constexpr bool // Generates code specifying that the unary operation `Op` should be supported // on the value type `VT` (for both the result and the argument, for // simplicity). -#define SUPPORT(Op, VT) template <> constexpr bool supportsUnaryOp = true; +#define SUPPORT(Op, VT) template <> inline constexpr bool supportsUnaryOp = true; // Generates code specifying that all unary operations typically supported on // numeric value types should be supported on the given value type `VT` diff --git a/src/runtime/local/kernels/genKernelInst.py b/src/runtime/local/kernels/genKernelInst.py index 2c70157ea..c0ee7b02c 100755 --- a/src/runtime/local/kernels/genKernelInst.py +++ b/src/runtime/local/kernels/genKernelInst.py @@ -43,6 +43,7 @@ import io import json +import platform import re import sys from typing import List, Tuple @@ -280,7 +281,7 @@ def generateFunction(opCode): "backend": API, # Assumes that the generated catalog file is saved in # the same directory as the kernels libraries. - "libPath": "libAllKernels.so" if API == "CPP" else f"lib{API}Kernels.so" + "libPath": ("libAllKernels.dylib" if platform.system() == "Darwin" else "libAllKernels.so") if API == "CPP" else (f"lib{API}Kernels.dylib" if platform.system() == "Darwin" else f"lib{API}Kernels.so") }) # Generate the function(s). 
diff --git a/src/runtime/local/vectorized/MTWrapper_dense.cpp b/src/runtime/local/vectorized/MTWrapper_dense.cpp index 711f6ad15..fbcf758a0 100644 --- a/src/runtime/local/vectorized/MTWrapper_dense.cpp +++ b/src/runtime/local/vectorized/MTWrapper_dense.cpp @@ -94,10 +94,12 @@ template std::vector qvector; if (ctx->getUserConfig().pinWorkers) { for (int i = 0; i < this->_numQueues; i++) { +#ifdef __linux__ cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(i, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif std::unique_ptr tmp = std::make_unique(len); q.push_back(std::move(tmp)); qvector.push_back(q[i].get()); @@ -260,10 +262,12 @@ template // Multiple Queues addition if (ctx->getUserConfig().pinWorkers) { for (int i = 0; i < this->_numQueues; i++) { +#ifdef __linux__ cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(i, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif std::unique_ptr tmp = std::make_unique(cpu_task_len); q.push_back(std::move(tmp)); qvector.push_back(q[i].get()); diff --git a/src/runtime/local/vectorized/MTWrapper_sparse.cpp b/src/runtime/local/vectorized/MTWrapper_sparse.cpp index 82d8f3b39..a9706a706 100644 --- a/src/runtime/local/vectorized/MTWrapper_sparse.cpp +++ b/src/runtime/local/vectorized/MTWrapper_sparse.cpp @@ -33,10 +33,12 @@ void MTWrapper>::executeCpuQueues( std::vector qvector; if (ctx->getUserConfig().pinWorkers) { for (int i = 0; i < this->_numQueues; i++) { +#ifdef __linux__ cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(i, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif std::unique_ptr tmp = std::make_unique(len); q.push_back(std::move(tmp)); qvector.push_back(q[i].get()); diff --git a/src/runtime/local/vectorized/TaskQueues.h b/src/runtime/local/vectorized/TaskQueues.h index 801fdde64..228343793 100644 --- a/src/runtime/local/vectorized/TaskQueues.h +++ b/src/runtime/local/vectorized/TaskQueues.h @@ -64,11 +64,13 @@ class BlockingTaskQueue : public TaskQueue { } void enqueueTask(Task 
*t, int targetCPU) override { +#ifdef __linux__ // Change CPU pinning before enqueue to utilize NUMA first-touch policy cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(targetCPU, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif enqueueTask(t); } diff --git a/src/runtime/local/vectorized/Tasks.h b/src/runtime/local/vectorized/Tasks.h index f8d67677b..8686f15dc 100644 --- a/src/runtime/local/vectorized/Tasks.h +++ b/src/runtime/local/vectorized/Tasks.h @@ -65,10 +65,9 @@ template struct CompiledPipelineTaskData { DCTX(_ctx); [[maybe_unused]] CompiledPipelineTaskData
withDifferentRange(uint64_t newRl, uint64_t newRu) { - CompiledPipelineTaskData
flatCopy = *this; - flatCopy._rl = newRl; - flatCopy._ru = newRu; - return flatCopy; + return CompiledPipelineTaskData
{_funcs, _isScalar, _inputs, _numInputs, _numOutputs, _outRows, _outCols, + _splits, _combines, newRl, newRu, _wholeResultRows, _wholeResultCols, + _offset, _ctx}; } }; diff --git a/src/runtime/local/vectorized/Worker.h b/src/runtime/local/vectorized/Worker.h index 650032c9c..574f130d7 100644 --- a/src/runtime/local/vectorized/Worker.h +++ b/src/runtime/local/vectorized/Worker.h @@ -19,7 +19,9 @@ #include #include +#ifdef __linux__ #include +#endif #include class Worker { diff --git a/src/runtime/local/vectorized/WorkerCPU.h b/src/runtime/local/vectorized/WorkerCPU.h index d9fdd9578..43017e965 100644 --- a/src/runtime/local/vectorized/WorkerCPU.h +++ b/src/runtime/local/vectorized/WorkerCPU.h @@ -53,11 +53,13 @@ class WorkerCPU : public Worker { void run() override { if (_pinWorkers) { +#ifdef __linux__ // pin worker to CPU core cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(_threadID, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); +#endif } int currentDomain = _physical_ids[_threadID];