diff --git a/CMakeLists.txt b/CMakeLists.txt index de51c0a17b2f6..44a930b98af33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +llama_option_depr(WARNING LLAMA_QNN GGML_QNN) if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 61fe15a15f074..f65c9add08fce 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -197,6 +197,7 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture") +option(GGML_QNN "ggml: use QNN" OFF) option(GGML_OPENCL "ggml: use OpenCL" OFF) option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) @@ -269,6 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-qnn.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h new file mode 100644 index 0000000000000..6d3e66d3d3cbd --- /dev/null +++ b/ggml/include/ggml-qnn.h @@ -0,0 +1,13 @@ +#pragma once + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 43d9fc4fe25e0..8e8cb81bda0a7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -313,6 +313,7 @@ ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(OpenCL) +ggml_add_backend(QNN) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..9c6cc32786d52 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -65,6 +65,10 @@ #include "ggml-kompute.h" #endif +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -187,6 +191,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif +#ifdef GGML_USE_QNN + register_backend(ggml_backend_qnn_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..3e8fa3a1b8117 --- /dev/null +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -0,0 +1,161 @@ +message(STATUS "Using QNN backend") + +option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF) +option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY}) + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + add_compile_options(-g -O0) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android, Windows and Linux") +endif() + +if(NOT DEFINED GGML_QNN_SDK_PATH) + 
# try read from environment variable + # TODO: create a function to search for the SDK path + if(DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() +endif() + +message("CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}") +message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") +message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp") +file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +ggml_add_backend_library(ggml-qnn + ${QNN_SOURCES} + ${COMMON_SOURCES} +) + +target_include_directories(ggml-qnn PRIVATE + ${GGML_QNN_SDK_PATH}/include/QNN + ${CMAKE_CURRENT_LIST_DIR}/qnn + ${CMAKE_CURRENT_LIST_DIR} +) +target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) + +if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") + string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +endif() + +message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}") + +if(GGML_QNN_ENABLE_CPU_BACKEND) + message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) +else() + message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") +endif() + +if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +else() + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") +endif() + +add_subdirectory(shared) + +if(GGML_HEXAGON_NPU_ONLY) + message("GGML_HEXAGON_NPU_ONLY is enabled") + add_compile_definitions(GGML_HEXAGON_NPU_ONLY) + set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON) +else() + message("GGML_HEXAGON_NPU_ONLY is disabled") +endif() + +if(GGML_QNN_ENABLE_HEXAGON_BACKEND) + message("GGML_QNN_ENABLE_HEXAGON_BACKEND is enabled") + add_subdirectory(npu) + target_link_libraries(hexagon-npu-host runtime-common) + target_link_libraries(ggml-qnn PRIVATE hexagon-npu-host) +else() + message("GGML_QNN_ENABLE_HEXAGON_BACKEND is disabled") + target_link_libraries(ggml-qnn PRIVATE runtime-common) +endif() + +# Copy QNN dynamic libraries +set(QNN_DYNAMIC_LIBS "") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + # Android + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-android") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + # Linux x86_64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-linux-clang") + else() + # Linux aarch64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2") + endif() + + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") + file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so") + list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so") + list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS}) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + if(EXISTS 
"${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + message("old ndk, copy gdbserver") + else() + file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server") + list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER}) + message("new ndk, copy lldb-server") + endif() + + file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so") + file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so") + list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS}) + list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS}) + endif() + else() + # Linux + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + # x86_64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-windows-msvc") + else() + # aarch64 + set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc") + endif() + + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") + file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll") + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") + endif() + + list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) +endif() + +foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS}) + message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + add_custom_command( + TARGET ggml-qnn POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${QNN_DYNAMIC_LIB} + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +endforeach() diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt new file mode 100644 index 0000000000000..4c734bb098999 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -0,0 +1,147 @@ +enable_language(ASM) +cmake_policy(SET CMP0115 OLD) + +if(DEFINED ENV{HEXAGON_SDK_ROOT}) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) + message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}") +else() + message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") +endif() + +if(HEXAGON_SDK_ROOT) + include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) +else() + include(${HEXAGON_CMAKE_ROOT}/hexagon_fun.cmake) +endif() + +# Base Include dirs for the Project +set(common_incs + ${CMAKE_CURRENT_BINARY_DIR}/ + ${HEXAGON_SDK_ROOT}/incs/ + ${HEXAGON_SDK_ROOT}/incs/stddef/ + ${HEXAGON_SDK_ROOT}/incs/HAP/ + ${HEXAGON_SDK_ROOT}/rtos/qurt/ + ${HEXAGON_SDK_ROOT}/utils/examples/ +) + +include_directories(${common_incs}) + +if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") + # host build + file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") + file(GLOB host_srcs "${CMAKE_CURRENT_LIST_DIR}/host/*.cpp") + set(stub_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_stub.c") + add_library(hexagon-npu-host STATIC + ${common_srcs} + ${host_srcs} + ${stub_srcs} + ) + + # disable warnings for the stub + set_source_files_properties( + ${stub_srcs} + PROPERTIES + COMPILE_FLAGS "-w" + ) + + build_idl(idl/hexagon_npu.idl hexagon-npu-host) + + # Add compile definitions to the target + target_compile_definitions(hexagon-npu-host PUBLIC + VERIFY_PRINT_ERROR 
+ GGML_QNN_ENABLE_HEXAGON_BACKEND + ) + + target_include_directories(hexagon-npu-host PRIVATE + ${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/ + ${QNN_SDK_ROOT}/include/QNN/ + ${CMAKE_CURRENT_LIST_DIR}/host/ + ${CMAKE_CURRENT_LIST_DIR}/ + ) + + target_include_directories(hexagon-npu-host PUBLIC + ${HEXAGON_SDK_ROOT}/incs/ # TODO: this is for rpc-mem + ) + + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows") + set_target_properties(hexagon-npu-host PROPERTIES OUTPUT_NAME "hexagon_npu") + endif() + + if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux") + target_link_options(hexagon-npu-host PUBLIC -pie) + endif() + + link_options(hexagon-npu-host) + + if(${CMAKE_SYSTEM_NAME} MATCHES "Android") + set(PREBUILT_LIB_DIR "android_aarch64") + elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") + else() + # Windows + set(PREBUILT_LIB_DIR "windows_aarch64") + endif() + + choose_dsprpc("3" dsprpc) # cdsprpc + link_custom_library(hexagon-npu-host ${dsprpc}) +else() + # hexagon npu build + cmake_minimum_required(VERSION 3.14.3) + project(hexagon_npu C CXX ASM) + + # check if QNN_SDK_ROOT is set + if(NOT DEFINED ENV{QNN_SDK_ROOT}) + message(FATAL_ERROR "QNN_SDK_ROOT not defined") + endif() + + set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT}) + message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + include_directories( + ${QNN_SDK_ROOT}/include/QNN/ + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + + file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") + file(GLOB device_srcs "${CMAKE_CURRENT_LIST_DIR}/device/*.cpp") + set(skel_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_skel.c") + add_library(hexagon_npu_skel_OBJS OBJECT + ${common_srcs} + ${device_srcs} + ${skel_srcs} + ) + + if(CMAKE_BUILD_TYPE MATCHES "Debug|Dbg") + message("Debug build, enable all logging") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + _DEBUG + DEBUG_LOGGING + ) + else() + message("Release build, disable debug logging") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + NDEBUG + RELEASE_LOGGING + ) + endif() + + build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) + + # disable warnings for the skel + set_source_files_properties( + ${skel_srcs} + PROPERTIES + COMPILE_FLAGS "-w" + ) + + add_library(hexagon_npu_skel SHARED $) + + target_link_libraries(hexagon_npu_skel + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a + ) + set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") + + copy_binaries(hexagon_npu_skel) +endif() + +# vim: set noet fenc=utf-8 ff=unix ft=cmake : diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp new file mode 100644 index 0000000000000..2368d44f671ef --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -0,0 +1,173 @@ + +#include +#include +#include + +#include + +#include "graph.hpp" +#include "hexagon_npu.h" +#include "op_impl.hpp" +#include "remote.h" +#include "tensor.hpp" +#include "util.hpp" + +#define NPU_UNUSED(x) (void) (x) + +namespace { + +struct npu_device_context { + int unused = 0; + // TODO: should we add tensor context here? 
+};
+
+inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) {
+    return reinterpret_cast<hexagon::tensor *>(h);
+}
+
+inline npu_device_tensor_handle_t tensor_to_handle(hexagon::tensor * tensor) {
+    return reinterpret_cast<npu_device_tensor_handle_t>(tensor);
+}
+
+inline hexagon::graph * graph_from_handle(npu_device_graph_handle_t h) {
+    return reinterpret_cast<hexagon::graph *>(h);
+}
+
+inline npu_device_graph_handle_t graph_to_handle(hexagon::graph * graph) {
+    return reinterpret_cast<npu_device_graph_handle_t>(graph);
+}
+
+}  // namespace
+
+int npu_device_open(const char * uri, remote_handle64 * h) {
+    // TODO: should we have a device context here?
+    auto * context = new (std::nothrow) npu_device_context();
+    if (!context) {
+        DEVICE_LOG_ERROR("Failed to allocate memory for the npu_device_context");
+        return AEE_ENOMEMORY;
+    }
+
+    *h = reinterpret_cast<remote_handle64>(context);
+    return AEE_SUCCESS;
+}
+
+int npu_device_close(remote_handle64 h) {
+    auto * context = reinterpret_cast<npu_device_context *>(h);
+    if (!context) {
+        DEVICE_LOG_ERROR("Invalid npu_device_context handle");
+        return AEE_EINVHANDLE;
+    }
+
+    delete context;
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) {
+    NPU_UNUSED(_h);
+    *alignment = sizeof(HVX_Vector);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0,
+                                       const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
+                                       npu_device_tensor_op op, boolean * is_supported) {
+    NPU_UNUSED(_h);
+    *is_supported = hexagon::support_op(*src0, *src1, *dst, op);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info,
+                                 npu_device_tensor_handle_t * tensor_handle) {
+    NPU_UNUSED(_h);
+    auto * tensor = new (std::nothrow) hexagon::tensor(*info);
+    if (!tensor) {
+        DEVICE_LOG_ERROR("Failed to allocate memory for the tensor");
+        return AEE_ENOMEMORY;
+    }
+
+    *tensor_handle = tensor_to_handle(tensor);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
+                                    npu_device_tensor_handle_t src) {
+    NPU_UNUSED(_h);
+    auto * tensor = tensor_from_handle(tensor_handle);
+    if (!tensor) {
+        return AEE_EINVHANDLE;
+    }
+
+    auto * src_tensor = tensor_from_handle(src);
+    tensor->set_src(index, src_tensor);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
+                                   npu_device_tensor_op op) {
+    NPU_UNUSED(_h);
+    auto * tensor = tensor_from_handle(tensor_handle);
+    if (!tensor) {
+        return AEE_EINVHANDLE;
+    }
+
+    tensor->set_op(op);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle) {
+    NPU_UNUSED(_h);
+    auto * tensor = tensor_from_handle(tensor_handle);
+    if (!tensor) {
+        return AEE_EINVHANDLE;
+    }
+
+    delete tensor;
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) {
+    NPU_UNUSED(_h);
+    auto * graph = new (std::nothrow) hexagon::graph();
+    if (!graph) {
+        return AEE_ENOMEMORY;
+    }
+
+    *graph_handle = graph_to_handle(graph);
+    return AEE_SUCCESS;
+}
+
+AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
+                                      const npu_device_tensor_handle_t * tensor_handles, int tensor_handlesLen) {
+    NPU_UNUSED(_h);
+    auto * graph = graph_from_handle(graph_handle);
+    if (!graph || !tensor_handles || tensor_handlesLen <= 0) {
+        return AEE_EINVHANDLE;
+    }
+
graph->set_tensor(tensor_handles, tensor_handlesLen); + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (!graph) { + return AEE_EINVHANDLE; + } + + if (!graph->compute()) { + return AEE_EFAILED; + } + + return AEE_SUCCESS; +} + +AEEResult npu_device_graph_free(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (graph) { + delete graph; + } + + return AEE_SUCCESS; +} diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp new file mode 100644 index 0000000000000..b21b8add2997c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -0,0 +1,67 @@ + +#include "graph.hpp" + +#include + +#include "op_impl.hpp" +#include "util.hpp" + +namespace hexagon { + +graph::~graph() noexcept { + if (_tensors) { + delete[] _tensors; + } +} + +void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) { + if (_tensor_count > 0) { + delete[] _tensors; + } + + if (tensor_count <= 0) { + _tensors = nullptr; + _tensor_count = 0; + return; + } + + _tensors = new (std::nothrow) tensor *[tensor_count]; + for (int i = 0; i < tensor_count; ++i) { + auto * tensor_obj = reinterpret_cast(tensors[i]); + _tensors[i] = tensor_obj; + DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj, + (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op()); + } + + _tensor_count = tensor_count; + DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count); +} + +bool graph::compute() { + if (!_tensors || !_tensor_count) { + DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this); + return true; // return success if no tensors to compute + } + + DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); + for (size_t i = 0; i < _tensor_count; ++i) { + auto * dst = _tensors[i]; + auto op = dst->get_op(); + auto * func = get_compute_func(op); + if (!func) { + DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); + return false; + } + + if (!func(dst)) { + DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); + return false; + } + + dst->flush(); // TODO: optimize this + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp new file mode 100644 index 0000000000000..22f6615d1435f --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "hexagon_npu.h" +#include "tensor.hpp" + +namespace hexagon { + +class graph { + public: + // TODO: add execute direction here + explicit graph() noexcept {} + + ~graph() noexcept; + + void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count); + + bool compute(); + + private: + tensor ** _tensors = nullptr; + size_t _tensor_count = 0; + + graph(const graph &) = delete; + void operator=(const graph &) = delete; + graph(graph &&) = delete; + void operator=(graph &&) = delete; +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp new file mode 100644 index 0000000000000..7067a1d52bc9a --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -0,0 +1,194 @@ + + +#include "op_impl.hpp" + +#include +#include + 
+#include "op_mul_mat.hpp" + +namespace { + +template +inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector * optr = ((HVX_Vector *) dst); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + + // TODO: prefetch or just use VTCM? + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? 
*iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1))); + } +} + +inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vadd_VsfVsf(a, b); +} + +inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vsub_VsfVsf(a, b); +} + +inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vqf32_vmpy_VsfVsf(a, b); +} + +template +bool element_wise_op(hexagon::tensor * out) { + if (!out) { + return false; + } + + auto * src0 = out->get_src(0); + auto * src1 = out->get_src(1); + if (!src0 || !src1) { + return true; // skip if no src + } + + if (src0->get_ne(0) != src1->get_ne(0)) { + // TODO: handle this case + DEVICE_LOG_ERROR("src0[0] and src1[0] not match: %ld vs %ld\n", (long) src0->get_ne(0), (long) src1->get_ne(0)); + return false; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { + const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3); + const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3); + auto * dst_cube = dst_ptr + i3 * out->get_nb(3); + for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { + const auto * src0_plane = src0_cube + i2 * src0->get_nb(2); + const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2); + auto * dst_plane = dst_cube + i2 * out->get_nb(2); + for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { + // TODO: prefetch row? + auto * src0_row = src0_plane + i1 * src0->get_nb(1); + auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); + _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), + static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); + } + } + } + + return true; +} + +bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { + DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op)); + return false; + } + + if (src0.ne[0] != src1.ne[0]) { + DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]); + return false; + } + + for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { + if (src0.ne[i] != dst.ne[i]) { + DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i], + (long long) dst.ne[i]); + return false; + } + } + + return true; +} + +struct op_capabilities { + npu_device_tensor_op op; + hexagon::compute_func_type compute_func; + hexagon::op_is_supported_func_type is_supported; +}; + +constexpr const op_capabilities kOpCapabilities[] = { + { NPU_OP_MUL_MAT, hexagon::mul_mat_f32, hexagon::is_mul_mat_supported }, + { NPU_OP_ADD, element_wise_op>, is_element_wise_op_supported }, + { NPU_OP_SUB, element_wise_op>, is_element_wise_op_supported }, + { NPU_OP_MUL, element_wise_op>, is_element_wise_op_supported }, +}; + +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32, + "kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32"); + 
+static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); +static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); + +} // namespace + +namespace hexagon { + +compute_func_type get_compute_func(npu_device_tensor_op op) { + if (op >= NPU_OP_COUNT) { + return nullptr; + } + + return kOpCapabilities[op].compute_func; +} + +bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (get_compute_func(op) == nullptr) { + DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op)); + return false; + } + + auto is_supported_func = kOpCapabilities[op].is_supported; + if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) { + DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op)); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp new file mode 100644 index 0000000000000..1fee7769ce04c --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "hexagon_npu.h" +#include "tensor.hpp" + +namespace hexagon { + +typedef bool (*compute_func_type)(tensor * dst); +typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +compute_func_type get_compute_func(npu_device_tensor_op op); + +bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp new file mode 100644 index 0000000000000..fbda69d2d7cc2 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -0,0 +1,146 @@ +#include "op_mul_mat.hpp" + +#include + +namespace { + +inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum = Q6_V_vzero(); + + // TODO: prefetch or just use VTCM? + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? 
iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + } + + // TODO: do we have a better way to do the reduction? + for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + } + + float result; + q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); + return result; +} + +} // namespace + +namespace hexagon { + +bool mul_mat_f32(hexagon::tensor * out) { + if (!out) { + return false; + } + + auto * src0 = out->get_src(0); + auto * src1 = out->get_src(1); + if (!src0 || !src1) { + return true; // skip if no src + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4"); + + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto * src0_ptr = reinterpret_cast(src0->get_data()); + const auto * src1_ptr = reinterpret_cast(src1->get_data()); + auto * dst_ptr = reinterpret_cast(out->get_data()); + for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) { + const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3); + const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3); + auto * dst_cube = dst_ptr + i3 * out->get_nb(3); + for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) { + const auto * src0_plane = src0_cube + i2 / r02 * src0->get_nb(2); + const auto * src1_plane = src1_cube + i2 * src1->get_nb(2); + auto * dst_plane = dst_cube + i2 * out->get_nb(2); + for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { + // TODO: prefetch row? 
+ auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); + for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { + auto * src0_row = src0_plane + i0 * src0->get_nb(1); + // TODO: figure out how to handle a entire row + *dst_row++ = + vec_dot_product_f32_f32(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } + } + } + } + + return true; +} + +bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_MUL_MAT) { + DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op); + return false; + } + + if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) { + DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1], + (long) src1.ne[0], (long) src1.ne[1]); + return false; + } + + if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) { + DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2], + (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); + return false; + } + + if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) { + DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3], + (long) src1.ne[2], (long) src1.ne[3]); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp new file mode 100644 index 0000000000000..cc57d3d1fe6d4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include + +#include "tensor.hpp" + +namespace hexagon { + +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); +constexpr const size_t kAlignMask = kBytesPerVector - 1; + +inline size_t unaligned_bytes(const void * addr) { + return ((size_t) addr) & kAlignMask; +} + +inline bool is_addr_aligned(void * addr) { + return unaligned_bytes(addr) == 0; +} + +bool mul_mat_f32(tensor * out); +bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp new file mode 100644 index 0000000000000..83aa29a609cfc --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -0,0 +1,90 @@ +#pragma once + +#include +#include + +#include "hexagon_npu.h" +#include "util.hpp" + +namespace hexagon { + +constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; + +class tensor { + public: + explicit tensor(const npu_device_tensor_config & info) noexcept : _info(info) { + uint64 phy_address = 0; + void * mmap_address = nullptr; + auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret); + return; + } + + _data = static_cast(mmap_address); + DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_address: %p, phy_address: 0x%lx\n", + (void *) this, (long) _info.ne[0], (long) _info.ne[1], (long) _info.ne[2], (long) _info.ne[3], + _info.buffer_fd, _info.offset, (void *) mmap_address, phy_address); + } + + ~tensor() noexcept { + auto ret 
= HAP_mmap_put(_info.buffer_fd); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret); + } + + DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); + } + + void flush() { + if (_data) { + qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + } + } + + bool set_src(size_t index, tensor * src) { + if (index >= kMaxTensorSrc) { + return false; + } + + _src[index] = src; + return true; + } + + void set_op(npu_device_tensor_op op) { _info.op = op; } + + tensor * get_src(size_t index) const { + if (index >= kMaxTensorSrc) { + return nullptr; + } + + return _src[index]; + } + + const npu_device_tensor_config & get_info() const { return _info; } + + const int64_t get_ne(size_t index) const { return _info.ne[index]; } + + const size_t get_nb(size_t index) const { return _info.nb[index]; } + + npu_device_tensor_op get_op() const { return _info.op; } + + npu_device_tensor_data_type get_type() const { return _info.type; } + + uint8_t * get_data() const { return _data + _info.offset; } + + bool is_valid() const { return _data != nullptr; } + + private: + npu_device_tensor_config _info; + tensor * _src[kMaxTensorSrc] = {}; + uint8_t * _data = nullptr; + + tensor(const tensor &) = delete; + void operator=(const tensor &) = delete; + tensor(tensor &&) = delete; + void operator=(tensor &&) = delete; +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp new file mode 100644 index 0000000000000..12b7dde81e9c4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include "hexagon_npu.h" + +#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__) +#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__) +#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__) + +#ifdef _DEBUG +# undef FARF_LOW +# define FARF_LOW 1 +# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__) +#else +# define DEVICE_LOG_DEBUG(...) (void) 0 +#endif + +namespace hexagon { + +constexpr const char * op_get_name(npu_device_tensor_op op) { + switch (op) { + case NPU_OP_MUL_MAT: + return "MUL_MAT"; + case NPU_OP_ADD: + return "ADD"; + case NPU_OP_SUB: + return "SUB"; + case NPU_OP_MUL: + return "MUL"; + default: + return "UNKNOWN"; + } +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp new file mode 100644 index 0000000000000..ff5c8a320c745 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -0,0 +1,246 @@ +#include "buffer.hpp" + +#include + +#include "host_device.hpp" +#include "tensor.hpp" + +namespace { + +constexpr const int kRpcMemDefaultHeapId = RPCMEM_HEAP_ID_SYSTEM; +constexpr const uint32_t kRpcMemDefaultFlags = RPCMEM_DEFAULT_FLAGS; // TODO: should we use a different flag? 
+ +static hexagon::host_buffer * get_buffer_object(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + +static hexagon::host_buffer_type * get_buffer_type_object(ggml_backend_buffer_type_t buft) { + return reinterpret_cast(buft->context); +} + +void backend_buffer_free_buffer(ggml_backend_buffer_t buffer) { + delete get_buffer_object(buffer); +} + +void * backend_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + return buffer_obj->get_buffer(); +} + +ggml_status backend_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + auto * buffer_type_obj = get_buffer_type_object(buffer->buft); + GGML_ASSERT(buffer_type_obj != nullptr); + + auto * device_object = buffer_type_obj->get_device(); + GGML_ASSERT(device_object != nullptr); + + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + + auto tensor_object = buffer_obj->init_tensor(tensor, device_object->get_device_handle()); + if (!tensor_object) { + LOG_ERROR("Failed to init tensor\n"); + return GGML_STATUS_ALLOC_FAILED; + } + + return GGML_STATUS_SUCCESS; +} + +void backend_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, + size_t size) { + GGML_UNUSED(buffer); + memcpy((char *) tensor->data + offset, data, size); +} + +void backend_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, + size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *) tensor->data + offset, size); +} + +bool backend_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj != nullptr); + memset(buffer_obj->get_buffer(), value, buffer_obj->get_size()); +} + +constexpr const ggml_backend_buffer_i backend_buffer_interface = { + /* .free_buffer = */ backend_buffer_free_buffer, + /* .get_base = */ backend_buffer_get_base, + /* .init_tensor = */ backend_buffer_init_tensor, + /* .memset_tensor = */ nullptr, + /* .set_tensor = */ backend_buffer_set_tensor, + /* .get_tensor = */ backend_buffer_get_tensor, + /* .cpy_tensor = */ backend_buffer_cpy_tensor, + /* .clear = */ backend_buffer_clear, + /* .reset = */ nullptr, +}; + +const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_name(); +} + +ggml_backend_buffer_t backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->allocate_buffer(size); +} + +size_t backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_buffer_alignment(); +} + +size_t backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + auto * buffer_type_obj = get_buffer_type_object(buft); + GGML_ASSERT(buffer_type_obj != nullptr); + return buffer_type_obj->get_max_buffer_size(); +} + +bool 
backend_buffer_is_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == backend_buffer_type_get_name; +} + +} // namespace + +namespace hexagon { + +host_buffer::host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id) : + _allocator(allocator), + _size(size), + _domain_id(domain_id) { + if (!_allocator->is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return; + } + + if (size > _allocator->get_max_alloc_size()) { + LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, _allocator->get_max_alloc_size()); + return; + } + + _data = _allocator->alloc(kRpcMemDefaultHeapId, kRpcMemDefaultFlags, size); + if (!_data) { + LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20))); + return; + } + + LOG_DEBUG("create host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, size, (int) domain_id); +} + +host_buffer::~host_buffer() { + LOG_DEBUG("destroy host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, _size, (int) _domain_id); + _tensors.clear(); + if (_buffer_fd != -1) { + auto ret = _allocator->fastrpc_munmap((int) _domain_id, _buffer_fd, nullptr, 0); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to munmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + return; + } + } + + _allocator->free(_data); +} + +std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remote_handle64 device_handle) { + if (!_data) { + LOG_ERROR("failed to init tensor, rpc memory not initialized\n"); + return std::shared_ptr(); + } + + if (_buffer_fd == -1) { + _buffer_fd = _allocator->to_fd(_data); + if (_buffer_fd < 0) { + LOG_ERROR("failed to get fd from rpc memory\n"); + return std::shared_ptr(); + } + + auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + return std::shared_ptr(); + } + + LOG_DEBUG("mmap rpc memory(%p), fd: %d, addr: %p, size: %zu\n", (void *) _data, _buffer_fd, _data, _size); + } + + auto tensor_object = std::make_shared( + tensor, _buffer_fd, (uint64_t) (reinterpret_cast(tensor->data) - reinterpret_cast(_data)), + device_handle); + if (!tensor_object->is_valid()) { + LOG_ERROR("failed to init tensor, device handle: %p\n", (void *) device_handle); + return std::shared_ptr(); + } + + _tensors.push_back(tensor_object); + return tensor_object; +} + +host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) : + _name(name), + _rpc_mem(rpc_mem) { + iface = { + /* .get_name = */ backend_buffer_type_get_name, + /* .alloc_buffer = */ backend_buffer_type_alloc_buffer, + /* .get_alignment = */ backend_buffer_type_get_alignment, + /* .get_max_size = */ backend_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ backend_buffer_is_host, + }; + device = dev; + context = this; + + _device = reinterpret_cast(device->context); + LOG_DEBUG("[%s]create host_buffer_type %s\n", _device->get_name(), _name.c_str()); +} + +size_t host_buffer_type::get_buffer_alignment() const { + return _device->is_device_initialized() ? 
_device->get_alignment() : 128; +} + +size_t host_buffer_type::get_max_buffer_size() const { + if (!_rpc_mem) { + LOG_ERROR("rpc memory not initialized\n"); + return 0; + } + + return _rpc_mem->get_max_alloc_size(); +} + +ggml_backend_buffer_t host_buffer_type::allocate_buffer(size_t size) { + if (!_rpc_mem) { + LOG_ERROR("rpc memory not initialized\n"); + return nullptr; + } + + if (!_device->is_device_initialized()) { + LOG_ERROR("device is not initialized\n"); + return nullptr; + } + + auto * buffer = new host_buffer(_rpc_mem, size, _device->get_dsp_domain_id()); + if (!buffer->is_valid()) { + delete buffer; + LOG_ERROR("Failed to allocate buffer of size %zu\n", size); + return nullptr; + } + + LOG_DEBUG("[%s]allocate buffer %p, size: %zu\n", _device->get_name(), buffer->get_buffer(), size); + return ggml_backend_buffer_init(this, backend_buffer_interface, buffer, size); +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.hpp b/ggml/src/ggml-qnn/npu/host/buffer.hpp new file mode 100644 index 0000000000000..955944bb98f59 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/buffer.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" +#include "rpc-mem.hpp" + +namespace hexagon { + +class host_tensor; + +class host_buffer { + public: + explicit host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id); + + ~host_buffer(); + + bool is_valid() const { return _data != nullptr; } + + void * get_buffer() { return _data; } + + size_t get_size() const { return _size; } + + std::shared_ptr init_tensor(ggml_tensor * tensor, remote_handle64 device_handle); + + private: + common::rpc_mem_ptr _allocator; + void * _data = nullptr; + size_t _size = 0; + int _buffer_fd = -1; + uint32_t _domain_id = 0; + + std::list> _tensors; + + DISABLE_COPY(host_buffer); + DISABLE_MOVE(host_buffer); +}; + +class npu_device; + +class host_buffer_type : public ggml_backend_buffer_type { + public: + explicit host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem); + + const char * get_name() const { return _name.c_str(); } + + size_t get_buffer_alignment() const; + + size_t get_max_buffer_size() const; + + ggml_backend_buffer_t allocate_buffer(size_t size); + + npu_device * get_device() const { return _device; } + + private: + npu_device * _device = nullptr; + std::string _name; + common::rpc_mem_ptr _rpc_mem; + + DISABLE_COPY(host_buffer_type); + DISABLE_MOVE(host_buffer_type); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp new file mode 100644 index 0000000000000..9e8cf8320408e --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -0,0 +1,82 @@ +#include "graph.hpp" + +#include "tensor.hpp" + +namespace hexagon { + +host_graph::host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle) : _device_handle(device_handle) { + auto status = npu_device_graph_init(_device_handle, &_graph_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to init graph: %d", (int) status); + _graph_handle = 0; + return; + } + + update(cgraph); +} + +host_graph::~host_graph() { + if (_graph_handle) { + npu_device_graph_free(_device_handle, _graph_handle); + _graph_handle = 0; + } +} + +bool host_graph::update(ggml_cgraph * cgraph) { + if (!_graph_handle) { + LOG_ERROR("host_graph not initialized\n"); + return false; + } + + _tensor_handles.clear(); + _tensor_handles.reserve(cgraph->n_nodes); + for (int i = 0; i < 
cgraph->n_nodes; ++i) { + auto * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { + // skip view liked ops + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node), + (void *) node, ggml_type_name(node->type)); + continue; + } + + auto * tensor_obj = host_tensor::from_ggml_tensor(node); + if (!tensor_obj) { + LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node); + continue; + } + + tensor_obj->set_op(node->op); + _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); + LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node), + (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle()); + for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(node->src[j]); + tensor_obj->set_src(j, src); + } + } + + LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) cgraph, _tensor_handles.size()); + if (!_tensor_handles.empty()) { + npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), + (int) _tensor_handles.size()); + } + return true; +} + +bool host_graph::compute() { + if (!_graph_handle) { + LOG_ERROR("host_graph not initialized\n"); + return false; + } + + auto status = npu_device_graph_compute(_device_handle, _graph_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); + return false; + } + + return true; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/graph.hpp b/ggml/src/ggml-qnn/npu/host/graph.hpp new file mode 100644 index 0000000000000..20c917e1203ca --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/graph.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include + +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" + +namespace hexagon { + +class host_graph { + public: + host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle); + + ~host_graph(); + + bool is_valid() const { return _graph_handle != 0; } + + bool update(ggml_cgraph * cgraph); + + bool compute(); + + private: + remote_handle64 _device_handle = 0; + npu_device_graph_handle_t _graph_handle = 0; + std::vector _tensor_handles; + + DISABLE_COPY(host_graph); + DISABLE_MOVE(host_graph); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/host.cpp b/ggml/src/ggml-qnn/npu/host/host.cpp new file mode 100644 index 0000000000000..90c4cd29e8e20 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host.cpp @@ -0,0 +1,153 @@ + +#include +#include + +#include "buffer.hpp" +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "host_device.hpp" + +namespace { + +hexagon::npu_device * get_device_object(ggml_backend_dev_t device) { + return reinterpret_cast(device->context); +} + +hexagon::npu_device * get_device_object(ggml_backend_t backend) { + return get_device_object(backend->device); +} + +const char * backend_dev_get_name(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_name(); +} + +const char * backend_dev_get_description(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_description(); +} + +bool backend_dev_is_npu_device(ggml_backend_dev_t dev) { + return dev->iface.get_name 
== backend_dev_get_name; +} + +void backend_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + GGML_UNUSED(dev); + *free = common::get_system_free_memory_in_bytes(); + *total = common::get_system_total_memory_in_bytes(); +} + +enum ggml_backend_dev_type backend_dev_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_ACCEL; +} + +void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + GGML_ASSERT(get_device_object(dev) != nullptr); + props->name = backend_dev_get_name(dev); + props->description = backend_dev_get_description(dev); + props->type = backend_dev_get_type(dev); + backend_dev_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = {}; +} + +ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + if (!dev_obj->init_device(dev, params)) { + LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev)); + return nullptr; + } + + return new hexagon::npu_backend(dev); +} + +ggml_backend_buffer_type_t backend_dev_get_buffer_type(ggml_backend_dev_t dev) { + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->get_default_buffer_type(dev); +} + +ggml_backend_buffer_t backend_dev_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, + size_t max_tensor_size) { + // TODO: should we use the device memory here? + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); + return ggml_backend_cpu_buffer_from_ptr(ptr, size); +} + +bool backend_dev_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->supports_op(op); +} + +bool backend_dev_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->supports_buft(buft); +} + +bool backend_dev_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + if (!backend_dev_is_npu_device(dev)) { + return false; + } + + auto * dev_obj = get_device_object(dev); + GGML_ASSERT(dev_obj != nullptr); + return dev_obj->offload_op(op); +} + +constexpr const ggml_backend_device_i npu_device_interface = { + /* .get_name = */ backend_dev_get_name, + /* .get_description = */ backend_dev_get_description, + /* .get_memory = */ backend_dev_get_memory, + /* .get_type = */ backend_dev_get_type, + /* .get_props = */ backend_dev_get_props, + /* .init_backend = */ backend_dev_init_backend, + /* .get_buffer_type = */ backend_dev_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ backend_dev_buffer_from_host_ptr, + /* .supports_op = */ backend_dev_supports_op, + /* .supports_buft = */ backend_dev_supports_buft, + /* .offload_op = */ backend_dev_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +class npu_device_proxy : public backend_device_proxy { + public: + explicit npu_device_proxy(backend_index_type device) { _device = std::make_unique(device); } + + const ggml_backend_device_i & get_iface() const { return npu_device_interface; } + + void * get_context() { return _device.get(); } + + private: + std::unique_ptr _device; + + 
DISABLE_COPY(npu_device_proxy); + DISABLE_MOVE(npu_device_proxy); +}; + +} // namespace + +backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device) { + if (device < QNN_BACKEND_COUNT || device >= TOTAL_BACKEND_COUNT) { + return backend_device_proxy_ptr(); + } + + return std::make_shared(device); +} diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp new file mode 100644 index 0000000000000..aa90cfa8bc8f1 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -0,0 +1,305 @@ +#include "host_device.hpp" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-prototypes" +#include +#pragma GCC diagnostic pop + +#include + +#include "graph.hpp" +#include "util.hpp" + +#define SKEL_URI_DEFINE(arch) ("file:///libhexagon_npu_skel_" arch ".so?npu_device_skel_handle_invoke&_modver=1.0") + +namespace { + +struct device_library_info { + hexagon::hexagon_dsp_arch arch; + const char * device_lib_uri; +}; + +constexpr const device_library_info kDeviceLibraryInfo[] = { + { hexagon::NONE, SKEL_URI_DEFINE("") }, + { hexagon::V68, SKEL_URI_DEFINE("v68") }, + { hexagon::V69, SKEL_URI_DEFINE("v69") }, + { hexagon::V73, SKEL_URI_DEFINE("v73") }, + { hexagon::V75, SKEL_URI_DEFINE("v75") }, + { hexagon::V79, SKEL_URI_DEFINE("v79") }, +}; + +const device_library_info & get_device_library_info(hexagon::hexagon_dsp_arch arch) { + for (const auto & info : kDeviceLibraryInfo) { + if (info.arch == arch) { + return info; + } + } + + LOG_ERROR("Unknown DSP arch: %d, using hexagon::NONE\n", arch); + return kDeviceLibraryInfo[0]; +} + +const char * get_domain_param(uint32_t domain_id) { + for (const auto & domain : supported_domains) { + if ((uint32_t) domain.id == domain_id) { + return domain.uri; + } + } + + return ""; +} + +constexpr const ggml_guid kBackendNpuGuid = { 0x7a, 0xd7, 0x59, 0x7d, 0x8f, 0x66, 0x4f, 0x35, + 0x84, 0x8e, 0xf5, 0x9a, 0x9b, 0x83, 0x7d, 0x0a }; + +hexagon::npu_backend * get_backend_object(ggml_backend_t backend) { + return reinterpret_cast(backend); +} + +const char * backend_get_name(ggml_backend_t backend) { + auto * backend_obj = get_backend_object(backend); + GGML_ASSERT(backend_obj != nullptr); + return backend_obj->get_name(); +} + +void backend_free(ggml_backend_t backend) { + delete get_backend_object(backend); +} + +bool backend_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, + ggml_tensor * dst) { + // TODO: implement this + return false; +} + +ggml_status backend_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + auto * backend_obj = get_backend_object(backend); + GGML_ASSERT(backend_obj != nullptr); + return backend_obj->graph_compute(cgraph); +} + +} // namespace + +namespace hexagon { + +// TODO: should we use another domain? 
+npu_device::npu_device(backend_index_type device) : _dsp_domain_id(CDSP_DOMAIN_ID) { + GGML_UNUSED(device); + LOG_DEBUG("[%s]NPU device created\n", _name.c_str()); +} + +npu_device::~npu_device() { + if (_device_handle) { + npu_device_close(_device_handle); + } +} + +size_t npu_device::get_alignment() const { + uint32_t alignment = 0; + npu_device_device_get_alignment(_device_handle, &alignment); + return alignment; +} + +bool npu_device::is_device_initialized() const { + if (!_device_handle) { + LOG_ERROR("[%s]NPU device not opened\n", get_name()); + return false; + } + + if (!_rpc_mem) { + LOG_ERROR("[%s]rpc memory not initialized\n", get_name()); + return false; + } + + return true; +} + +bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) { + if (!init_rpc_mem()) { + return false; + } + + if (!_device_handle) { + auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); + const auto & device_lib_info = get_device_library_info(arch); + std::string device_lib_uri = device_lib_info.device_lib_uri; + device_lib_uri += get_domain_param(_dsp_domain_id); + LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str()); + auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + if (err != AEE_SUCCESS) { + if (err == AEE_ECONNREFUSED) { + LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n", + get_name()); + enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id); + err = npu_device_open(device_lib_uri.c_str(), &_device_handle); + } + + if (err != AEE_SUCCESS) { + LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err, + device_lib_uri.c_str()); + _device_handle = 0; + return false; + } + } + + _description += ' '; + _description += get_dsp_arch_desc(arch); + LOG_DEBUG("[%s]NPU device opened successfully\n", get_name()); + } else { + LOG_DEBUG("[%s]NPU device is already opened\n", get_name()); + } + + return true; +} + +bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const { + return buft && buft->device && buft->device->context == this; +} + +bool npu_device::supports_op_impl(const ggml_tensor * op) { + if (op->op == GGML_OP_NONE) { + return true; + } + + if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type)); + return false; + } + + auto * src0 = op->src[0]; + if (!src0) { + LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type)); + return false; + } + + auto * src1 = op->src[1]; + if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type)); + return false; + } + + auto npu_op = op_to_npu_op(op->op); + if (npu_op == NPU_OP_COUNT) { + LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { + if (!tensor) { + return npu_device_tensor_spec{}; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); + npu_device_tensor_spec spec{}; + spec.ne[0] = tensor->ne[0]; + spec.ne[1] = tensor->ne[1]; + spec.ne[2] = tensor->ne[2]; + spec.ne[3] = tensor->ne[3]; + spec.type = 
type_to_npu_type(tensor->type); + return spec; + }; + + boolean supported = false; + auto src0_spec = get_spec(src0); + auto src1_spec = get_spec(src1); + auto dst_spec = get_spec(op); + auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); + if (ret != AEE_SUCCESS || !supported) { + LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret, + supported); + return false; + } + + LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op)); + return true; +} + +bool npu_device::init_rpc_mem() { + if (!_rpc_mem) { + auto rpc_interface = std::make_shared(); + if (!rpc_interface->is_valid()) { + LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name()); + return false; + } + + auto rpc_mem = std::make_shared(rpc_interface); + _rpc_interface = rpc_interface; + _rpc_mem = rpc_mem; + LOG_DEBUG("[%s]rpc memory initialized\n", get_name()); + } else { + LOG_DEBUG("[%s]rpc memory already initialized\n", get_name()); + } + + return true; +} + +bool npu_device::offload_op(const ggml_tensor * op) { + // TODO: implement this + return false; +} + +ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) { + // Note that this function will be called before the npu_device::init_device + if (!init_rpc_mem()) { + return nullptr; + } + + if (!_default_buffer_type) { + LOG_DEBUG("[%s]Creating default buffer type\n", get_name()); + _default_buffer_type = std::make_unique(dev, _name + "_buffer_type", _rpc_mem); + if (!_default_buffer_type) { + LOG_ERROR("[%s]Default buffer type not initialized\n", get_name()); + return nullptr; + } + } else { + LOG_DEBUG("[%s]Default buffer type already created\n", get_name()); + } + + return _default_buffer_type.get(); +} + +npu_backend::npu_backend(ggml_backend_dev_t dev) : ggml_backend{} { + memccpy(&_guid, &kBackendNpuGuid, 0, sizeof(ggml_guid)); + device = dev; + guid = &_guid; + iface.get_name = backend_get_name; + iface.free = backend_free; + iface.cpy_tensor_async = backend_cpy_tensor_async; + iface.graph_compute = backend_graph_compute; + _device = reinterpret_cast(dev->context); +} + +ggml_status npu_backend::graph_compute(ggml_cgraph * cgraph) { + if (!cgraph || !cgraph->n_nodes) { + LOG_DEBUG("[%s]Graph is empty, nothing to compute\n", get_name()); + return GGML_STATUS_SUCCESS; + } + + std::shared_ptr graph; + if (_graph_cache.count(cgraph) == 0) { + LOG_DEBUG("[%s]graph(%p) not found in cache, creating new graph\n", get_name(), (void *) cgraph); + graph = std::make_shared(cgraph, _device->get_device_handle()); + if (!graph->is_valid()) { + LOG_ERROR("Failed to create graph\n"); + return GGML_STATUS_FAILED; + } + + _graph_cache[cgraph] = graph; + } else { + graph = _graph_cache[cgraph]; + LOG_DEBUG("[%s]graph(%p) found in cache, using existing graph\n", get_name(), (void *) cgraph); + if (!graph->update(cgraph)) { + LOG_ERROR("[%s]Failed to update graph(%p)\n", get_name(), (void *) cgraph); + return GGML_STATUS_FAILED; + } + } + + return graph->compute() ? 
GGML_STATUS_SUCCESS : GGML_STATUS_FAILED; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/host_device.hpp b/ggml/src/ggml-qnn/npu/host/host_device.hpp new file mode 100644 index 0000000000000..efc7914f18615 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/host_device.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#ifndef NDEBUG +# include +#endif + +#include "buffer.hpp" +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "hexagon_npu.h" +#include "rpc-mem.hpp" + +namespace hexagon { + +class npu_device { + public: + explicit npu_device(backend_index_type device); + + ~npu_device(); + + const char * get_name() const { return _name.c_str(); } + + const char * get_description() const { return _description.c_str(); } + + size_t get_alignment() const; + + uint32_t get_dsp_domain_id() const { return _dsp_domain_id; } + + ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev); + + bool is_device_initialized() const; + bool init_device(ggml_backend_dev_t dev, const char * params); + + bool supports_buft(ggml_backend_buffer_type_t buft) const; + bool offload_op(const ggml_tensor * op); + +#ifndef NDEBUG + bool supports_op(const ggml_tensor * op) { + if (supports_op_impl(op)) { + if (op->op != GGML_OP_NONE) { + _supported_op++; + LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + _supported_op.load(), _unsupported_op.load()); + } + + return true; + } + + _unsupported_op++; + LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + _supported_op.load(), _unsupported_op.load()); + return false; + } +#else + bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); } +#endif + + remote_handle64 get_device_handle() const { return _device_handle; } + + private: + bool supports_op_impl(const ggml_tensor * op); + bool init_rpc_mem(); + + std::string _name = "hexagon-npu"; + std::string _description = "Hexagon NPU"; + common::rpc_interface_ptr _rpc_interface; + common::rpc_mem_ptr _rpc_mem; + remote_handle64 _device_handle = 0; + std::unique_ptr _default_buffer_type; + uint32_t _dsp_domain_id = 0; + +#ifndef NDEBUG + std::atomic_uint32_t _supported_op = 0; + std::atomic_uint32_t _unsupported_op = 0; +#endif + + DISABLE_COPY(npu_device); + DISABLE_MOVE(npu_device); +}; + +class host_graph; + +class npu_backend : public ggml_backend { + public: + explicit npu_backend(ggml_backend_dev_t dev); + + ~npu_backend() {} + + const char * get_name() const { + // TODO: should we use the device name here? + return _device->get_name(); + } + + ggml_status graph_compute(ggml_cgraph * cgraph); + + private: + ggml_guid _guid = {}; + npu_device * _device = nullptr; + std::unordered_map> _graph_cache; + + DISABLE_COPY(npu_backend); + DISABLE_MOVE(npu_backend); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp new file mode 100644 index 0000000000000..e7d5f7a88aeb4 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -0,0 +1,88 @@ +#pragma once + +#include "common.hpp" +#include "ggml-impl.h" +#include "hexagon_npu.h" +#include "util.hpp" + +namespace hexagon { + +// TODO: merge this with device tensor? 
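For reference, a minimal sketch of how an application would pick up this device through the public ggml-backend device API, assuming the backend is compiled in and registered; the device name "hexagon-npu" matches the _name member above:

#include <cstdio>
#include <cstring>

#include "ggml-backend.h"

int main() {
    // Walk the registered devices and initialize the Hexagon NPU one by name.
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (std::strcmp(ggml_backend_dev_name(dev), "hexagon-npu") != 0) {
            continue;
        }

        ggml_backend_t backend = ggml_backend_dev_init(dev, /* params = */ nullptr);
        if (backend) {
            std::printf("initialized backend: %s\n", ggml_backend_name(backend));
            ggml_backend_free(backend);
        }
        break;
    }
    return 0;
}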
+class host_tensor { + public: + static host_tensor * from_ggml_tensor(ggml_tensor * tensor) { + if (!tensor || !tensor->extra) { + return nullptr; + } + return static_cast(tensor->extra); + } + + explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) : + _device_handle(device_handle) { + _info.buffer_fd = buffer_fd; + _info.offset = offset; + _info.type = type_to_npu_type(tensor->type); + _info.op = op_to_npu_op(tensor->op); + _info.size = ggml_nbytes(tensor); + + static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); + static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch"); + static_assert(sizeof(_info.nb) == sizeof(tensor->nb), "tensor nb size mismatch"); + memcpy(_info.ne, tensor->ne, sizeof(_info.ne)); + memcpy(_info.nb, tensor->nb, sizeof(_info.nb)); + + auto status = npu_device_tensor_init(_device_handle, &_info, &_device_tensor_handle); + if (status != AEE_SUCCESS) { + LOG_ERROR("Failed to init tensor: %d", (int) status); + _device_tensor_handle = 0; + return; + } + + tensor->extra = this; + _ggml_tensor = tensor; + LOG_DEBUG( + "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), " + "device_tensor_handle(%p)\n", + (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], + (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], + (long) tensor->nb[3], (void *) _device_tensor_handle); + } + + ~host_tensor() { + LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); + if (_device_tensor_handle) { + npu_device_tensor_free(_device_handle, _device_tensor_handle); + _ggml_tensor->extra = nullptr; + } + } + + npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; } + + void set_src(size_t index, host_tensor * src) { + if (index >= DEVICE_TENSOR_MAX_SRC) { + LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index); + return; + } + + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src); + npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle()); + } + + void set_op(ggml_op op) { + _info.op = op_to_npu_op(op); + npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op); + } + + bool is_valid() const { return _device_tensor_handle != 0; } + + private: + remote_handle64 _device_handle = 0; + npu_device_tensor_handle_t _device_tensor_handle = 0; + npu_device_tensor_config _info = {}; + ggml_tensor * _ggml_tensor = nullptr; + + DISABLE_COPY(host_tensor); + DISABLE_MOVE(host_tensor); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp new file mode 100644 index 0000000000000..5db54b661ebde --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -0,0 +1,96 @@ +#include "util.hpp" + +#include + +namespace hexagon { + +enum npu_device_tensor_op op_to_npu_op(ggml_op op) { + switch (op) { + case GGML_OP_MUL_MAT: + return NPU_OP_MUL_MAT; + case GGML_OP_ADD: + return NPU_OP_ADD; + case GGML_OP_SUB: + return NPU_OP_SUB; + case GGML_OP_MUL: + return NPU_OP_MUL; + default: + return NPU_OP_COUNT; + } +} + +enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return NPU_DATA_TYPE_F32; + default: + return NPU_DATA_TYPE_COUNT; + } +} + +hexagon_dsp_arch 
get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) { + if (!rpc_interface || !rpc_interface->is_valid()) { + return NONE; + } + + remote_dsp_capability dsp_caps = {}; + dsp_caps.domain = domain_id; + dsp_caps.attribute_ID = ARCH_VER; + auto ret = rpc_interface->remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_caps, sizeof(dsp_caps)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to get DSP arch: %d\n", ret); + return NONE; + } + + LOG_DEBUG("get DSP arch: 0x%x\n", (int) dsp_caps.capability); + auto arch = dsp_caps.capability & 0xFF; + switch (arch) { + case 0x68: + return V68; + case 0x69: + return V69; + case 0x73: + return V73; + case 0x75: + return V75; + case 0x79: + return V79; + default: + LOG_ERROR("unknown DSP arch: %x\n", arch); + return NONE; + } +} + +const char * get_dsp_arch_desc(hexagon_dsp_arch arch) { + switch (arch) { + case V68: + return "V68"; + case V69: + return "V69"; + case V73: + return "V73"; + case V75: + return "V75"; + case V79: + return "V79"; + case NONE: + default: + return "UnknownArch"; + } +} + +void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) { + if (!rpc_interface || !rpc_interface->is_valid()) { + return; + } + + remote_rpc_control_unsigned_module data = {}; + data.domain = domain_id; + data.enable = 1; + auto ret = rpc_interface->remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, &data, sizeof(data)); + if (ret != AEE_SUCCESS) { + LOG_ERROR("failed to enable unsigned DSP module: 0x%x\n", ret); + } +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp new file mode 100644 index 0000000000000..c001272d4cf7f --- /dev/null +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -0,0 +1,26 @@ +#include "ggml-impl.h" +#include "hexagon_npu.h" +#include "rpc-interface.hpp" + +namespace hexagon { + +enum npu_device_tensor_op op_to_npu_op(ggml_op op); +enum npu_device_tensor_data_type type_to_npu_type(ggml_type type); + +// TODO: merge with qcom_htp_arch +enum hexagon_dsp_arch { + NONE = 0, + V68, + V69, + V73, + V75, + V79, // SD 8 Gen 4 (SM8750) +}; + +hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); + +const char * get_dsp_arch_desc(hexagon_dsp_arch arch); + +void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl new file mode 100644 index 0000000000000..d62e65b3bd877 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -0,0 +1,90 @@ +#include "AEEStdDef.idl" +#include "AEEStdErr.idl" +#include "remote.idl" + +const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; +const uint32_t DEVICE_TENSOR_MAX_SRC = 2; + +interface npu_device : remote_handle64{ + + typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS]; + typedef uint64_t tensor_handle_t; + typedef uint64_t graph_handle_t; + + enum tensor_op { + NPU_OP_MUL_MAT, + NPU_OP_ADD, + NPU_OP_SUB, + NPU_OP_MUL, + NPU_OP_COUNT + }; + + enum tensor_data_type { + NPU_DATA_TYPE_F32, + NPU_DATA_TYPE_COUNT + }; + + struct tensor_spec { + ne_type ne; + tensor_data_type type; + }; + + struct tensor_config { + ne_type ne; + uint64_t nb[DEVICE_TENSOR_MAX_DIMS]; + long buffer_fd; + uint64_t offset; + uint64_t size; + tensor_data_type type; + tensor_op op; + }; + + AEEResult device_get_alignment( + rout uint32_t alignment + ); + + AEEResult device_support_op( + in tensor_spec src0, + in tensor_spec src1, + 
in tensor_spec dst, + in tensor_op op, + rout boolean is_supported + ); + + AEEResult tensor_init( + in tensor_config info, + rout tensor_handle_t tensor_handle + ); + + AEEResult tensor_set_src( + in tensor_handle_t tensor_handle, + in uint64_t index, + in tensor_handle_t src + ); + + AEEResult tensor_set_op( + in tensor_handle_t tensor_handle, + in tensor_op op + ); + + AEEResult tensor_free( + in tensor_handle_t tensor_handle + ); + + AEEResult graph_init( + rout graph_handle_t graph_handle + ); + + AEEResult graph_set_tensor( + in graph_handle_t graph_handle, + in sequence tensor_handles + ); + + AEEResult graph_compute( + in graph_handle_t graph_handle + ); + + AEEResult graph_free( + in graph_handle_t graph_handle + ); +}; diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.cpp b/ggml/src/ggml-qnn/qnn/backend-ops.cpp new file mode 100644 index 0000000000000..d4d2c57cbf4fe --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/backend-ops.cpp @@ -0,0 +1,449 @@ + +#include "backend-ops.hpp" + +#include + +#include "ggml-impl.h" +#include "graph.hpp" +#include "logger.hpp" +#include "op-config.hpp" +#include "tensor.hpp" +#include "utils.hpp" + +namespace { + +qnn::qnn_graph * get_qnn_graph_from_cache(qnn::ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { + auto & graph_cache = ctx->qnn_graph_cache; + std::string graph_key; + auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key); + if (graph_key.empty()) { + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device), + (const void *) cgraph, (int) cgraph->n_nodes); + return nullptr; + } + + auto it = graph_cache.find(graph_key); + qnn::qnn_graph * graph_ptr = nullptr; + if (it != graph_cache.end()) { + auto it = graph_cache.find(graph_key); + QNN_LOG_DEBUG("[%s]found graph %s in cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); + graph_ptr = it->second.get(); + } else { + auto precision = qnn::qnn_graph::kHtpDefault; + if (op_data_type == GGML_TYPE_F16) { + QNN_LOG_DEBUG("[%s][%s]set graph precision to FP16\n", qnn::get_backend_name(ctx->device), + graph_key.c_str()); + precision = qnn::qnn_graph::kHtpFp16; + } + + auto graph = std::make_unique(graph_key, ctx->device, ctx->instance, precision, + ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } + + if (!graph->build_graph_from_ggml_graph(cgraph)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed\n", qnn::get_backend_name(ctx->device)); + return nullptr; + } + + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); + QNN_LOG_DEBUG("[%s]add graph %s to cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); + } + + return graph_ptr; +} + +// TODO: could be merge into op caps array +constexpr const bool kQnnSupportedOps[] = { + true, // GGML_OP_NONE + false, // GGML_OP_DUP + true, // GGML_OP_ADD + false, // GGML_OP_ADD1 + false, // GGML_OP_ACC + true, // GGML_OP_SUB + true, // GGML_OP_MUL + false, // GGML_OP_DIV, disabled for now cause failed on test-backend-ops + false, // GGML_OP_SQR + false, // GGML_OP_SQRT, disabled for now cause failed on test-backend-ops + true, // GGML_OP_LOG + false, // GGML_OP_SIN + false, // GGML_OP_COS + false, // GGML_OP_SUM + false, // GGML_OP_SUM_ROWS + false, // GGML_OP_MEAN + false, // GGML_OP_ARGMAX + false, // GGML_OP_COUNT_EQUAL + false, // GGML_OP_REPEAT + false, // GGML_OP_REPEAT_BACK + false, // GGML_OP_CONCAT + false, // 
GGML_OP_SILU_BACK + false, // GGML_OP_NORM + false, // GGML_OP_RMS_NORM + false, // GGML_OP_RMS_NORM_BACK + false, // GGML_OP_GROUP_NORM + false, // GGML_OP_L2_NORM + + true, // GGML_OP_MUL_MAT + false, // GGML_OP_MUL_MAT_ID + false, // GGML_OP_OUT_PROD + + false, // GGML_OP_SCALE + false, // GGML_OP_SET + false, // GGML_OP_CPY + false, // GGML_OP_CONT + false, // GGML_OP_RESHAPE + false, // GGML_OP_VIEW + false, // GGML_OP_PERMUTE + false, // GGML_OP_TRANSPOSE + false, // GGML_OP_GET_ROWS + false, // GGML_OP_GET_ROWS_BACK + false, // GGML_OP_DIAG + false, // GGML_OP_DIAG_MASK_INF + false, // GGML_OP_DIAG_MASK_ZERO + false, // GGML_OP_SOFT_MAX + false, // GGML_OP_SOFT_MAX_BACK + false, // GGML_OP_ROPE + false, // GGML_OP_ROPE_BACK + false, // GGML_OP_CLAMP + false, // GGML_OP_CONV_TRANSPOSE_1D + false, // GGML_OP_IM2COL + false, // GGML_OP_IM2COL_BACK + false, // GGML_OP_CONV_TRANSPOSE_2D + false, // GGML_OP_POOL_1D + false, // GGML_OP_POOL_2D + false, // GGML_OP_POOL_2D_BACK + false, // GGML_OP_UPSCALE + false, // GGML_OP_PAD + false, // GGML_OP_PAD_REFLECT_1D + false, // GGML_OP_ARANGE + false, // GGML_OP_TIMESTEP_EMBEDDING + false, // GGML_OP_ARGSORT + false, // GGML_OP_LEAKY_RELU + + false, // GGML_OP_FLASH_ATTN_EXT + false, // GGML_OP_FLASH_ATTN_BACK + false, // GGML_OP_SSM_CONV + false, // GGML_OP_SSM_SCAN + false, // GGML_OP_WIN_PART + false, // GGML_OP_WIN_UNPART + false, // GGML_OP_GET_REL_POS + false, // GGML_OP_ADD_REL_POS + false, // GGML_OP_RWKV_WKV6 + false, // GGML_OP_GATED_LINEAR_ATTN + false, // GGML_OP_RWKV_WKV7 + + false, // GGML_OP_UNARY + + false, // GGML_OP_MAP_CUSTOM1 + false, // GGML_OP_MAP_CUSTOM2 + false, // GGML_OP_MAP_CUSTOM3 + + false, // GGML_OP_CUSTOM + + false, // GGML_OP_CROSS_ENTROPY_LOSS + false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + false, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + false, // GGML_UNARY_OP_ABS + false, // GGML_UNARY_OP_SGN + false, // GGML_UNARY_OP_NEG + false, // GGML_UNARY_OP_STEP + false, // GGML_UNARY_OP_TANH + false, // GGML_UNARY_OP_ELU + false, // GGML_UNARY_OP_RELU + false, // GGML_UNARY_OP_SIGMOID + true, // GGML_UNARY_OP_GELU + false, // GGML_UNARY_OP_GELU_QUICK + false, // GGML_UNARY_OP_SILU + false, // GGML_UNARY_OP_HARDSWISH + false, // GGML_UNARY_OP_HARDSIGMOID + false, // GGML_UNARY_OP_EXP +}; + +static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); +static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], "GGML_OP_MUL_MAT is not true"); +static_assert(!kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE should not be true"); +static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false"); +static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table"); + +inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) { + return bits & (uint64_t(1) << type); +} + +inline bool is_tensor_size_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { + constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t { + return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type); + }; + + auto type = tensor->type; + if (ggml_is_quantized(type) && ctx->enable_cpu_dequantize) { + type = GGML_TYPE_F32; // TODO: [quantize] fix me if plan to dequantize to other types + } 
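    // Worked example (illustrative numbers): a 4096 x 4096 F32 activation is
    // 4096 * 4096 * 4 bytes = 64 MiB, so it would clear, say, a 1 GiB limit; with CPU
    // dequantization enabled, quantized tensors are measured at their dequantized F32
    // size rather than their packed size.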
+ + const auto tensor_size = get_tensor_size_in_bytes(tensor, type); + if (ctx->max_tensor_size_in_bytes && tensor_size >= ctx->max_tensor_size_in_bytes) { + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) size(%lld) exceeds the limit(%lld)\n", + qnn::get_backend_name(ctx->device), ggml_get_name(tensor), (int) tensor->ne[0], + (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3], (long long int) tensor_size, + (long long int) ctx->max_tensor_size_in_bytes); + return false; + } + + return true; +} + +bool is_tensor_type_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { + if (!tensor) { + QNN_LOG_DEBUG("tensor is nullptr\n"); + return false; + } + +#ifndef NDEBUG + if (tensor->view_src) { + auto * src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], ggml_get_name(src_tensor), (int) src_tensor->ne[0], (int) src_tensor->ne[1], + (int) src_tensor->ne[2], (int) src_tensor->ne[3]); + } +#endif + + switch (tensor->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + if (!is_type_bit_enabled(ctx->supported_types, tensor->type)) { + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", + qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), + (unsigned int) ctx->supported_types); + return false; + } + break; + default: + QNN_LOG_DEBUG("[%s]unsupported data type %s\n", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type)); + return false; + } + + return true; +} + +bool is_data_reinterpretation_op(ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE; +} + +bool ggnl_qnn_supports_op_tensor(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { + if (op->op == GGML_OP_NONE) { + return true; + } + + if (!is_tensor_type_valid(ctx, op) || !is_tensor_size_valid(ctx, op)) { + return false; + } + + // TODO: fix for other op + const bool cpu_dequant = ctx->enable_cpu_dequantize && op->op == GGML_OP_MUL_MAT; + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!is_tensor_size_valid(ctx, src)) { + return false; + } + + // passthrough the quantized tensor for CPU dequantization + if (!is_tensor_type_valid(ctx, src) && (!cpu_dequant || !ggml_is_quantized(src->type))) { + return false; + } + } + + return true; +} + +bool ggml_qnn_have_same_tensor_types(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; + if (src1) { + if (src0->type != op->type || src1->type != op->type) { + QNN_LOG_DEBUG("[%s][%s]type src0(%s), src1(%s) and op(%s) are not equal\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op), ggml_type_name(src0->type), + ggml_type_name(src1->type), ggml_type_name(op->type)); + return false; + } + } else { + if (src0->type != op->type) { + QNN_LOG_DEBUG("[%s][%s]type src0(%s) and op(%s) are not equal\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ggml_type_name(src0->type), ggml_type_name(op->type)); + return false; + } + } + +#ifdef NDEBUG + GGML_UNUSED(ctx); +#endif + + return true; +} + +// TODO: move to caps array? 
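The supported_types / cpu_preprocess_types fields consulted above are plain per-type bitmasks: bit i set means ggml type i is enabled. A minimal sketch of that convention, with made-up mask values:

#include <cstdint>

#include "ggml.h"

// Bit i of the mask corresponds to ggml_type i, matching is_type_bit_enabled() above.
constexpr uint64_t type_bit(ggml_type type) {
    return uint64_t(1) << type;
}

// Example only: a device that natively handles F32 and F16.
constexpr uint64_t kExampleSupportedTypes = type_bit(GGML_TYPE_F32) | type_bit(GGML_TYPE_F16);

static_assert((kExampleSupportedTypes & type_bit(GGML_TYPE_F16)) != 0, "F16 bit should be set");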
+bool ggml_qnn_supports_matmul_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; + if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) { + // TODO: remove the blocker here when we support permute op + QNN_LOG_DEBUG("[%s][MUL_MAT]data reorganization op is not supported, (%s, %s)\n", + qnn::get_backend_name(ctx->device), ggml_op_name(src0->op), ggml_op_name(src1->op)); + return false; + } + + switch (ctx->device) { + case QNN_BACKEND_NPU: + if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { + /* + * TODO: remove the blocker here when NPU backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n"); + return false; + } + // fall through, from test here, the convert op is super slow on NPU: + // https://github.com/usefulsensors/qc_npu_benchmark + case QNN_BACKEND_GPU: + if (!ggml_qnn_have_same_tensor_types(ctx, op) && op->type != GGML_TYPE_F32) { + // for different tensor types and not float32, we don't support it currently, since there's no convert + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 and dst types are not equal\n", + qnn::get_backend_name(ctx->device)); + return false; + } + if (op->type == GGML_TYPE_F32 && ggml_is_quantized(src0->type) && + !is_type_bit_enabled(ctx->cpu_preprocess_types, src0->type)) { + // for such cases that src0 is quantized and op is float32, check if the quant type is enabled + QNN_LOG_DEBUG("[%s][MUL_MAT]quantized src0 type %s is not enabled\n", + qnn::get_backend_name(ctx->device), ggml_type_name(src0->type)); + return false; + } + break; + default: + break; + } + + if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal\n", qnn::get_backend_name(ctx->device)); + return false; + } + + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op\n", qnn::get_backend_name(ctx->device)); + return true; +} + +#ifndef NDEBUG + +void print_tensor_info(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) { + const char * supported = is_supported ? 
"supported" : "unsupported"; + std::string op_key; + qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key); + + QNN_LOG_DEBUG("[%s][%s]op was %s, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), op_key.c_str(), + supported, ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); +} + +#endif + +} // namespace + +namespace qnn { + +bool device_supports_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { + // Note that this function could be called before the device context is initialized + if (op->op == GGML_OP_NONE) { + return true; + } + + if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) { +#ifndef NDEBUG + ctx->unsupported_op_count++; + print_tensor_info(ctx, op, false); +#endif + return false; + } + + if (!ggnl_qnn_supports_op_tensor(ctx, op)) { +#ifndef NDEBUG + ctx->unsupported_op_count++; + print_tensor_info(ctx, op, false); +#endif + return false; + } + + bool is_op_supported = true; + if (op->op == GGML_OP_UNARY) { + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_GELU) { + // TODO: fix this + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU\n"); + is_op_supported = false; + } + } else { + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; + switch (op->op) { + case GGML_OP_MUL: + // TODO: fix this when we have the support for mul with rms_norm + if (ctx->enable_cpu_dequantize && (src0->op == GGML_OP_RMS_NORM || src1->op == GGML_OP_RMS_NORM)) { + QNN_LOG_DEBUG("[%s][%s]skip unsupported mul with rms norm, (%s, %s)\n", + qnn::get_backend_name(ctx->device), ggml_op_desc(op), ggml_op_desc(src0), + ggml_op_desc(src1)); + is_op_supported = false; + break; + } + // fall through, just skip the mul with rms_norm, in llama, its at start of decoder block + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + // TODO: move to op caps array? 
+ if (!ggml_are_same_shape(src0, src1)) { + QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n", + qnn::get_backend_name(ctx->device), ggml_op_desc(op)); + is_op_supported = false; + } + break; + case GGML_OP_MUL_MAT: + is_op_supported = ggml_qnn_supports_matmul_op(ctx, op); + break; + + default: + is_op_supported = ggml_qnn_have_same_tensor_types(ctx, op); + break; + } + } + +#ifndef NDEBUG + if (is_op_supported) { + ctx->supported_op_count++; + } else { + ctx->unsupported_op_count++; + } + + print_tensor_info(ctx, op, is_op_supported); +#endif + + return is_op_supported; +} + +bool device_compute_graph(qnn::ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device), + (int) cgraph->n_nodes); + + auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph, ctx->convert_context); + + QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success); + return success; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/backend-ops.hpp b/ggml/src/ggml-qnn/qnn/backend-ops.hpp new file mode 100644 index 0000000000000..564a64a40e654 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/backend-ops.hpp @@ -0,0 +1,59 @@ +#pragma once + +#ifndef NDEBUG +# include +#endif + +#include +#include +#include +#include + +#include "convert.hpp" +#include "ggml-backend.h" +#include "ggml-qnn.h" +#include "ggml.h" +#include "graph.hpp" +#include "qnn-lib.hpp" + +namespace qnn { + +typedef std::unordered_map> qnn_graph_cache_t; + +struct ggml_backend_qnn_device_context { + // initialize in constructor + backend_index_type device; + size_t threads; + std::string name; + std::string description; + + // initialize in qnn init + qnn::qcom_socinfo socinfo = {}; + size_t max_tensor_size_in_bytes; + std::shared_ptr instance; + std::shared_ptr qnn_interface; + + qnn::qnn_graph_cache_t qnn_graph_cache; + std::shared_ptr convert_context = std::make_shared(); + +#ifndef NDEBUG + std::atomic_uint32_t supported_op_count = 0; + std::atomic_uint32_t unsupported_op_count = 0; +#endif + + bool enable_cpu_dequantize = false; + uint64_t supported_types; + uint64_t cpu_preprocess_types; + + explicit ggml_backend_qnn_device_context(backend_index_type device, size_t threads, const char * name, + uint64_t supported_types) : + device(device), + threads(threads), + name(name), + supported_types(supported_types) {} +}; + +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/buffer.hpp b/ggml/src/ggml-qnn/qnn/buffer.hpp new file mode 100644 index 0000000000000..2840f78fb51ba --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/buffer.hpp @@ -0,0 +1,187 @@ +#pragma once + +#include +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace qnn { + +/** + * @brief An interface for managing generic QNN buffers. + * + * This abstract class defines the interface for managing generic memory buffers in a QNN context. + */ +class qnn_buffer_interface { + public: + virtual ~qnn_buffer_interface() = default; + + /** + * @brief Checks if the buffer is valid. + * + * This pure virtual function must be implemented by derived classes to check + * the validity of the buffer. + * + * @return true if the buffer is valid, false otherwise. 
+ */ + virtual bool is_valid() const = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ + virtual uint8_t * get_buffer() = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ + virtual size_t get_size() const = 0; + + /** + * @brief Gets the QNN memory handle associated with the buffer. + * + * This pure virtual function must be implemented by derived classes to return + * the memory handle associated with the buffer. + * + * @return The memory handle, or null if no valid QNN memory handle is attached. + */ + virtual Qnn_MemHandle_t get_mem_handle() const = 0; +}; + +using qnn_buffer_ptr = std::shared_ptr; + +/** + * @brief A class for managing QNN RPC memory buffers. + * + * This class is responsible for allocating, registering, and managing a buffer in RPC memory. + * It ensures that the buffer is properly allocated and registered with the QNN instance, and + * handles cleanup of the buffer and its associated memory handle upon destruction. + */ +class qnn_rpc_buffer : public qnn_buffer_interface { + public: + qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) : + _size(size), + _qnn_instance(qnn_instance) { + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { + QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null\n"); + // let the destructor free the buffer + return; + } + + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", (void *) _qnn_rpc_buffer, (int) size); + } + + ~qnn_rpc_buffer() { + if (_qnn_instance) { + if (_qnn_rpc_mem_handle) { + _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); + } + + if (_qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + } + + bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + + uint8_t * get_buffer() override { return _qnn_rpc_buffer; } + + size_t get_size() const override { return _size; } + + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } + + private: + size_t _size = 0; + uint8_t * _qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + qnn_instance_ptr _qnn_instance; + + DISABLE_COPY(qnn_rpc_buffer); + DISABLE_MOVE(qnn_rpc_buffer); +}; + +/** + * @brief A class for managing QNN memory buffers allocated in regular memory. + * + * This class is responsible for allocating, managing, and freeing memory buffers + * in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide + * a consistent interface for buffer management. 
+ */ +class qnn_mem_buffer : public qnn_buffer_interface { + public: + explicit qnn_mem_buffer(const uint8_t * data, size_t size) { + _buffer = reinterpret_cast(qnn::page_align_alloc(size)); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + return; + } + + _size = size; + + if (data) { + memcpy(_buffer, data, size); + } + + QNN_LOG_DEBUG("alloc buffer: %p, size: %ld\n", (void *) _buffer, (long) size); + } + + explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} + + ~qnn_mem_buffer() { + QNN_LOG_DEBUG("free buffer: %p, size: %ld\n", (void *) _buffer, (long) _size); + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const override { return _buffer != nullptr; } + + uint8_t * get_buffer() override { return _buffer; } + + size_t get_size() const override { return _size; } + + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + + private: + size_t _size = 0; + uint8_t * _buffer = nullptr; + + DISABLE_COPY(qnn_mem_buffer); + DISABLE_MOVE(qnn_mem_buffer); +}; + +class qnn_mem_buffer_slice : public qnn_buffer_interface { + public: + qnn_mem_buffer_slice(const uint8_t * buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} + + bool is_valid() const override { return _buffer && _size; } + + uint8_t * get_buffer() override { return _buffer; } + + size_t get_size() const override { return _size; } + + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + + private: + uint8_t * _buffer = nullptr; + size_t _size = 0; + + DISABLE_COPY(qnn_mem_buffer_slice); + DISABLE_MOVE(qnn_mem_buffer_slice); +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/convert.cpp b/ggml/src/ggml-qnn/qnn/convert.cpp new file mode 100644 index 0000000000000..9719bac345eee --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/convert.cpp @@ -0,0 +1,155 @@ + +#include "convert.hpp" + +#include "logger.hpp" + +namespace { + +size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) { + GGML_ASSERT(ggml_blck_size(dst_type) == 1); + size_t nbytes = ggml_type_size(dst_type); + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes *= dimensions[i]; // tight packing + } + + return nbytes; +} + +// from ggml_backend_blas_mul_mat, when omp available, use it otherwise will fall back to standard lib solution +// TODO: remove this when we can fall back the convert to blas backend +#ifdef GGML_USE_OPENMP + +void convert_tensor_impl(const ggml_tensor * src, int max_threads, + std::shared_ptr & output_buffer) { + const auto ne03 = src->ne[3]; + const auto ne02 = src->ne[2]; + const auto ne01 = src->ne[1]; + const auto ne00 = src->ne[0]; + const auto ne_plane = ne01 * ne00; + const auto nb03 = src->nb[3]; + const auto nb02 = src->nb[2]; + const auto nb01 = src->nb[1]; + const int min_cols_per_thread = 4096; + void * wdata = output_buffer->get_buffer(); + const auto to_float = ggml_get_type_traits(src->type)->to_float; + GGML_ASSERT(to_float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1); + +# pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; 
i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + + return output_buffer; +} + +#else + +void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector> & tasks, + std::shared_ptr & output_buffer) { + const auto ne03 = src->ne[3]; + const auto ne02 = src->ne[2]; + const auto ne01 = src->ne[1]; + const auto ne00 = src->ne[0]; + const auto ne_plane = ne01 * ne00; + const auto nb03 = src->nb[3]; + const auto nb02 = src->nb[2]; + const auto nb01 = src->nb[1]; + const int min_cols_per_thread = 4096; + void * wdata = output_buffer->get_buffer(); + const auto to_float = ggml_get_type_traits(src->type)->to_float; + GGML_ASSERT(to_float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1); + + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } + tasks.clear(); +} + +#endif + +} // namespace + +namespace qnn { + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type) { + convert_context->buffers.resize(tensors.size()); + std::vector output_buffers(tensors.size()); + for (size_t i = 0; i < tensors.size(); ++i) { + const ggml_tensor * src = tensors[i]; + if (src->type == target_data_type) { + continue; + } + + auto & data_buffer = convert_context->buffers[i]; + const auto dst_size = get_convert_buffer_size(src->ne, target_data_type); + if (!data_buffer || data_buffer->get_size() < dst_size) { +#ifndef NDEBUG + auto old_size = data_buffer ? 
data_buffer->get_size() : 0; + QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i, + ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size); +#endif + data_buffer = std::make_shared(dst_size); + } + + // TODO: add more restrictions to the buffer slice here + std::shared_ptr output_buffer = + std::make_shared(data_buffer->get_buffer(), dst_size); + + QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src), + ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size, + convert_context->n_threads); + +#ifdef GGML_USE_OPENMP + convert_tensor_impl(src, convert_context->n_threads, output_buffer); +#else + convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer); +#endif + output_buffers[i] = output_buffer; + } + + return output_buffers; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/convert.hpp b/ggml/src/ggml-qnn/qnn/convert.hpp new file mode 100644 index 0000000000000..818004c587ba8 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/convert.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +#include "buffer.hpp" +#include "ggml-qnn.h" +#include "tensor.hpp" +#include "utils.hpp" + +namespace qnn { + +// see also: ggml_backend_blas_context +struct qnn_convert_context_t { + int n_threads = std::thread::hardware_concurrency(); + std::vector> buffers; +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif +}; + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp new file mode 100644 index 0000000000000..e559cfdb28627 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp @@ -0,0 +1,394 @@ +#include +#include +#include + +#include "backend-ops.hpp" +#include "common.hpp" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "logger.hpp" +#include "tensor.hpp" +#include "utils.hpp" + +namespace { + +qnn::ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { + return reinterpret_cast(dev->context); +} + +qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend buffer object + * ----------------------------------------------------------------------------------------------- + */ +void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto * ctx = get_buffer_context(buffer); + delete ctx; +} + +void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * ctx = get_buffer_context(buffer); + return ctx->get_buffer(); +} + +ggml_status ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); + return GGML_STATUS_SUCCESS; +} + +void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy((char *) tensor->data + offset, data, size); +} + +void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *) tensor->data + offset, size); +} + +bool 
ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * ctx = get_buffer_context(buffer); + memset(ctx->get_buffer(), value, ctx->get_size()); +} + +constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .memset_tensor = */ nullptr, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, +}; + +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend object + * ----------------------------------------------------------------------------------------------- + */ +const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + auto * dev_ctx = get_device_context(buft->device); + return qnn::get_backend_name(dev_ctx->device); +} + +ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + qnn::qnn_buffer_interface * ctx = new qnn::qnn_mem_buffer(size); + if (!ctx->is_valid()) { + return nullptr; + } + + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld\n", qnn::get_backend_name(get_device_context(buft->device)->device), + (void *) ctx->get_buffer(), (long) size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + +size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + // TODO: fix this + return 32; +} + +size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + // TODO: get the max size from device + return 1024L * 1024 * 1024; +} + +bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + // TODO: fix this + GGML_UNUSED(buft); + return true; +} + +const char * ggml_backend_qnn_name(ggml_backend_t backend) { + auto * device_ctx = get_device_context(backend->device); + return device_ctx->name.c_str(); +} + +void ggml_backend_qnn_free(ggml_backend_t backend) { + auto * device_ctx = get_device_context(backend->device); + QNN_LOG_INFO("idx %d, name:%s\n", device_ctx->device, device_ctx->name.c_str()); + + auto & instance = device_ctx->instance; + if (instance) { + device_ctx->qnn_graph_cache.clear(); + device_ctx->qnn_interface.reset(); + instance->qnn_finalize(); + instance.reset(); + } + + delete backend; +} + +ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + return &guid; +} + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, + ggml_tensor * dst) { + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, 
dst_is_qnn: %d\n", ggml_get_name(src), ggml_get_name(dst), + (int) ggml_backend_is_qnn(backend_src), (int) ggml_backend_is_qnn(backend_dst)); + return false; +} + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[QNN_BACKEND_COUNT]; + auto * dev_ctx = get_device_context(dev); + if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { + ggml_backend_qnn_buffer_types[dev_ctx->device] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ + dev, + /* .context = */ nullptr, + }; + } else { + GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev); + } + + return &ggml_backend_qnn_buffer_types[dev_ctx->device]; +} + +ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS : + GGML_STATUS_FAILED; +} + +constexpr const ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, +}; + +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend device object + * ----------------------------------------------------------------------------------------------- + */ +const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + auto * dev_ctx = get_device_context(dev); + return qnn::get_backend_name(dev_ctx->device); +} + +const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + auto * dev_ctx = get_device_context(dev); + return dev_ctx->description.empty() ? 
qnn::get_backend_desc(dev_ctx->device) : dev_ctx->description.c_str(); +} + +void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + GGML_UNUSED(dev); + *free = common::get_system_free_memory_in_bytes(); + *total = common::get_system_total_memory_in_bytes(); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576); +} + +enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { + return qnn::get_device_caps(get_device_context(dev)->device).type; +} + +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_qnn_device_get_name(dev); + props->description = ggml_backend_qnn_device_get_description(dev); + props->type = ggml_backend_qnn_device_get_type(dev); + ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ false, + /* buffer_from_host_ptr */ false, + /* events */ false, + }; +} + +ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) { + if (!extend_lib_search_path) { + extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; + QNN_LOG_WARN( + "extend_lib_search_path is nullptr, will " + "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default\n"); + } + + auto * dev_ctx = get_device_context(dev); + const auto device = dev_ctx->device; + QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); + auto instance = std::make_shared(extend_lib_search_path, device); + if (!instance->qnn_init(nullptr)) { + QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); + return nullptr; + } + auto qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface) { + QNN_LOG_WARN("qnn subsystem failure\n"); + return nullptr; + } + + std::string device_name = qnn::get_backend_name(device); + QNN_LOG_INFO("qnn device name %s\n", device_name.c_str()); + const auto & device_caps = qnn::get_device_caps(device); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = device_caps.supported_types; + dev_ctx->cpu_preprocess_types = device_caps.cpu_preprocess_types; + dev_ctx->max_tensor_size_in_bytes = device_caps.max_tensor_size_in_bytes; + { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s(%s)", qnn::get_chipset_desc(dev_ctx->socinfo.soc_model), + qnn::get_backend_desc(dev_ctx->device)); + dev_ctx->description = buffer; + } + // TODO: remove npu from here if hardware quantization is supported + dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU; + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return qnn_backend; +} + +ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char * params) { + return ggml_backend_qnn_init_with_device_context(dev, params); +} + +ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_qnn_buffer_type(dev); +} + +ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, + size_t max_tensor_size) { + // TODO + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); + return 
ggml_backend_cpu_buffer_from_ptr(ptr, size); +} + +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + // Note that this function could be called before the device context is initialized + auto * device_ctx = get_device_context(dev); + return qnn::device_supports_op(device_ctx, op); +} + +bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); +} + +bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +#ifdef NDEBUG + GGML_UNUSED(dev); + GGML_UNUSED(op); +#else + auto * device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op\n", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); +#endif + return false; +} + +constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { + /* .get_name = */ ggml_backend_qnn_device_get_name, + /* .get_description = */ ggml_backend_qnn_device_get_description, + /* .get_memory = */ ggml_backend_qnn_device_get_memory, + /* .get_type = */ ggml_backend_qnn_device_get_type, + /* .get_props = */ ggml_backend_qnn_device_get_props, + /* .init_backend = */ ggml_backend_qnn_device_init, + /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_qnn_device_supports_op, + /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, + /* .offload_op = */ ggml_backend_qnn_device_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +class qnn_device_proxy : public backend_device_proxy { + public: + explicit qnn_device_proxy(backend_index_type device) { + const auto & device_caps = qnn::get_device_caps(device); + _device_context = std::make_unique( + /* .device = */ device, // init from the last device, i.e. NPU + /* .threads = */ 1, // TODO: fix this + /* .name = */ qnn::get_backend_name(device), + /* .supported_types = */ device_caps.supported_types); + } + + const ggml_backend_device_i & get_iface() const { return ggml_backend_qnn_device_interface; } + + void * get_context() { return _device_context.get(); } + + private: + std::unique_ptr _device_context; +}; + +} // namespace + +backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device) { + if (device >= QNN_BACKEND_COUNT) { + QNN_LOG_ERROR("[qnn]invalid device %d\n", device); + return backend_device_proxy_ptr(); + } + +#ifndef GGML_QNN_ENABLE_CPU_BACKEND + if (device == QNN_BACKEND_CPU) { + /* + * here we skip the initialization of CPU device, + * cause it'll block unsupported ops fallback to ggml cpu backend + */ + GGML_LOG_DEBUG("qnn backend registry skip CPU device\n"); + return backend_device_proxy_ptr(); + } +#endif + + return std::make_unique(device); +} diff --git a/ggml/src/ggml-qnn/qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp new file mode 100644 index 0000000000000..70fc71c211c14 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -0,0 +1,550 @@ + +#include "graph.hpp" + +#include +#include + +#include "ggml-impl.h" +#include "logger.hpp" +#include "op-config.hpp" +#include "profiler.hpp" +#include "tensor.hpp" + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define GRAPH_PROFILE_HANDLE (_event_tracer ? 
_event_tracer->get_handle() : nullptr) +# define GRAPH_PROFILE_PRINT() \ + if (_event_tracer) { \ + _event_tracer->print_profile_events(); \ + } \ + (void) 0 +#else +# define GRAPH_PROFILE_HANDLE (nullptr) +# define GRAPH_PROFILE_PRINT() (void) 0 +#endif + +namespace { +using qnn_tensor_cache_t = std::unordered_map; + +int get_op_max_rank(const ggml_tensor * op) { + int max_rank = ggml_n_dims(op); + for (int i = 0; i < GGML_MAX_DIMS && op->src[i]; ++i) { + max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); + } + + return max_rank; +} + +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + ggml_type override_data_type, backend_index_type device, + Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t & tensor_cache) { + GGML_ASSERT(tensor); + if (tensor_cache.count(tensor)) { + return tensor_cache[tensor]; + } + + QNN_LOG_DEBUG("[%s]create_tensor_with_cache, data_type: %s, override_data_type: %s\n", + qnn::get_backend_name(device), ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + auto data_type = override_data_type != GGML_TYPE_COUNT ? override_data_type : tensor->type; + + // We've observed that some tensors have the same name with different op types will be added to the same graph + // which will cause the graph build failed. To avoid this, we append the op type to the tensor name. + char tensor_name[256]; + snprintf(tensor_name, sizeof(tensor_name), "%s_%s", ggml_get_name(tensor), ggml_op_desc(tensor)); + auto qnn_tensor = std::make_shared(type, std::string(tensor_name), tensor->ne, data_type, + rank, device, graph_handle, qnn_instance); + tensor_cache[tensor] = qnn_tensor; + return qnn_tensor; +} + +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + ggml_type override_data_type, backend_index_type device, + Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t & tensor_cache) { + qnn::qnn_tensor_array_t tensors; + for (auto * tensor : ggml_tensors) { + tensors.push_back(create_tensor_with_cache(tensor, type, rank, override_data_type, device, graph_handle, + qnn_instance, tensor_cache)); + } + + return tensors; +} + +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, + backend_index_type device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t & tensor_cache) { + auto operation = qnn::create_op(dst, name, qnn_instance); + + // input tensors + qnn::qnn_tensor_array_t input_qnn_tensors; + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto * src = dst->src[i]; + auto input_qnn_tensor = create_tensor_with_cache(src, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, + device, graph_handle, qnn_instance, tensor_cache); + input_qnn_tensors.push_back(input_qnn_tensor); + } + operation->set_input_tensors(input_qnn_tensors); + + // output tensor + qnn::qnn_tensor_array_t output_qnn_tensors = + create_tensors_with_cache({ dst }, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, device, + graph_handle, qnn_instance, tensor_cache); + operation->set_output_tensors(output_qnn_tensors); + + // initialize operation + if (!operation->initialize_op_nodes(device, graph_handle)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed\n", qnn::get_backend_name(device), name.c_str()); + return nullptr; + } + + return operation; 
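+    // The tensor_cache is what stitches the per-op QNN nodes into one graph:
+    // when two ggml ops share a tensor (the output of one op feeding the next),
+    // both look it up in the cache and reference the same ggml_qnn_tensor, so
+    // QNN sees a connected graph instead of isolated single-op subgraphs.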
+} + +/** + * @brief Extracts input and output tensors from a computational graph. + * + * This function identifies the input and output tensors of a computational graph by analyzing the connectivity between + * tensor nodes. It does this by iterating over each node in the graph, using a connectivity map that associates every + * tensor with its number of incoming connections (in_degree), outgoing connections (out_degree), and an insertion index + * that preserves order. The insertion index is used later to sort the tensors in their original discovery order. + * + * TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are + * connected in a way that allows for unambiguous categorization. + */ +int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array_t & inputs, + qnn::ggml_tensor_array_t & outputs) { + struct _tensor_connectivity_info { + size_t in_degree = 0; + size_t out_degree = 0; + size_t insert_index = 0; + }; + + using ggml_tensor_connectivity_map_t = std::unordered_map; + + ggml_tensor_connectivity_map_t connectivity_map; + int rank = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { + // TODO: remove GGML_OP_VIEW after view op is supported + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s, skipped\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); + continue; + } + + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); + rank = std::max(rank, ggml_n_dims(dst)); + if (connectivity_map.count(dst) == 0) { + connectivity_map[dst] = { + 1, // in-degree, at least 1 + 0, + connectivity_map.size(), + }; + } else { + ++(connectivity_map[dst].in_degree); + } + + for (size_t j = 0; j < GGML_MAX_DIMS && dst->src[j]; ++j) { + auto * src = dst->src[j]; + rank = std::max(rank, ggml_n_dims(src)); + + QNN_LOG_DEBUG("node[%d]: src[%d]: %s(%s), type: %s\n", i, (int) j, ggml_get_name(src), ggml_op_desc(src), + ggml_type_name(src->type)); + if (connectivity_map.count(src) == 0) { + connectivity_map[src] = { + 0, + 1, // out-degree, at least 1 + connectivity_map.size(), + }; + } else { + ++(connectivity_map[src].out_degree); + } + } + } + + for (const auto & kv : connectivity_map) { + if (kv.second.in_degree == 0) { + inputs.push_back(kv.first); + } + + if (kv.second.out_degree == 0) { + outputs.push_back(kv.first); + } + } + + std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + + std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + + return rank; +} + +/* + * for src0_F32, src1_F32, dst_F32 -> GGML_TYPE_COUNT + * for src0_F16, src1_F16, dst_F16 -> GGML_TYPE_COUNT + * for src0_F16, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F16, dst_F32 -> GGML_TYPE_F32 + */ +ggml_type get_override_data_type(const qnn::ggml_tensor_array_t & inputs, const qnn::ggml_tensor_array_t & outputs) { + GGML_ASSERT(!inputs.empty()); + ggml_type override_data_type = inputs.front()->type; + bool is_same_data_type = true; + for (auto * tensor : inputs) { + 
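+        // std::min over ggml_type relies on the enum ordering
+        // F32 < F16 < quantized < GGML_TYPE_COUNT (the same ordering checked by
+        // the static_assert in qnn_graph::build_graph_from_ggml_graph), so the
+        // "minimum" type is the widest float type seen among the tensors.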
QNN_LOG_DEBUG("input_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = std::min(override_data_type, tensor->type); + } + + for (auto * tensor : outputs) { + QNN_LOG_DEBUG("output_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = std::min(override_data_type, tensor->type); + } + + return is_same_data_type ? GGML_TYPE_COUNT : override_data_type; +} + +static const QnnHtpGraph_CustomConfig_t kDefaultHvxConfig = []() { + QnnHtpGraph_CustomConfig_t hvx_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + return hvx_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kDefaultDlbcConfig = []() { + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + return dlbc_config; +}(); + +/* + * 1 = Faster preparation time, less optimal graph + * 2 = Longer preparation time, more optimal graph + * 3 = Longest preparation time, most likely even more optimal graph: + * QNN_HTP_DEVICE_CONFIG_OPTION_SOC configuration will be taken into account when possible, details see HTP Backend Specific Page + */ +static const QnnHtpGraph_CustomConfig_t kDefaultOptConfig = []() { + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; +#ifndef NDEBUG + opt_config.optimizationOption.floatValue = 3; +#else + opt_config.optimizationOption.floatValue = 1; +#endif + return opt_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kHtpPrecisionConfigF16 = []() { + QnnHtpGraph_CustomConfig_t precision_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + return precision_config; +}(); + +constexpr QnnHtpGraph_CustomConfig_t make_vtcm_config(size_t vtcm_size_in_mb) { + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; + return vtcm_config; +} + +constexpr QnnGraph_Config_t make_graph_config(const QnnHtpGraph_CustomConfig_t * custom_config) { + QnnGraph_Config_t graph_config = QNN_GRAPH_CONFIG_INIT; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = const_cast(custom_config); + return graph_config; +} + +} // namespace + +namespace qnn { + +ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph\n"); + return GGML_TYPE_COUNT; + } + + ggml_type override_type = GGML_TYPE_COUNT; + { + // TODO: can we have a better approach to get the override_type here? 
+ // though it is O(n) + O(mlog(m)) complexity, our graph is small, so it is fine + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + get_io_tensors_from_graph(cgraph, inputs, outputs); + if (!inputs.empty() && !outputs.empty()) { + override_type = get_override_data_type(inputs, outputs); + QNN_LOG_DEBUG("get_graph_key, override_type: %s\n", ggml_type_name(override_type)); + } else { + QNN_LOG_DEBUG("get_graph_key, no input or output tensors\n"); + } + } + + ggml_type min_op_type = GGML_TYPE_COUNT; + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping\n"); + continue; + } + + if (op->op == GGML_OP_NONE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE) { + QNN_LOG_DEBUG("%s in graph, skipping\n", ggml_op_desc(op)); + continue; + } + + min_op_type = std::min(min_op_type, op->type); + if (is_start) { + qnn::get_qnn_op_desc(op, is_start, override_type, output); + is_start = false; + } else { + output += '#'; + qnn::get_qnn_op_desc(op, is_start, override_type, output); + } + } + } + + if (cgraph->n_nodes > 1) { + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + qnn::append_tensor_shape_and_type(last_op, output); + } + + return min_op_type; +} + +qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb) : + _graph_name(graph_name), + _device(device), + _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]creating\n", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + std::vector graph_configs; + + auto hvx_config = make_graph_config(&kDefaultHvxConfig); + graph_configs.push_back(&hvx_config); + + auto dlbc_config = make_graph_config(&kDefaultDlbcConfig); + graph_configs.push_back(&dlbc_config); + + auto opt_config = make_graph_config(&kDefaultOptConfig); + graph_configs.push_back(&opt_config); + + auto vctm_sub_config = make_vtcm_config(vtcm_size_in_mb); + auto vtcm_config = make_graph_config(&vctm_sub_config); + graph_configs.push_back(&vtcm_config); + + if (precision == qnn_graph::kHtpFp16) { + auto precision_config = make_graph_config(&kHtpPrecisionConfigF16); + graph_configs.push_back(&precision_config); + QNN_LOG_DEBUG("[%s][%s]set precision to F16\n", get_backend_name(device), graph_name.c_str()); + } + + graph_configs.push_back(nullptr); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs.data(), &graph_handle); + } else { + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); + return; + } + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + if (device == QNN_BACKEND_NPU) { + _event_tracer = std::make_shared( + graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE); + } +#endif + + _graph_handle = graph_handle; + _qnn_interface = qnn_interface; + QNN_LOG_DEBUG("[%s][%s]create succeed\n", 
get_backend_name(device), graph_name.c_str()); +} + +qnn_graph::~qnn_graph() { + QNN_LOG_DEBUG("[%s][%s]destroy\n", get_backend_name(_device), _graph_name.c_str()); +} + +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start\n", get_backend_name(_device), _graph_name.c_str()); + + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s][%s]rank: %d, graph_nodes: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), + _graph_name.c_str(), rank, cgraph->n_nodes, int(inputs.size()), int(outputs.size())); + + { + static_assert( + GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32, + "GGML_TYPE enum order is not correct"); + + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), + _graph_name.c_str()); + + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]set override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + ggml_type_name(override_data_type)); + } + + qnn_tensor_cache_t tensor_cache; + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, override_data_type, + _device, _graph_handle, _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, GGML_TYPE_COUNT, + _device, _graph_handle, _qnn_instance, tensor_cache); + qnn_op_config_array_t operations; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + +#ifndef NDEBUG + { + std::string op_desc; + get_qnn_op_desc(dst, true, GGML_TYPE_COUNT, op_desc); + QNN_LOG_DEBUG("[%s]create op(%s) with qnn op(%s)\n", get_backend_name(_device), op_desc.c_str(), + get_qnn_op_name(dst)); + } +#endif + auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); // TODO: fix op name + operations.push_back(operation); + } + + _tensor_inputs = std::move(input_tensors); + _tensor_outputs = std::move(output_tensors); + _operations = std::move(operations); + if (!finalize()) { + return false; + } + } + + QNN_LOG_DEBUG("[%s][%s]build succeed\n", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context) { + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), + _graph_name.c_str()); +#ifdef NDEBUG + get_io_tensors_from_graph(cgraph, inputs, outputs); +#else + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, + int(inputs.size()), int(outputs.size())); +#endif + } + + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str()); + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + 
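+                          // (when an override type is active, the inputs are first converted,
+                          //  e.g. dequantized or cast to F32, into scratch buffers obtained from
+                          //  convert_context, and those buffers are bound in place of the
+                          //  original ggml data)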
ggml_type_name(override_data_type)); + auto buffers = convert(convert_context, inputs, override_data_type); + if (!qnn::bind_tensors_with_custom_buffers(inputs, buffers, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } + } else { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } + } + + if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } + } + + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str()); + auto & qnn_tensor_inputs = _qnn_tensor_inputs; + auto & qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), + qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), + qnn_tensor_outputs.size(), GRAPH_PROFILE_HANDLE, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s][execute]NPU crashed. SSR detected. Caused QNN graph execute error.\n", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s][execute]error: %s\n", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str()); + } + + GRAPH_PROFILE_PRINT(); + return true; +} + +bool qnn_graph::finalize() { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str()); + + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); + return false; + } + + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, GRAPH_PROFILE_HANDLE, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]finalize succeed\n", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/graph.hpp b/ggml/src/ggml-qnn/qnn/graph.hpp new file mode 100644 index 0000000000000..5e862112fbd1e --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/graph.hpp @@ -0,0 +1,93 @@ + +#pragma once + +#include +#include +#include + +#include "convert.hpp" +#include "ggml-qnn.h" +#include "op-config.hpp" +#include "profiler.hpp" +#include "qnn-lib.hpp" + +namespace qnn { + +/** + * @class qnn_graph + * @brief Manages a QNN graph, converting a GGML graph to QNN format and handling its execution. + * + * This class is responsible for building a QNN graph from a given GGML graph, + * determining its input/output tensors, finalizing the configuration, and + * executing the graph on the specified backend device. + */ +class qnn_graph { + public: + enum htp_precision { + kHtpDefault = 0, + kHtpFp16, + }; + + /** + * @brief Generates a unique key for a given computation graph (cgraph). + * + * This key is used to cache the graph, enabling efficient reuse of previously + * compiled graphs. 
The key is constructed by concatenating the descriptions + * of the operations and their associated tensor dimensions within the graph. + * + * Example key format: "MUL_MATf32_2048x8192q4_K_2048x2f32#MUL(SILU,MUL_MAT)#MUL_MAT(NONE,MUL)#ADD(MUL_MAT,ADD)f32_2048x2f32" + * + * @param cgraph The computation graph for which the key is generated. + * @param output The string where the generated key will be stored. + * @return The max ggml_type of all tensors in the graph. + * + * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. + */ + static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output); + + explicit qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb); + + ~qnn_graph(); + + bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph); + + bool execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context); + + bool is_valid() const { return _graph_handle != nullptr; } + + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + + qnn_instance_ptr get_qnn_instance() { return _qnn_instance; } + + const std::string & get_name() const { return _graph_name; } + + backend_index_type get_device() const { return _device; } + + private: + bool finalize(); + + const std::string _graph_name; + const backend_index_type _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_instance_ptr _qnn_instance; + qnn_interface_ptr _qnn_interface; + qnn_op_config_array_t _operations; + + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + // profiler + qnn_event_tracer_ptr _event_tracer; +#endif + + DISABLE_COPY(qnn_graph); + DISABLE_MOVE(qnn_graph); +}; + +using qnn_graph_ptr_t = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml new file mode 100644 index 0000000000000..f4c6575902948 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml @@ -0,0 +1,88 @@ + + + + + GgmlMulMat + + + GGML MulMat operator + + + + + in[0] + + src0 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + in[1] + + src1 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + out[0] + + dst + + true + BACKEND_SPECIFIC + + 4D + [N, C, H , W] + + + + + HTP + + + + + + + GgmlMulMat + + + + + GgmlMulMat + + + in[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + in[1] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + out[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile new file mode 100644 index 0000000000000..f177822d35a06 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile @@ -0,0 +1,357 @@ +# check all setup prerequisites if the command goal is not clean +ifneq ($(MAKECMDGOALS),clean) +ifndef QNN_INCLUDE +$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN +endif +ifeq ($(wildcard $(QNN_INCLUDE)),) +$(error "ERROR: QNN_INCLUDE path is not set. 
QNN include paths must be set to obtain BE headers necessary to compile the package") +endif +ifndef QNN_TARGET_LIB +$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android +endif +ifeq ($(wildcard $(QNN_TARGET_LIB)),) +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") +endif +endif + +ifndef HEXAGON_SDK_ROOT +$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") +endif + +ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) +$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") +endif + +HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) + +$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") +# Users should note that the tools version may change between hexagon sdk versions +# Following combination of SDK and Tool version is supported +# fix the sdk root for new versions +HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT) +HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT) + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT) +HEXAGON_TOOLS_VERSION_V68 := 8.7.06 +HEXAGON_TOOLS_VERSION_V69 := 8.7.06 +HEXAGON_TOOLS_VERSION_V73 := 8.7.06 +HEXAGON_TOOLS_VERSION_V75 := 8.7.06 +HEXAGON_TOOLS_VERSION_V79 := 8.7.06 + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_TOOLS_VERSION_X86 := 8.7.06 + +ifndef ANDROID_NDK_ROOT +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +endif +endif + +ifndef PACKAGE_NAME +export +PACKAGE_NAME := $(notdir $(shell pwd)) +$(info "INFO: No package name defined. Using current directory name: $(PACKAGE_NAME) as the package name") +endif + +WORK := build +SRC_DIR := src +OP_SRC_DIR := src/ops +OP_INCLUDE_DIR := ./include +OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags +LIBRARY_NAME := libQnn$(PACKAGE_NAME).so +SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android + + +COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function +COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ +COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" + +X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools + +# Ensure hexagon sdk tool version can be retrieved +ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) +$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ + \ + Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") +endif + +#Check tools for hexagon_v68 are present. 
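+# (Each htp_* goal below only validates the SDK root it actually needs, so a
+#  partial toolchain setup can still build a subset of the targets.)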
+ifeq ($(MAKECMDGOALS),htp_v68) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v69) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v73) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v75) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") +endif +endif + +#Check tools for hexagon_v79 are present. +ifeq ($(MAKECMDGOALS),htp_v79) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V79)") +endif +endif + + + +endif +OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) +OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) +HFILES = $(wildcard $(QNN_INCLUDE)/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) +OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) +OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) + +#======= Assembly ======== +OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) +OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) +OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) +OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) +OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) +OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) +OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) +OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) +OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) +OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) +OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S) +OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79)))) + +OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) +OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) + + +all: htp_v68 htp_x86 htp_aarch64 + +#============================================================================================================ +# Setup compiler, compiler instructions and linker for x86 +X86_CXX ?= clang++-9 +# Checking if clang++-9 is present. 
If not switch to clang++ +ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0) + X86_CXX := clang++ +endif +X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare +X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX +X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof +linux_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for hexagon +HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED + +HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef +HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef +HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef +HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef +HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef + + +HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++ + + +HEX_LDFLAGS = +hexagon_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for aarch64 +AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID +AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers +ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ +AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) +AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp 
-lQnnHtpPrepare +aarch64_objs = +#============================================================================================================ +# Setup targets and goals + +htp_x86: X86_BUILD + +htp_v68: HEXAGON_BUILD_V68 + +htp_v69: HEXAGON_BUILD_V69 + +htp_v73: HEXAGON_BUILD_V73 + +htp_v75: HEXAGON_BUILD_V75 + +htp_v79: HEXAGON_BUILD_V79 + + + +htp_aarch64: AARCH64_BUILD + +AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) + +HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) + +HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) + +HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) + +HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) + +HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME) + + + +X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) + + +define build_objs = +ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) +$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) +else +$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") +endif +endef + +$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v79)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v79)) + +$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) + +# x86 +$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android: + @mkdir -p $@/ops + +$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) + $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) + +# v68 +$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ 
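+# The $(hexagon-v68_objs) prerequisite below was assembled by the build_objs
+# macro above; e.g. running "make htp_v68" (with HEXAGON_SDK_ROOT and
+# QNN_INCLUDE/QNN_SDK_ROOT set) compiles every listed source into
+# $(WORK)/hexagon-v68 and links the objects into libQnn$(PACKAGE_NAME).so here.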
+ +$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) + $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v69 +$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) + $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v73 +$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) + $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +#v75 +$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) + $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +#v79 +$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79 + $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES) + $(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + + + +# aarch64 +$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) + $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) + +clean: + -rm -rf $(WORK) + +.PHONY: all clean diff --git 
a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml new file mode 100644 index 0000000000000..f4c6575902948 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml @@ -0,0 +1,88 @@ + + + + + GgmlMulMat + + + GGML MulMat operator + + + + + in[0] + + src0 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + in[1] + + src1 + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + out[0] + + dst + + true + BACKEND_SPECIFIC + + 4D + [N, C, H , W] + + + + + HTP + + + + + + + GgmlMulMat + + + + + GgmlMulMat + + + in[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + in[1] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + out[0] + QNN_DATATYPE_FLOAT_16 + QNN_DATATYPE_FLOAT_32 + + + + + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp new file mode 100644 index 0000000000000..df9ab364209b5 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp @@ -0,0 +1,274 @@ +//============================================================================== +// Auto Generated Code for GgmlOpPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which ops and graph optimizations are + * registered to the HTP Core. + * Append the latest OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{{"GgmlMulMat"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = +nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
maximal log level sg_maxLogLevel + * can be set by Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) + */ +/* + * for alternative logging method provided by HTP core, please refer to log.h + */ +static QnnLog_Callback_t sg_logCallback = + nullptr; // user provided call back function pointer for logging +static QnnLog_Level_t sg_maxLogLevel = + (QnnLog_Level_t)0; // maximal log level used in user provided logging +static bool sg_logInitialized = + false; // tracks whether user provided logging method has been initialized + + +/* +* op initialization +* needs to be global in the package +* one initialization per package before any op definitions +* syntax: INIT_PACKAGE_OP_DEF() +*/ +INIT_PACKAGE_OP_DEF() + +/* +* optimization initialization +* needs to be global in the package +* one initialization per package before any optimization definitions +* syntax: INIT_PACKAGE_OPTIMIZATION_DEF() +*/ +INIT_PACKAGE_OPTIMIZATION_DEF() + +/* + * op parameter order initialization + * needs to be global in the package + * one initialization per package before any op parameter order definitions + * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() + */ +INIT_PACKAGE_PARAM_ORDER_DEF() + +/* + * axis parameter name list + * optional + * needs to be global in the package + * one list per package + * for listing axis parameter names passed into Qnn_AddNode API + * HTP backend auto-adjusts values in axis parameters based on HTP backfilling + * note: HTP backend backfills tensor dimensions to 4 dimensions + * syntax: LIST_PACKAGE_AXIS_PARAMS(...) + * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") + */ +// LIST_PACKAGE_AXIS_PARAMS() + +/* + * per-channel quantized op name list + * optional + * needs to be global in the package + * one list per package + * for listing op names which support per-channel quantization + * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding + * inside Qnn_Tensor_t types + * HTP backend only supports per-channel scale ops + * i.e. along last dimension, offset is always zero + * if an op name is marked as having per-channel scale support, and in + * QNN_AddNode, at least one input, parameter, or output has + * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: + * then: + * HTP backend will pass to op implementation function the following: + * output(s), input(s), parameter(s), + * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s) + * + * optimization rules can be used to remove extra perChannelScale tensors + * + * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) + */ + +// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + +/* +* Declare and define the special intialize function for HTP Backend to load +*/ +INIT_PKG_CORE_INIT_FUNC() + +/* op package API's */ + +Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { + if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + + /* + * op parameter order registration + * registers all defined op parameter orders in the package + * syntax: REGISTER_PACKAGE_PARAM_ORDERS() + */ + REGISTER_PACKAGE_PARAM_ORDERS() + + /* + * op axis parameter name registration + * registers all axis parameter names in the package + * used with LIST_PACKAGE_AXIS_PARAMS(...) 
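+     * note: this package currently lists no axis parameters or per-channel
+     * quantized ops (the LIST_* macros above are commented out), so these
+     * REGISTER_* calls are effectively no-ops kept from the generated template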
+ * syntax: REGISTER_PACKAGE_AXIS_PARAMS() + */ + REGISTER_PACKAGE_AXIS_PARAMS() + + /* + * per-channel scale op name registration + * registers all per-channel scale op names in the package + * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + */ + REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + + sg_globalInfra = infrastructure; + sg_packageInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t** info) { + if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO; + + sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + sg_packageInfo.packageName = sg_packageName; + sg_packageInfo.operationNames = sg_opNames.data(); + sg_packageInfo.numOperations = sg_opNames.size(); + sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; + sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; + + *info = &sg_packageInfo; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) { + if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT; + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_logCallback = callback; + sg_maxLogLevel = maxLogLevel; + sg_logInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_maxLogLevel = maxLogLevel; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() { + if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + sg_logCallback = nullptr; + sg_maxLogLevel = (QnnLog_Level_t)0; + sg_logInitialized = false; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "GgmlMulMat"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } + else{ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend for now, + * no auto-generated implementations are created. Users should see example for full function signatures. 
+ * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t* + * numKernels) + * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl) + *(version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t GgmlOpPackageTerminate() { +if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + +sg_globalInfra = nullptr; +sg_packageInitialized = false; +return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + + +/* latest version */ +Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) { + if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = GgmlOpPackageInit; + interface->v1_4.terminate = GgmlOpPackageTerminate; + interface->v1_4.getInfo = GgmlOpPackageGetInfo; + interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = GgmlOpPackageLogInitialize; + interface->v1_4.logSetLevel = GgmlOpPackageLogSetLevel; + interface->v1_4.logTerminate = GgmlOpPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + + diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp new file mode 100644 index 0000000000000..137522cc80773 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp @@ -0,0 +1,213 @@ +//============================================================================== +// Auto Generated Code for GgmlOpPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat); + +// op execute function declarations +template +GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1); + +// forward declaration of sample cost function +static float ggmlmulmatCostFunc(const Op * op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX) + * syntax: DEF_PACKAGE_OP(F,OP) + * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") + */ +DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE) + * and provided flags + * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) + * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP, + * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages) + * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl), "GgmlMulMat", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl), + * "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op execution functions + * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode + * will be passed into op execution functions + * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted + * name will be abandoned + * if two or more op packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode + * DEFAULT is used when MANDATORY is false + * if provided as Qnn_Param_t*, + * DEFAULT will be used for graph construction when this parameter is not provided at + * Qnn_addNode + * if provided as nullptr, + * graph construction will skip this parameter when this parameter is not provided at + * Qnn_addNode + */ + +namespace { + +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); +constexpr const size_t kAlignMask = kBytesPerVector - 1; + +inline size_t unaligned_bytes(const void * addr) { + return ((size_t) addr) & kAlignMask; +} + +inline bool is_addr_aligned(void * addr) { + return unaligned_bytes(addr) == 0; +} + +inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) { + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum = Q6_V_vzero(); + + // TODO: prefetch? + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++; + HVX_Vector curr1 = is_addr_aligned(iptr1) ? 
prev1 : *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % kFloatsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); + } + + // TODO: do we have a better way to do the reduction? + for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + } + + float result; + q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); + return result; +} + +template +inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { + // TODO: handle strides? + if (in_1.dim(1) != in_0.dim(1)) { + return GraphStatus::ErrorDimensions; + } + + size_t dims[4] = { in_1.dim(0), in_0.dim(0) }; + out_0.set_dims(dims); + + auto in0_ptr = (float *) in_0.raw_data_const(); + auto in1_ptr = (float *) in_1.raw_data_const(); + auto out_ptr = (float *) out_0.raw_data(); + + for (size_t i = 0; i < dims[0]; i++) { + // TODO: prefetch? + auto * in1_row = in1_ptr + i * in_1.dim(1); + auto * out_row = out_ptr + i * dims[1]; + for (size_t j = 0; j < dims[1]; j++) { + *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1)); + } + } + + return GraphStatus::Success; +} + +} // namespace + +/* execute functions for ops */ + +template +GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) { + if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) { + return GraphStatus::ErrorBadInput; + } + + if (in_0.rank() != in_1.rank()) { + return GraphStatus::ErrorRank; + } + + auto rank = in_0.rank(); + switch (rank) { + case 4: + case 3: + // TODO: add implementation + return GraphStatus::ErrorUnsupported; + case 2: + return mul_mat_2d_f32(out_0, in_0, in_1); + } + + return GraphStatus::ErrorRank; +} + +__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_GgmlMulMat); diff --git a/ggml/src/ggml-qnn/qnn/logger.cpp b/ggml/src/ggml-qnn/qnn/logger.cpp new file mode 100644 index 0000000000000..0ffa12e7b1bb3 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/logger.cpp @@ -0,0 +1,50 @@ + +#include "logger.hpp" + +#ifndef NDEBUG + +# include + +# include "QnnInterface.h" +# include "QnnTypes.h" +# include "System/QnnSystemInterface.h" + +void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { + static std::mutex log_mutex; + static char s_ggml_qnn_logbuf[4096]; + + char log_level_desc; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = 'E'; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = 'W'; + 
break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = 'I'; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = 'D'; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = 'V'; + break; + default: + log_level_desc = 'U'; + break; + } + + { + std::lock_guard lock(log_mutex); + int size = vsnprintf(s_ggml_qnn_logbuf, sizeof(s_ggml_qnn_logbuf), fmt, argp); + if (size > 0 && s_ggml_qnn_logbuf[size - 1] != '\n') { + QNN_LOG_INFO("[%c]%s\n", log_level_desc, s_ggml_qnn_logbuf); + } else { + QNN_LOG_INFO("[%c]%s", log_level_desc, s_ggml_qnn_logbuf); + } + } +} +#else +void qnn::sdk_logcallback(const char *, QnnLog_Level_t, uint64_t, va_list) {} +#endif diff --git a/ggml/src/ggml-qnn/qnn/logger.hpp b/ggml/src/ggml-qnn/qnn/logger.hpp new file mode 100644 index 0000000000000..309ae3e985a28 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/logger.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include + +#include "ggml-impl.h" +#include "ggml.h" + +namespace qnn { +void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn + +#define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) +#define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) +#define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) + +#ifndef NDEBUG +# define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) +#else +# define QNN_LOG_DEBUG(...) +#endif diff --git a/ggml/src/ggml-qnn/qnn/op-config-base.hpp b/ggml/src/ggml-qnn/qnn/op-config-base.hpp new file mode 100644 index 0000000000000..c2370000b235d --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/op-config-base.hpp @@ -0,0 +1,152 @@ +#pragma once + +#include +#include + +#include "common.hpp" +#include "ggml-qnn.h" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +/** + * @class ggml_qnn_op_config + * @brief Abstract base class for configuring QNN operations. + * + * This class provides an interface for creating and managing tensors, + * adding operations to a graph, and binding/unbinding input and output tensors. + */ +class ggml_qnn_op_config { + public: + virtual ~ggml_qnn_op_config() {} + + /** + * @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom input tensors are provided, the input tensors will be automatically created from the input ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the input tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the input tensors. + * + * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. + */ + virtual void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; + + /** + * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom output tensors are provided, the output tensors will be automatically created from the output ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the output tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the output tensors. + * + * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. 
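+ *
+ * @note As with `set_input_tensors`, this must be called before `initialize_op_nodes`. The
+ * expected sequence for a config object is: set the input/output tensors, `initialize_op_nodes`,
+ * `add_op_to_graph`, then `bind_input_tensors`/`bind_output_tensors` around each execution with
+ * the matching `unbind_*` calls afterwards.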
+ */ + virtual void set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; + + /** + * @brief Creates tensors and internal nodes for constructing the calculation graph. + * + * This pure virtual function is responsible for creating tensors on the given + * backend device, associating them with the provided graph handle, and creating + * the internal nodes necessary for constructing the calculation graph. It takes + * input and output tensor arrays as parameters. + * + * @param device + * @param graph_handle + * @return true if tensors and nodes are successfully created, false otherwise. + */ + virtual bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) = 0; + + /** + * @brief Pure virtual function to retrieve the input tensors. + * + * This function must be overridden by derived classes to provide the specific implementation + * for retrieving the input tensors used in QNN operations. + * + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. + */ + virtual qnn_tensor_array_t & get_input_tensors() = 0; + + /** + * @brief Pure virtual function to retrieve the output tensors of a QNN. + * + * This function must be overridden by any derived class to provide access to the + * output tensors of the QNN. The function returns a reference to a vector of + * qnn_tensor_ptr_t objects, which represent the output tensors. + * + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. + */ + virtual qnn_tensor_array_t & get_output_tensors() = 0; + + /** + * @brief Adds an operation to the given graph. + * + * This pure virtual function must be implemented by derived classes to add + * a specific operation to the provided graph handle. + * + * This function will be called after `initialize_op_nodes` during initialization. + * + * @param graph_handle The handle to the graph where the operation will be added. + * @return true if the operation was successfully added to the graph, false otherwise. + */ + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + + /** + * @brief Binds the input tensors to the operation. + * + * This pure virtual function must be implemented by derived classes to bind + * the provided input tensors to the operation. The function takes a constant + * reference to a ggml_tensor_array_t object, which contains the input tensors + * to be bound. + * + * @param tensor_inputs A constant reference to a ggml_tensor_array_t object + * containing the input tensors. + * @return true if the input tensors were successfully bound, false otherwise. + */ + virtual bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) = 0; + + /** + * @brief Binds the output tensors to the given tensor array. + * + * This pure virtual function must be implemented by derived classes to bind + * the output tensors to the provided array of tensors. The function is expected + * to establish the necessary connections or mappings between the output tensors + * and the elements of the given tensor array. + * + * @param tensor_outputs A constant reference to an array of ggml tensors that + * represent the output tensors to be bound. + * @return true if the binding is successful, false otherwise. + */ + virtual bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) = 0; + + /** + * @brief Unbinds the input tensors from the operation. 
+ * + * This pure virtual function is intended to be overridden by derived classes + * to implement the logic for unbinding or detaching input tensors that were + * previously bound to the operation. This is typically used to release resources + * or reset the state of the operation. + */ + virtual void unbind_input_tensors() = 0; + + /** + * @brief Unbinds the output tensors. + * + * This pure virtual function is responsible for unbinding or detaching + * the output tensors from their current bindings. Implementations of this + * function should ensure that any resources or references held by the + * output tensors are properly released or reset. + */ + virtual void unbind_output_tensors() = 0; +}; + +using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_array_t = std::vector; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp new file mode 100644 index 0000000000000..d5b55eff970c9 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/op-config-caps.cpp @@ -0,0 +1,437 @@ + +#include "op-config-impl.hpp" + +namespace { + +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, + std::shared_ptr); + +using op_description_generator_t = void (*)(const ggml_tensor * op, bool append_dimensions, + ggml_type override_data_type, std::string & output); + +void append_tensor_shape_and_type_impl(const ggml_tensor * tensor, ggml_type override_data_type, std::string & output) { + char buffer[256] = {}; + const auto * type_name = qnn::get_ggml_type_name(std::min(tensor->type, override_data_type)); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], (long) tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); + output.append(buffer, len); +} + +void get_graph_key_from_op(const ggml_tensor * op, ggml_type override_data_type, std::string & output) { + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!src) { + break; + } + + output += '_'; + append_tensor_shape_and_type_impl(src, override_data_type, output); + } +} + +void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_SRC && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} + +void generic_get_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { + if (append_dimensions) { + get_graph_key_from_op(op, override_data_type, output); + } else { + get_op_key_with_src_op_desc(op, output); + } +} + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + op_description_generator_t get_desc = nullptr; + const char * qnn_param_name = nullptr; +}; + +constexpr const qnn_op_caps_t kOpCaps[] = { + {}, // 
GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + { + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + }, + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + }, + { + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + }, + {}, // GGML_OP_SQR + { + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + }, + { + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + }, + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + { + // GGML_OP_RMS_NORM + QNN_OP_RMS_NORM, // qnn_op_name + generic_get_op_desc, // get_desc + QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name + }, + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + {}, // GGML_OP_L2_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + { + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + }, + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + + {}, // GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN + {}, // GGML_OP_RWKV_WKV7 + + {}, // GGML_OP_UNARY + + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + + {}, // GGML_OP_CUSTOM + + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + { + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + }, + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function"); +static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table"); 
+static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].qnn_op_name, + "GGML_UNARY_OP_GELU does not have qnn_op_name in the kOpCaps table"); +static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpCaps table"); + +std::shared_ptr mat_mul_op_constructor(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { + if (qnn_instance->has_custom_op_package() && ggml_n_dims(op) == 2) { + QNN_LOG_DEBUG("create GgmlMulMat, name %s, use GgmlOpPackage\n", instance_name.c_str()); + return std::make_shared(instance_name, "GgmlOpPackage", "GgmlMulMat", + qnn_instance); + } + + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + return std::make_shared(instance_name, qnn_instance); +} + +template +std::shared_ptr generic_op_constructor(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { + GGML_UNUSED(op); + static_assert(_op < std::size(kOpCaps)); + static_assert(kOpCaps[_op].qnn_op_name != nullptr); + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + kOpCaps[_op].qnn_op_name, qnn_instance); +} + +void add_type_parameters(std::shared_ptr op, const char * name, float value) { + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_FLOAT_32; + scalar.floatValue = value; + op->add_scalar_param(name, scalar); +} + +template +std::shared_ptr op_constructor_with_type_param(const ggml_tensor * op, + const std::string & instance_name, + qnn::qnn_instance_ptr qnn_instance) { + static_assert(std::is_base_of::value); + static_assert(_op < std::size(kOpCaps)); + + constexpr auto & op_caps = kOpCaps[_op]; + static_assert(op_caps.qnn_op_name != nullptr); + + _ggml_op_param_type op_param; + memcpy(&op_param, op->op_params, sizeof(op_param)); + auto qnn_op = std::make_shared<_qnn_op_type_name>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_caps.qnn_op_name, + qnn_instance); + if (op_caps.qnn_param_name) { + add_type_parameters(qnn_op, op_caps.qnn_param_name, op_param); + } + return qnn_op; +} + +constexpr const op_constructor_t kOpConstructors[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + generic_op_constructor, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + generic_op_constructor, // GGML_OP_SUB + generic_op_constructor, // GGML_OP_MUL + generic_op_constructor, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + generic_op_constructor, // GGML_OP_SQRT + generic_op_constructor, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + op_constructor_with_type_param, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_L2_NORM + + mat_mul_op_constructor, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + generic_op_constructor, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // 
GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN + nullptr, // GGML_OP_RWKV_WKV7 + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CUSTOM + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + nullptr, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); +static_assert(kOpConstructors[GGML_OP_ADD] == generic_op_constructor, + "GGML_OP_ADD does not match the generic_op_constructor function"); +static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, + "GGML_OP_MUL_MAT does not match the mat_mul_op_constructor function"); +static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpConstructors table"); + +} // namespace + +namespace qnn { + +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output) { + append_tensor_shape_and_type_impl(tensor, GGML_TYPE_COUNT, output); +} + +size_t get_qnn_op_index(const ggml_tensor * tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +const char * get_qnn_op_name(const ggml_tensor * op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op_index].qnn_op_name); + return kOpCaps[op_index].qnn_op_name; +} + +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto get_desc = kOpCaps[op_index].get_desc; + if (get_desc) { + get_desc(op, append_dimensions, override_data_type, output); + } else { + generic_get_op_desc(op, append_dimensions, override_data_type, output); + } +} + 
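+// Illustrative lookup sketch (not part of the generated tables above): regular ops index the
+// tables directly by their ggml_op value, while unary ops are folded in after the regular ops
+// via kGgmlUnaryOpStart (== GGML_OP_COUNT), which keeps kOpCaps and kOpConstructors parallel.
+// For a hypothetical node `t`:
+//   t->op == GGML_OP_ADD:
+//     get_qnn_op_index(t)                  == GGML_OP_ADD
+//     kOpCaps[GGML_OP_ADD].qnn_op_name     == QNN_OP_ELEMENT_WISE_ADD
+//     kOpConstructors[GGML_OP_ADD]         == generic_op_constructor<GGML_OP_ADD>
+//   t->op == GGML_OP_UNARY with ggml_get_unary_op(t) == GGML_UNARY_OP_GELU:
+//     get_qnn_op_index(t)                  == kGgmlUnaryOpStart + GGML_UNARY_OP_GELU
+//     kOpCaps[get_qnn_op_index(t)].qnn_op_name == QNN_OP_GELU
+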
+std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, + qnn_instance_ptr qnn_instance) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto op_constructor = kOpConstructors[op_index]; + GGML_ASSERT(op_constructor); + return op_constructor(op, name, qnn_instance); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/qnn/op-config-impl.cpp new file mode 100644 index 0000000000000..e546da4929c77 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/op-config-impl.cpp @@ -0,0 +1,444 @@ +#include "op-config-impl.hpp" + +#include + +#include "logger.hpp" + +namespace { + +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t & dimensions, int rank) { + qnn::qnn_dimension_array_t transposed_dims = dimensions; + if (rank >= 2) { + transposed_dims[rank - 1] = dimensions[rank - 2]; + transposed_dims[rank - 2] = dimensions[rank - 1]; + } + + return transposed_dims; +} + +int get_rank(const qnn::ggml_tensor_array_t & tensor_inputs, const qnn::ggml_tensor_array_t & tensor_outputs) { + return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); +} + +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t & tensors) { + Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; + for (auto tensor : tensors) { + auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); + GGML_ASSERT(tensor_type_size > 0); + if (tensor_type_size > qnn::qnn_datatype_size(type)) { + type = tensor->get_data_type(); + } + } + + return type; +} + +} // namespace + +namespace qnn { + +void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _qnn_parameters.push_back(param); +} + +bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, + int rank, const uint8_t * data, const Qnn_DataType_t data_type, + backend_index_type device, Qnn_GraphHandle_t graph_handle) { + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + for (int i = 0; i < rank; i++) { + data_size *= dimensions[i]; + } + + GGML_ASSERT(data_size > 0); + if (!param_tensor->set_data_buffer(data, data_size)) { + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + return false; + } + + if (!param_tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); + return false; + } + + _tensor_parameters.push_back(param_tensor); + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); + param.tensorParam = param_tensor->get_qnn_tensor(); + _qnn_parameters.push_back(param); + return true; +} + +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { + _tensor_inputs = 
std::move(tensor_inputs); + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { + _tensor_outputs = tensor_outputs; + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + +bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); + + for (size_t i = 0; i < _tensor_inputs.size(); i++) { + auto tensor = _tensor_inputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + + QNN_LOG_DEBUG("[%s]input tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); + _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); + } + + for (size_t i = 0; i < _tensor_outputs.size(); i++) { + auto tensor = _tensor_outputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + + QNN_LOG_DEBUG("[%s]output tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); + _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); + } + + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s][%s]qnn_graph_add_node.error: %s\n", _name.c_str(), _package_name.c_str(), + _op_type.c_str(), get_qnn_error_string(error)); + return false; + } + + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); + return true; +} + +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + +void ggml_qnn_op_config_base::unbind_input_tensors() { + for (auto & tensor : _tensor_inputs) { + tensor->unbind(); + } +} + +void ggml_qnn_op_config_base::unbind_output_tensors() { + for (auto & tensor : _tensor_outputs) { + tensor->unbind(); + } +} + +Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { + GGML_ASSERT(_qnn_parameters.size() == _param_names.size()); + + for (size_t i = 0; i < _qnn_parameters.size(); i++) { + _qnn_parameters[i].name = _param_names[i].c_str(); + } + + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto & op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t) _qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t) _qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t) _qnn_tensor_outputs.size(); + 
op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; +} + +bool ggml_qnn_single_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); + return true; +} + +bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { + constexpr const uint32_t kAxes[] = { 0 }; + add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast(kAxes), + QNN_DATATYPE_UINT_32, device, graph_handle); + return true; +} + +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { + _tensor_inputs = tensor_inputs; +} + +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); +} + +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { + _tensor_outputs = tensor_outputs; +} + +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); +} + +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { + return qnn::bind_tensors(tensor_inputs, _tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { + return qnn::bind_tensors(tensor_outputs, _tensor_outputs); +} + +bool ggml_qnn_matmul_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_tensor_inputs.size() == 2); + GGML_ASSERT(_tensor_outputs.size() == 1); + + // create convert nodes + const auto tensor_rank = _tensor_inputs.front()->get_rank(); + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + auto tensor_type = create_input_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs); + + mat_mul_tensor_inputs.front() = + create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), + mat_mul_tensor_inputs.back()->get_dimensions()); + + if (device != QNN_BACKEND_GPU && _tensor_outputs.front()->get_data_type() != tensor_type) { + auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank, tensor_type, _tensor_outputs); + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + + _operations.push_back(convert_out); + } else { + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, _tensor_outputs)) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + } + + return true; +} + +qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, + qnn_dimension_array_t output_dimensions) { + if (rank <= 2) { + return tensor_input; + } + + const auto & input_dimensions = tensor_input->get_dimensions(); + output_dimensions[rank - 1] = input_dimensions[rank - 1]; + output_dimensions[rank - 2] = input_dimensions[rank - 2]; + + const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3]; + if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) { + return tensor_input; + } + + // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] + constexpr const auto create_node = + [](const std::string & name, const int rank, 
const int axis, const qnn_dimension_array_t & dimensions, + qnn_tensor_ptr_t tensor_input, backend_index_type device, Qnn_GraphHandle_t graph_handle, + qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { + auto gather_out = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, + tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + qnn_instance); + + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_INT_32; + scalar.int32Value = axis; + gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar); + gather_op->set_output_tensors({ gather_out }); + + // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], + // by repeating each index [scale] times. + const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; + curr < end; curr++) { + *curr = uint32_t((curr - reinterpret_cast(index_buffer->get_buffer())) / scale); + } + + auto gather_index = std::make_shared( + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{ dimensions[axis] }, + QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); + gather_index->set_data_buffer(index_buffer); + gather_op->set_input_tensors({ tensor_input, gather_index }); + + tensor_output = gather_out; + return gather_op; + }; + + qnn_dimension_array_t intermediate_dimensions = input_dimensions; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + qnn_tensor_ptr_t gather0_out; + _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out)); + if (rank == 3) { + return gather0_out; + } + + qnn_tensor_ptr_t gather1_out; + _operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, + graph_handle, _qnn_instance, gather1_out)); + return gather1_out; +} + +Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs) { + if (device == QNN_BACKEND_GPU) { + // there's no convert op for GPU, so we should create matmul nodes directly. 
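+ // Returning QNN_DATATYPE_UNDEFINED is safe here: initialize_op_nodes also skips the
+ // output convert path whenever device == QNN_BACKEND_GPU, so the returned type is never
+ // used to pick an output data type on the GPU path.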
+ return QNN_DATATYPE_UNDEFINED; + } + + // create tensors for convert node + auto tensor_type = get_tensor_type(tensor_inputs); + for (size_t i = 0; i < tensor_inputs.size(); ++i) { + // create input convert nodes + auto convert_in = tensor_inputs[i]; + if (convert_in->get_data_type() == tensor_type) { + continue; + } + + std::string convert_name("convert_src" + std::to_string(i)); + auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", + convert_in->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); + convert->set_input_tensors({ convert_in }); + convert->set_output_tensors({ convert_out }); + tensor_inputs[i] = convert_out; + _operations.push_back(convert); + } + + return tensor_type; +} + +qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(backend_index_type device, + Qnn_GraphHandle_t graph_handle, + const int rank, Qnn_DataType_t tensor_type, + qnn_tensor_array_t & tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == 1); + // create output convert node + std::string convert_name("convert_dst"); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + tensor_outputs.front()->get_dimensions(), tensor_type, rank, + device, graph_handle, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); + output_convert->set_input_tensors({ convert_in }); + output_convert->set_output_tensors(tensor_outputs); + return output_convert; +} + +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, + qnn_tensor_array_t & tensor_outputs) { + /* + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please refer to: + * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + * + * Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md + * Given 2 matrices A and B, the matrix multiplication C = A * B is defined as: + * ```python + * import torch + * # Create two matrices + * A = torch.tensor([ + * [2, 8], + * [5, 1], + * [4, 2], + * [8, 6], + * ]) + * B = torch.tensor([ + * [10, 5], + * [9, 9], + * [5, 4], + * ]) + * # Perform matrix multiplication + * C = torch.matmul(A, B.T) + * print(C.T) + * ``` + * Here, the B.T is the transpose of B. + * So C.T = A * B.T which is equivalent to C = B * A.T. 
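+ * As an illustrative shape check for the 2-D case (as noted above, the qnn dimensions are
+ * the ggml ne[] reversed):
+ *   src0: ne = [k, n] -> qnn dims [n, k]
+ *   src1: ne = [k, m] -> qnn dims [m, k]
+ *   dst : ne = [n, m] -> qnn dims [m, n]
+ * Feeding MatMul with in0 = src1 ([m, k]), in1 = src0 ([n, k]) and transpose_in1 = true
+ * gives [m, k] x [k, n] = [m, n], which is exactly the qnn shape of dst, so no extra
+ * transpose node is needed.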
+ * See: https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md + * + * So here we need to create graph like: + * ```mermaid + * graph TD; + * i1>ggml_tensor_in1] --src0--> mat_mul0; + * i2>ggml_tensor_in0] --src1.T--> mat_mul0; + * mat_mul0 --dst0--> o1>ggml_tensor_out]; + * ``` + */ + + // create src0_trans tensor + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); + + // create mat_mul + auto mat_mul = + std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); + + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; + mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); + + // set tensor to mat_mul + mat_mul->set_input_tensors({ tensor_inputs[1], tensor_inputs[0] }); + mat_mul->set_output_tensors(tensor_outputs); + + _operations.push_back(mat_mul); + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/qnn/op-config-impl.hpp new file mode 100644 index 0000000000000..36de66858acb6 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/op-config-impl.hpp @@ -0,0 +1,162 @@ +#pragma once + +#include +#include +#include +#include + +#include "op-config.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { + public: + explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name, + const std::string & op_type, qnn_instance_ptr qnn_instance) : + _name(name), + _package_name(package_name), + _op_type(op_type), + _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank, + const uint8_t * data, const Qnn_DataType_t data_type, backend_index_type device, + Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + + protected: + Qnn_OpConfig_t get_op_config(); + + std::string _name; + std::string _package_name; + std::string _op_type; + qnn_instance_ptr _qnn_instance; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; + + DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { + public: + explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, 
qnn_instance_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; + + private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); +}; + +class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { + public: + explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, qnn_instance_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; + + private: + DISABLE_COPY(ggml_qnn_rmsnorm_op_config); + DISABLE_MOVE(ggml_qnn_rmsnorm_op_config); +}; + +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { + public: + explicit ggml_qnn_aggregate_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : + _name(name), + _qnn_instance(qnn_instance) {} + + ~ggml_qnn_aggregate_op_config() { + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + + void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + return qnn::add_op_to_graph(graph_handle, _operations); + } + + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; + + void unbind_input_tensors() override { + for (auto & tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto & tensor : _tensor_outputs) { + tensor->unbind(); + } + } + + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + + protected: + std::string _name; + qnn_instance_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + + private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + +class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { + public: + ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : + ggml_qnn_aggregate_op_config(name, qnn_instance) {} + + bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override; + + private: + qnn_tensor_ptr_t create_gather_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + Qnn_DataType_t create_input_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs); + qnn_op_config_ptr_t create_output_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, + const int rank, Qnn_DataType_t tensor_type, + qnn_tensor_array_t & tensor_outputs); + bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); + + DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +} // namespace qnn diff --git 
a/ggml/src/ggml-qnn/qnn/op-config.hpp b/ggml/src/ggml-qnn/qnn/op-config.hpp new file mode 100644 index 0000000000000..635a831a06c20 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/op-config.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +#include "op-config-base.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +// TODO: move to a better place +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output); + +size_t get_qnn_op_index(const ggml_tensor * tensor); +const char * get_qnn_op_name(const ggml_tensor * op); +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output); + +std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, + qnn_instance_ptr qnn_instance); + +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector & operations) { + for (auto & op : operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/profiler.cpp b/ggml/src/ggml-qnn/qnn/profiler.cpp new file mode 100644 index 0000000000000..5625c3acf7ebb --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/profiler.cpp @@ -0,0 +1,170 @@ + +#include "profiler.hpp" + +#include +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace { + +std::string get_duration_string(const QnnProfile_EventData_t & event_data) { + char time_str[128] = {}; + switch (event_data.unit) { + case QNN_PROFILE_EVENTUNIT_CYCLES: + snprintf(time_str, sizeof(time_str), "cycles: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_COUNT: + snprintf(time_str, sizeof(time_str), "count: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_BYTES: + snprintf(time_str, sizeof(time_str), "size: %lld bytes", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_MICROSEC: + { + double duration_ms = event_data.value / 1000.0; + snprintf(time_str, sizeof(time_str), "duration: %.3f ms", duration_ms); + } + break; + default: + break; + } + + return time_str; +} + +} // namespace + +namespace qnn { + +qnn_event_tracer::qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level) : + _interface(interface), + _prefix(prefix) { + QnnProfile_Level_t qnn_profile_level = 0; + switch (level) { + case sdk_profile_level::PROFILE_BASIC: + qnn_profile_level = QNN_PROFILE_LEVEL_BASIC; + break; + case sdk_profile_level::PROFILE_OP_TRACE: + case sdk_profile_level::PROFILE_DETAIL: + qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED; + break; + case sdk_profile_level::PROFILE_OFF: + default: + QNN_LOG_WARN("[profiler][%s]invalid profile level %d, using PROFILE_OFF\n", _prefix.c_str(), level); + return; + } + + auto error = _interface->qnn_profile_create(backend_handle, qnn_profile_level, &_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to create QNN profile_handle. 
Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _handle = nullptr; + return; + } + + if (level == sdk_profile_level::PROFILE_OP_TRACE) { + QnnProfile_Config_t qnn_profile_config = QNN_PROFILE_CONFIG_INIT; + qnn_profile_config.option = QNN_PROFILE_CONFIG_OPTION_ENABLE_OPTRACE; + std::array profile_configs = { &qnn_profile_config, nullptr }; + error = _interface->qnn_profile_set_config(_handle, profile_configs.data()); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to set QNN profile event. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _interface->qnn_profile_free(_handle); + _handle = nullptr; + return; + } + } + + QNN_LOG_DEBUG("[profiler][%s]created, Backend ID %u, level %d\n", _prefix.c_str(), _interface->get_backend_id(), + level); +} + +qnn_event_tracer::~qnn_event_tracer() { + if (_handle) { + Qnn_ErrorHandle_t error = _interface->qnn_profile_free(_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to free QNN profile_handle. Backend ID %u, error %ld\n", + _prefix.c_str(), _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + } + _handle = nullptr; + } +} + +void qnn_event_tracer::print_profile_events() { + const QnnProfile_EventId_t * events_ptr = nullptr; + uint32_t num_events = 0; + auto error = _interface->qnn_profile_get_events(_handle, &events_ptr, &num_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile events. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + return; + } + + if (!num_events) { + QNN_LOG_INFO("[profiler][%s]no QNN profile events\n", _prefix.c_str()); + return; + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events start ----------------\n", _prefix.c_str()); + // see also: https://github.com/pytorch/executorch/blob/0ccf5093823761cf8ad98c75e5fe81f15ea42366/backends/qualcomm/runtime/backends/QnnProfiler.cpp#L73 + QnnProfile_EventData_t event_data; + for (uint32_t i = 0; i < num_events; ++i) { + error = _interface->qnn_profile_get_event_data(events_ptr[i], &event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile event data. Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + const QnnProfile_EventId_t * sub_events_ptr = nullptr; + uint32_t num_sub_events = 0; + error = _interface->qnn_profile_get_sub_events(events_ptr[i], &sub_events_ptr, &num_sub_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile sub events. 
Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + auto duration = get_duration_string(event_data); + if (!num_sub_events) { + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + continue; + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, sub_count: %d, start -------------\n", _prefix.c_str(), i, + event_data.identifier, num_sub_events); + QnnProfile_EventData_t sub_event_data; + for (std::uint32_t j = 0; j < num_sub_events; ++j) { + error = _interface->qnn_profile_get_event_data(sub_events_ptr[j], &sub_event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR( + "[profiler][%s]failed to get QNN profile sub event data. Backend ID %u, event[%d], sub_event[%d], " + "error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, j, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + if (sub_event_data.type != QNN_PROFILE_EVENTTYPE_NODE) { + QNN_LOG_DEBUG("[profiler][%s]sub_event[%d]%s, type %d, skipping\n", _prefix.c_str(), j, + sub_event_data.identifier, sub_event_data.type); + continue; + } + + auto sub_duration = get_duration_string(sub_event_data); + QNN_LOG_INFO("[profiler][%s]sub_event[%d]: %s, %s\n", _prefix.c_str(), j, sub_event_data.identifier, + sub_duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s, end --------------\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events end -----------------\n", _prefix.c_str()); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp new file mode 100644 index 0000000000000..34db09e0bf865 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/profiler.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "logger.hpp" +#include "qnn-types.hpp" + +namespace qnn { + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + +class qnn_scoped_timer { + public: + qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { + _begin_us = ggml_time_us(); + } + + qnn_scoped_timer(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + ~qnn_scoped_timer() { print(); } + + void operator=(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + void print() const { + auto duration = (ggml_time_us() - _begin_us) / 1000.0; + QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration); + } + + + private: + int64_t _begin_us = 0LL; + std::string _log_prefix; + + qnn_scoped_timer(const qnn_scoped_timer &) = delete; + void operator=(const qnn_scoped_timer &) = delete; +}; + +inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) { + va_list args; + va_start(args, format); + char buffer[4096]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return qnn_scoped_timer(buffer); +} + +#else + +inline void make_scope_perf_timer(const char *, ...) 
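+/* no-op fallback used when GGML_QNN_ENABLE_PERFORMANCE_TRACKING is not defined */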
{} + +#endif + +// forward declaration of qnn_interface +class qnn_interface; + +class qnn_event_tracer { + public: + // ref: + // https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices + enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; + + explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level); + ~qnn_event_tracer(); + + Qnn_ProfileHandle_t get_handle() const { return _handle; } + + void print_profile_events(); + + private: + std::shared_ptr _interface; + Qnn_ProfileHandle_t _handle = nullptr; + std::string _prefix; + + DISABLE_COPY(qnn_event_tracer); + DISABLE_MOVE(qnn_event_tracer); +}; + +using qnn_event_tracer_ptr = std::shared_ptr; + +} // namespace qnn + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) +#else +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp new file mode 100644 index 0000000000000..12e94aaac747c --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -0,0 +1,580 @@ + +#include "qnn-lib.hpp" + +#include + +#include "common.hpp" +#include "rpc-mem.hpp" + +#if defined(__linux__) +# include +#endif + +namespace { + +#ifdef _WIN32 +# define PLATFORM_LIB_FILENAME(name) (name ".dll") +#else +# define PLATFORM_LIB_FILENAME(name) ("lib" name ".so") +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) // TODO: check for other platforms +# define PLATFORM_LIB_POSFIX "_aarch64" +#else +# define PLATFORM_LIB_POSFIX "_x64" +#endif + +constexpr const char * kQnnSystemLibName = PLATFORM_LIB_FILENAME("QnnSystem"); +constexpr const char * kQnnCpuLibName = PLATFORM_LIB_FILENAME("QnnCpu"); +constexpr const char * kQnnGpuLibName = PLATFORM_LIB_FILENAME("QnnGpu"); +constexpr const char * kQnnNpuLibName = PLATFORM_LIB_FILENAME("QnnHtp"); +constexpr const char * kQnnCpuPackageLibName = PLATFORM_LIB_FILENAME("QnnGgmlOpPackage" PLATFORM_LIB_POSFIX); + +constexpr const qnn::device_caps kDeviceCaps[] = { + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32), + 0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu + 0, // 0 for no limitation + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), + // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu + 0xFFFFFE, (128256L * 4096 * + sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), + (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << 
GGML_TYPE_Q8_K), + (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value + }, +}; + +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == QNN_BACKEND_COUNT, + "The number of qnn devices should be equal to QNN_BACKEND_COUNT"); +static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The GPU device should be an GPU device"); +static_assert( + kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The CPU device should be an accelerator device"); // we treat qnn-cpu as a supplementary accelerator device +static_assert(GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q8_K == 15, "The quantized type order is not correct"); + +void insert_path(std::string & path, std::string insert_path, const char separator = ':') { + if (!insert_path.empty() && !path.empty()) { + insert_path += separator; + } + + path.insert(0, insert_path); +} + +// TODO: Fix this for other platforms, or use a more portable way to set the library search path +bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) { +#if defined(__linux__) + { + auto * original = getenv("LD_LIBRARY_PATH"); + std::string lib_search_path = original ? original : ""; + insert_path(lib_search_path, + "/vendor/dsp/cdsp:/vendor/lib64:" + "/vendor/dsp/dsp:/vendor/dsp/images"); + insert_path(lib_search_path, custom_lib_search_path); + if (setenv("LD_LIBRARY_PATH", lib_search_path.c_str(), 1)) { + return false; + } + } + +# if defined(__ANDROID__) || defined(ANDROID) + { + // See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html + std::string adsp_lib_search_path = custom_lib_search_path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (setenv("ADSP_LIBRARY_PATH", adsp_lib_search_path.c_str(), 1)) { + return false; + } + + QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH\n")); + } +# endif + + QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH\n")); +#else + (void) custom_lib_search_path; +#endif + + return true; +} + +common::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) { + std::filesystem::path full_path(load_directory); + full_path /= std::filesystem::path(lib_path).filename(); + auto handle = common::dl_load(full_path.string()); + if (!handle) { + QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str()); + handle = common::dl_load(lib_path); + } + + return handle; +} + +struct op_package_lib_info { + const char * lib_name; + const char * interface; + const char * type; + size_t htp_arch; + const char * extra_lib_name = nullptr; +}; + +const op_package_lib_info & get_op_package_lib_info(uint32_t soc_model, size_t htp_arch) { + constexpr static const op_package_lib_info kOpPackageLibInfo[] = { + { kQnnCpuPackageLibName, "GgmlOpPackageInterfaceProvider", "CPU", qnn::NONE, + PLATFORM_LIB_FILENAME("HtpPrepare") }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v68"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V68 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v69"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V69 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v73"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V73 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v75"), 
"GgmlOpPackageInterfaceProvider", "HTP", qnn::V75 }, + { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v79"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V79 }, + }; + + if (soc_model == qnn::UNKNOWN || soc_model == qnn::EMULATOR_X64 || soc_model == qnn::EMULATOR_AARCH64) { + return kOpPackageLibInfo[0]; + } + + switch (htp_arch) { + case qnn::V68: + static_assert(kOpPackageLibInfo[1].htp_arch == qnn::V68); + return kOpPackageLibInfo[1]; + case qnn::V69: + static_assert(kOpPackageLibInfo[2].htp_arch == qnn::V69); + return kOpPackageLibInfo[2]; + case qnn::V73: + static_assert(kOpPackageLibInfo[3].htp_arch == qnn::V73); + return kOpPackageLibInfo[3]; + case qnn::V75: + static_assert(kOpPackageLibInfo[4].htp_arch == qnn::V75); + return kOpPackageLibInfo[4]; + case qnn::V79: + default: + static_assert(kOpPackageLibInfo[5].htp_arch == qnn::V79); + return kOpPackageLibInfo[5]; + } +} + +} // namespace + +namespace qnn { + +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, + common::dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), + _lib_handle(lib_handle) { + qnn_system_context_create(&_qnn_system_handle); + if (_qnn_system_handle) { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } else { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } +} + +qnn_system_interface::~qnn_system_interface() { + if (_qnn_system_handle) { + if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + } else { + QNN_LOG_WARN("system handle is null\n"); + } + + if (_lib_handle) { + if (!common::dl_unload(_lib_handle)) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", common::dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null\n"); + } +} + +qnn_instance::qnn_instance(const std::string & lib_path, backend_index_type device) : + _additional_lib_load_path(lib_path) { + _backend_lib_name = kDeviceCaps[device].lib_name; + if (set_qnn_lib_search_path(lib_path)) { + QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); + } else { + QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed\n", _backend_lib_name.c_str()); + } +} + +bool qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qnn_init\n"); + + std::lock_guard lock(_init_mutex); + if (load_system() != 0) { + QNN_LOG_WARN("failed to load QNN system lib\n"); + return false; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _backend_lib_name; + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { + if (!load_backend(backend_lib_path, saver_config)) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return false; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return false; + } + + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (!_qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("failed to initialize qnn log\n"); + return false; + } else { + 
QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface->qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (!_qnn_backend_handle) { + QNN_LOG_WARN("failed to initialize qnn backend\n"); + return false; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + switch (qnn_status) { + case QNN_PROPERTY_NOT_SUPPORTED: + QNN_LOG_WARN("device property is not supported\n"); + break; + case QNN_PROPERTY_ERROR_UNKNOWN_KEY: + QNN_LOG_WARN("device property is unknown\n"); + break; + } + + { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + if (qnn_status == QNN_SUCCESS) { + QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("soc_model:%s(%s), htp_arch:%s(%d), vtcm_size:%d MB\n", + get_chipset_desc(chipinfo.socModel), get_chipset_model(chipinfo.socModel), + get_htparch_desc(htp_arch), (int) htp_arch, (int) chipinfo.vtcmSize); + } + + if (p_info->v1.numHwDevices) { + QnnDevice_DeviceInfoExtension_t devinfo = infos[p_info->v1.numHwDevices - 1].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); + } else { + // For emulator, we can't get platform info + QNN_LOG_INFO("failed to get platform info, emulator or cpu backend?\n"); +#if defined(__aarch64__) || defined(_M_ARM64) + _soc_info = { EMULATOR_AARCH64, NONE, 0 }; +#elif defined(__x86_64__) || defined(__amd64__) || defined(_M_X64) + _soc_info = { EMULATOR_X64, NONE, 0 }; +#else + _soc_info = { UNKNOWN, NONE, 0 }; +#endif + } + } + + { + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + } + + { + auto rpc_mem = std::make_unique(); + if (rpc_mem->is_valid()) { + _rpc_mem = std::move(rpc_mem); + } + } + + { + auto & op_package_info = get_op_package_lib_info(_soc_info.soc_model, _soc_info.htp_arch); + if (op_package_info.extra_lib_name) { + _custom_op_extra_lib_handle = + load_lib_with_fallback(op_package_info.extra_lib_name, _additional_lib_load_path); + } + + qnn_status = _qnn_interface->qnn_backend_register_op_package(_qnn_backend_handle, op_package_info.lib_name, + op_package_info.interface, op_package_info.type); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register op package %s, interface: %s, error: %s\n", 
op_package_info.lib_name, + op_package_info.interface, qnn::get_qnn_error_string(qnn_status)); + } else { + QNN_LOG_DEBUG("register op package %s successfully, ID %u\n", op_package_info.lib_name, + _qnn_interface->get_backend_id()); + _has_custom_op_package = true; + } + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (!_qnn_context_handle) { + QNN_LOG_WARN("failed to initialize qnn context\n"); + return false; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + if (init_htp_perfinfra() != 0) { + QNN_LOG_WARN("initialize HTP performance failure\n"); + } + if (set_rpc_polling() != 0) { + QNN_LOG_WARN("set RPC polling failure\n"); + } + if (set_high_performance_mode() != 0) { + QNN_LOG_WARN("set HTP high performance mode failure\n"); + } + } + + QNN_LOG_DEBUG("leave qnn_init\n"); + return true; +} + +bool qnn_instance::qnn_finalize() { + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (_qnn_context_handle) { + auto error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (_qnn_device_handle) { + auto error = _qnn_interface->qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (_qnn_backend_handle) { + auto error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (_qnn_log_handle) { + auto error = _qnn_interface->qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + if (_custom_op_extra_lib_handle) { + common::dl_unload(_custom_op_extra_lib_handle); + } + + unload_backend(); + + _qnn_sys_interface.reset(); + + _rpc_mem.reset(); + + return true; +} + +int qnn_instance::load_system() { + QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName); + auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, common::dl_error()); + return 1; + } + + auto * get_providers = common::dl_sym_typed( + system_lib_handle, "QnnSystemInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", common::dl_error()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + Qnn_ErrorHandle_t error = 
get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); + return 3; + } + + QNN_LOG_DEBUG("num_providers: %d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", (int) num_providers, (int) _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_DEBUG("find a valid qnn system interface\n"); + } + + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if (!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface\n"); + return 7; + } + + _qnn_sys_interface = qnn_sys_interface; + return 0; +} + +bool qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) { + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), common::dl_error()); + return false; + } + + auto get_providers = + common::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", common::dl_error()); + common::dl_unload(lib_handle); + return false; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + auto error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); + common::dl_unload(lib_handle); + return false; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + common::dl_unload(lib_handle); + return false; + } + + if (!provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + common::dl_unload(lib_handle); + return false; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + common::dl_unload(lib_handle); + return false; + } else { + QNN_LOG_DEBUG("find a valid qnn interface\n"); + } + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + 
QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + if (!common::dl_unload(_loaded_lib_handle[backend_id])) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], common::dl_error()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return true; +} + +void qnn_instance::unload_backend() { + for (auto & it : _loaded_lib_handle) { + if (!common::dl_unload(it.second)) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, common::dl_error()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); +} + +const device_caps & get_device_caps(backend_index_type device) { + return kDeviceCaps[device]; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn/qnn-lib.hpp new file mode 100644 index 0000000000000..2e7c9339aa60c --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.hpp @@ -0,0 +1,459 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dyn-lib-loader.hpp" +#include "qnn-types.hpp" +#include "rpc-mem.hpp" +#include "utils.hpp" + +namespace qnn { + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= + +// TODO: fix this for other compilers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra-semi" +#pragma GCC diagnostic ignored "-Wpedantic" + +class qnn_system_interface { +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } + + public: + qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, common::dl_handler_t lib_handle); + ~qnn_system_interface(); + + bool is_valid() const { return _qnn_system_handle != nullptr; } + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + private: + qnn_system_interface(const qnn_system_interface &) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; + + const QnnSystemInterface_t _qnn_sys_interface = {}; + common::dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; +}; + +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } + + public: + qnn_interface(const QnnInterface_t & qnn_interface) : _qnn_interface(qnn_interface) {} + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + uint32_t get_backend_id() const { return _qnn_interface.backendId; } + + private: + qnn_interface(const qnn_interface &) = delete; + void operator=(const qnn_interface &) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; + + const QnnInterface_t _qnn_interface = {}; +}; + +#pragma GCC diagnostic pop + +using qnn_interface_ptr = std::shared_ptr; + +class qnn_instance { + public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, backend_index_type device); + + ~qnn_instance() {} + + bool qnn_init(const QnnSaver_Config_t ** saver_config); + bool qnn_finalize(); + + 
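+    // qnn_init() loads the QnnSystem and backend libraries, creates the log/backend/device/context
+    // handles and registers the custom op package; qnn_finalize() releases them in reverse order.
+    // Illustrative usage sketch (the library path below is a placeholder):
+    //
+    //     auto inst = std::make_shared<qnn_instance>("/path/to/qnn/libs", QNN_BACKEND_NPU);
+    //     if (inst->qnn_init(nullptr)) {
+    //         auto iface = inst->get_qnn_interface();
+    //         // ... create graphs/tensors through iface ...
+    //         inst->qnn_finalize();
+    //     }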
qnn_interface_ptr get_qnn_interface() { + if (!_qnn_interface) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); + } + + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type\n", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + int set_rpc_polling() { + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + // use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + // use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_DEBUG("set htp perf ok\n"); + } + } else { + QNN_LOG_WARN("can't set htp perf\n"); + } + + return 0; + } + + int set_high_performance_mode() { + if (!_qnn_htp_perfinfra) { + QNN_LOG_WARN("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // 
true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + // set Bus Clock Parameters + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } else { + QNN_LOG_DEBUG("set htp high performance mode ok\n"); + } + + return 0; + } + + std::string & get_qnn_graph_name() { return _graph_name; } + + void * alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpc_mem) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _rpc_mem->alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes); + if (!buf) { + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20))); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _rpc_mem->free(buf); + } + + return aligned_buf; + } + + void free_rpcmem(void * buf) { + if (!_rpc_mem) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (_rpcmem_store_map.count(buf) == 0) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _rpc_mem->free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } + + int rpcmem_to_fd(void * buf) { + int fd = -1; + if (!_rpc_mem) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (_rpcmem_store_map.count(buf) == 0) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + buf = _rpcmem_store_map[buf]; + fd = _rpc_mem->to_fd(buf); + } + return fd; + } + + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) { + if (!p_data) { + QNN_LOG_WARN("invalid param\n"); + return nullptr; + } + + if (!_rpc_mem) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + if (is_rpcmem_registered(p_data)) { + QNN_LOG_WARN("rpc memory already registered\n"); + return _qnn_rpc_buffer_to_handles[p_data]; + } + + auto mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return nullptr; + } + + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + { rank, dimensions, nullptr }, + data_type, QNN_MEM_TYPE_ION, { { mem_fd } } + }; + Qnn_MemHandle_t handle = nullptr; + auto error = 
_qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", (int) QNN_GET_ERROR_CODE(error), + strerror(error)); + return nullptr; + } + + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); + return handle; + } + + void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", (int) QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), + [mem_handle](const auto & kv) { return kv.second == mem_handle; }); + if (it == _qnn_rpc_buffer_to_handles.end()) { + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + return; + } + + _qnn_rpc_buffer_to_handles.erase(it); + } + + bool is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0; } + + bool is_rpcmem_registered(void * buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } + + const qnn::qcom_socinfo & get_soc_info() { return _soc_info; } + + bool has_custom_op_package() const { return _has_custom_op_package; } + + private: + int load_system(); + bool load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/); + void unload_backend(); + + private: + static constexpr const int _required_num_providers = 1; + + std::string _additional_lib_load_path; + std::string _backend_lib_name; + BackendIdType _backend_id; + +#ifdef NDEBUG + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_INFO; // TODO: should we consider changing this dynamically? +#else + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; +#endif + + std::shared_ptr _qnn_sys_interface; + std::shared_ptr _qnn_interface; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + + std::unordered_map _qnn_rpc_buffer_to_handles; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + std::unique_ptr _rpc_mem; + std::unordered_map _rpcmem_store_map; + + std::string _graph_name; + + qnn::qcom_socinfo _soc_info = {}; + + bool _has_custom_op_package = false; + common::dl_handler_t _custom_op_extra_lib_handle = nullptr; +}; + +using qnn_instance_ptr = std::shared_ptr; + +struct device_caps { + const char * lib_name; + enum ggml_backend_dev_type type; + + // TODO: should we get this from device? + uint64_t supported_types; + + // TODO: should we merge this with supported_types? + uint64_t cpu_preprocess_types; + + // TODO: should we get this from device? 
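+    // Upper bound for a single tensor allocation on this device; 0 means no limit.
+    // Values are taken from the kDeviceCaps table in qnn-lib.cpp.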
+ size_t max_tensor_size_in_bytes; +}; + +const device_caps & get_device_caps(backend_index_type device); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn/qnn-types.hpp new file mode 100644 index 0000000000000..4fe3e9155b185 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/qnn-types.hpp @@ -0,0 +1,51 @@ + +#pragma once + +#include +#include +#include +#include +#include + +#include "common.hpp" + +namespace qnn { + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) +}; + +enum qcom_chipset { + UNKNOWN = 0, + EMULATOR_X64 = 0xFF00, // x86_64 emulator + EMULATOR_AARCH64 = 0xFF01, // ARM64 emulator + SM8350 = 30, // v68, SD 888/888+ + SM8450 = 36, // v69, SD 8 Gen 1 + SA8295 = 39, // v68 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM7675 = 70, // V73, SD 7+ Gen 3 + SM8635 = 68, // v73, SD 8s Gen 3 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +} // namespace qnn + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml/src/ggml-qnn/qnn/tensor.hpp b/ggml/src/ggml-qnn/qnn/tensor.hpp new file mode 100644 index 0000000000000..ef501135b5d86 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/tensor.hpp @@ -0,0 +1,443 @@ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "buffer.hpp" +#include "ggml-qnn.h" +#include "logger.hpp" +#include "qnn-lib.hpp" +#include "utils.hpp" + +namespace qnn { + +static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + +class ggml_qnn_tensor : public std::enable_shared_from_this { + public: + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; + + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, + backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : + _tensor_name(name), + _device(device), + _qnn_instance(qnn_instance), + _graph_handle(graph_handle) { + if (!_tensor_name.empty()) { + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + } + + _dimensions = dimensions; + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + update_params_from_ggml_tensor(tensor_type, data_type, rank); + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s\n", get_backend_name(device), + _tensor_name.c_str(), rank, (int) _dimensions[0], (int) _dimensions[1], (int) _dimensions[2], + (int) _dimensions[3], qnn_datatype_to_string(data_type)); + } + + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, + backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : + ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + + ~ggml_qnn_tensor() { + _rpc_buffer.reset(); + 
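+        // Release the RPC-backed storage first; unbind() below detaches the client buffer
+        // and clears the wrapped ggml tensor's `extra` pointer.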
unbind(); + } + + bool set_data_buffer(const uint8_t * buffer, const size_t buffer_size) { + auto qnn_buffer = std::make_shared(buffer, buffer_size); + if (bind_buffer_impl(qnn_buffer)) { + return true; + } + + _can_unbind = false; + return false; + } + + bool set_data_buffer(qnn_buffer_ptr buffer) { + if (bind_buffer_impl(buffer)) { + return true; + } + + _can_unbind = false; + return false; + } + + bool alloc_qnn_tensor_id() { + if (QNN_TENSOR_GET_ID(_qnn_tensor)) { + QNN_LOG_DEBUG("[%s]tensor already has a id: %d\n", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); + return true; + } + + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s]allocate id failed, error: %s\n", _tensor_name.c_str(), get_qnn_error_string(error)); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d\n", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); + return true; + } + + bool bind_ggml_tensor(ggml_tensor * tensor, qnn_buffer_ptr buffer) { + if (!_can_unbind) { + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str()); + return true; + } + +#ifndef NDEBUG + if (tensor->view_src) { + auto * src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", get_backend_name(_device), + tensor->name, (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], src->name, (int) src->ne[0], (int) src->ne[1], (int) src->ne[2], + (int) src->ne[3]); + } +#endif + + if (!buffer) { + buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + QNN_LOG_DEBUG("[%s][%s]attach buffer to tensor(%s), size: %d\n", get_backend_name(_device), + _tensor_name.c_str(), tensor->name, (int) buffer->get_size()); + } + if (!bind_buffer_impl(buffer)) { + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)\n", get_backend_name(_device), _tensor_name.c_str(), + ggml_get_name(tensor)); + tensor->extra = this; + _ggml_tensor = tensor; + return true; + } + + bool unbind() { + if (!_graph_handle) { + QNN_LOG_WARN("[%s]not bound to any graph\n", _tensor_name.c_str()); + return false; + } + + if (!_buffer) { + QNN_LOG_DEBUG("[%s]unbind to ggml tensor\n", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("[%s]read from qnn tensor failed\n", _tensor_name.c_str()); + return false; + } + + if (!_can_unbind) { + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind\n", _tensor_name.c_str()); + return true; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("[%s]clear client buffer\n", _tensor_name.c_str()); + } + + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) _buffer->get_buffer(), (int) _buffer->get_size()); + _buffer.reset(); + + if (_ggml_tensor) { + _ggml_tensor->extra = nullptr; + _ggml_tensor = nullptr; + } + + return true; + } + + const Qnn_Tensor_t & get_qnn_tensor() const { return _qnn_tensor; } + + 
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + + const qnn_dimension_array_t & get_dimensions() const { return _dimensions; } + + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } + + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + + const std::string & get_tensor_name() const { return _tensor_name; } + + private: + bool bind_buffer_impl(qnn_buffer_ptr buffer) { + if (_buffer) { + if (_buffer != buffer) { + QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); + return false; + } + + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); + return true; + } + + if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping\n", _tensor_name.c_str(), + (int) QNN_TENSOR_TYPE_NATIVE); + return true; + } + + if (should_use_mem_handle()) { + if (!_rpc_buffer) { + auto rpc_buffer = std::make_shared( + _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), + QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); + if (!rpc_buffer->is_valid()) { + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed\n", get_backend_name(_device), _tensor_name.c_str()); + return false; + } + + _rpc_buffer = std::move(rpc_buffer); + } + + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + auto mem_handle = _rpc_buffer->get_mem_handle(); + if (!mem_handle) { + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle\n", get_backend_name(_device), + _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); + QNN_LOG_DEBUG("[%s][%s]use mem handle %p\n", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) buffer->get_size() }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("[%s][%s]use client buffer %p size %d\n", get_backend_name(_device), _tensor_name.c_str(), + client_buf.data, (int) client_buf.dataSize); + } + + _buffer = buffer; + + if (!write_to_qnn_tensor()) { + QNN_LOG_WARN("[%s]write to qnn tensor failed\n", _tensor_name.c_str()); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) buffer->get_buffer(), (int) buffer->get_size()); + return true; + } + + bool write_to_qnn_tensor() { + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE\n", _tensor_name.c_str(), (int) tensor_type); + return true; + } + + if (_rpc_buffer) { + memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. 
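+            // Only tensors backed by an RPC (shared-memory) buffer need this explicit copy;
+            // raw client-buffer tensors point directly at the ggml tensor's data.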
+ QNN_LOG_DEBUG("[%s][%s]write buffer(%p) to rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); + } + + return true; + } + + bool read_from_qnn_tensor() { + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ\n", _tensor_name.c_str(), (int) tensor_type); + return true; + } + + if (_rpc_buffer) { + memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("[%s][%s]read buffer(%p) from rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); + } + + return true; + } + + void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) { + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); + // TODO: set the quantizeParams base on the tensor type + + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t) rank); + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + + Qnn_TensorType_t new_tensor_type; + switch (tensor_type) { + case INPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + break; + case OUTPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_READ; + break; + case PARAMETER: + new_tensor_type = QNN_TENSOR_TYPE_STATIC; + break; + case BIDIRECTION: + new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; + break; + case INTERMEDIATE: + default: + new_tensor_type = QNN_TENSOR_TYPE_NATIVE; + break; + } + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]new_tensor_type %s\n", get_backend_name(_device), _tensor_name.c_str(), + get_qnn_tensor_type_name(new_tensor_type)); + } + + bool should_use_mem_handle() const { + // TODO: figure out how to set rpc mem to multiple tensor + return false; + } + + std::string _tensor_name; + qnn_buffer_ptr _buffer; + bool _can_unbind = true; + backend_index_type _device; + qnn_instance_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + qnn_dimension_array_t _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_buffer_ptr _rpc_buffer; + ggml_tensor * _ggml_tensor = nullptr; + + DISABLE_COPY(ggml_qnn_tensor); + DISABLE_MOVE(ggml_qnn_tensor); +}; + +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; +using ggml_tensor_array_t = std::vector; + +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor * ggml_tensor) { + return ggml_tensor->extra ? 
reinterpret_cast(ggml_tensor->extra)->shared_from_this() : + qnn_tensor_ptr_t(); +} + +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) { + int max_rank = 0; + for (auto tensor : tensors) { + max_rank = std::max(max_rank, ggml_n_dims(tensor)); + } + + return max_rank; +} + +inline bool bind_tensors_with_custom_buffers(const ggml_tensor_array_t & ggml_tensors, + std::vector & buffers, + qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + GGML_ASSERT(buffers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto * ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, buffers[i])) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto * ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto * ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + } + + return true; +} + +inline void unbind_tensors(qnn_tensor_array_t & tensor_wrappers) { + for (auto & tensor : tensor_wrappers) { + tensor->unbind(); + } +} + +struct tensor_create_common_params { + const char * name_prefix; + int tensor_rank; + bool is_input; + backend_index_type device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params & params, + const ggml_tensor_array_t & ggml_tensors, + qnn_tensor_array_t * tensor_wrappers, + std::vector * qnn_tensors) { + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + + if (!tensor_wrappers->empty()) { + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors\n"); + GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); + return; + } + + tensor_wrappers->resize(ggml_tensors.size()); + + char buffer[GGML_MAX_NAME] = {}; + auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int) i); + auto * ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/utils.cpp b/ggml/src/ggml-qnn/qnn/utils.cpp new file mode 100644 index 0000000000000..8f3878aa03115 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/utils.cpp @@ -0,0 +1,467 @@ + +#include "utils.hpp" + +#include + +#include "ggml-qnn.h" +#include "qnn-types.hpp" +#include "QnnGraph.h" + +#ifdef _WIN32 +# include +#else +# include +# include +#endif + +namespace { + +template _Ty align_to_generic(size_t alignment, _Ty offset) { + return offset % alignment == 0 ? offset : + offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); +} + +} // namespace + +namespace qnn { + +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); + + qnn_dimension_array_t internal_dims = {}; + /* + * Both the ggml and qnn tensor in memory are stored as row-major format. + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + */ + for (uint32_t i = 0; i < rank; i++) { + internal_dims[i] = std::max((uint32_t) dims[rank - 1 - i], 1); + } + + return internal_dims; +} + +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offset_out) { + element_offset_out = 0; + + auto * parent_tensor = tensor; + while (parent_tensor->view_src) { + element_offset_out += parent_tensor->view_offs; + parent_tensor = parent_tensor->view_src; + } + + const auto rank = get_ggml_tensor_rank(tensor); + const auto parent_rank = get_ggml_tensor_rank(parent_tensor); + GGML_ASSERT(parent_tensor->type == tensor->type); + GGML_ASSERT(parent_rank == rank); + + const auto block_size = ggml_blck_size(tensor->type); + element_offset_out = + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + + return get_internal_dimension(parent_tensor->ne, parent_rank); +} + +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { + switch (ggml_type) { + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_I32: + return QNN_DATATYPE_INT_32; + case GGML_TYPE_I16: + return QNN_DATATYPE_INT_16; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; +} + +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return 
GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} + +size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; + } + return 0; +} + +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + + return "QNN_DATATYPE_UNDEFINED"; +} + +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + +const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +const char * get_backend_name(backend_index_type device) { + switch (device) { + case QNN_BACKEND_CPU: + return "qnn-cpu"; + case QNN_BACKEND_GPU: + return "qnn-gpu"; + case QNN_BACKEND_NPU: + return "qnn-npu"; + case QNN_BACKEND_COUNT: + default: + return "unknown"; + } +} + +const char * get_backend_desc(backend_index_type device) { + switch (device) { + case QNN_BACKEND_CPU: + return "CPU"; + case QNN_BACKEND_GPU: + return "Adreno GPU"; + case QNN_BACKEND_NPU: + return "Hexagon NPU"; + case QNN_BACKEND_COUNT: + default: + return "unknown"; + } +} + +const char * get_chipset_desc(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "Snapdragon 888/888+"; + case SM8450: + return "Snapdragon 8 Gen 1"; + case SM8475: + return "Snapdragon 8 Gen 1+"; + case SM8550: + return "Snapdragon 8 Gen 2"; + case SM7675: + return "Snapdragon 7+ Gen 3"; + case SM8635: + return "Snapdragon 8s Gen 3"; + case SM8650: + return "Snapdragon 8 Gen 3"; + case SM8750: + return "Snapdragon 8 Elite"; + case EMULATOR_AARCH64: + return "AArch64 Emulator"; + case EMULATOR_X64: + return "x86_64 Emulator"; + default: + return "unknown"; + } +} + +const char * get_chipset_model(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SA8295: + return "SA8295"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SSG2115P: + return "SSG2115P"; + case SM7675: + return "SM7675"; + case SM8635: + return "SM8635"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; + case EMULATOR_AARCH64: + return "AARCH64EMU"; + case EMULATOR_X64: + return "X64EMU"; + default: + return "unknown"; + } +} + +const char * get_htparch_desc(size_t htp_arch) { 
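+    // map the Hexagon HTP architecture version (V68..V79) to a human-readable name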
+ switch (htp_arch) { + case V68: + return "HTP_V68"; + case V69: + return "HTP_V69"; + case V73: + return "HTP_V73"; + case V75: + return "HTP_V75"; + case V79: + return "HTP_V79"; + default: + return "unknown"; + } +} + +intptr_t align_to(size_t alignment, intptr_t offset) { + return align_to_generic(alignment, offset); +} + +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) { + return (uint32_t) ggml_nbytes(tensor); +} + +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type) { + switch (type) { + case QNN_TENSOR_TYPE_APP_WRITE: + return "QNN_TENSOR_TYPE_APP_WRITE"; + case QNN_TENSOR_TYPE_APP_READ: + return "QNN_TENSOR_TYPE_APP_READ"; + case QNN_TENSOR_TYPE_APP_READWRITE: + return "QNN_TENSOR_TYPE_APP_READWRITE"; + case QNN_TENSOR_TYPE_STATIC: + return "QNN_TENSOR_TYPE_STATIC"; + case QNN_TENSOR_TYPE_NATIVE: + return "QNN_TENSOR_TYPE_NATIVE"; + case QNN_TENSOR_TYPE_UNDEFINED: + return "QNN_TENSOR_TYPE_UNDEFINED"; + case QNN_TENSOR_TYPE_NULL: + return "QNN_TENSOR_TYPE_NULL"; + default: + break; + } + + return "unknown"; +} + +#ifdef _WIN32 +static void * _align_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, alignment); +} + +static size_t _get_page_size() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +} + +void align_free(void * ptr) { + _aligned_free(ptr); +} +#else +static void * _align_alloc(size_t alignment, size_t size) { + return std::aligned_alloc(alignment, size); +} + +static size_t _get_page_size() { + return sysconf(_SC_PAGESIZE); +} + +void align_free(void * ptr) { + std::free(ptr); +} +#endif + +void * page_align_alloc(size_t size) { + const size_t alignment = _get_page_size(); + size_t size_aligned = align_to_generic(alignment, size); + void * data = _align_alloc(alignment, size_aligned); + if (!data) { + QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); + return nullptr; + } + + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); + return data; +} + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +const char * opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; +} + +const char * get_qnn_error_string(Qnn_ErrorHandle_t error) { + // A complete list of error codes can be found at here: + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html + thread_local static char error_code[128] = {}; + switch (error) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case 
QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + default: + if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) { + snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR)); + } else { + snprintf(error_code, sizeof(error_code), "%d", int(error)); + } + return error_code; + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/utils.hpp b/ggml/src/ggml-qnn/qnn/utils.hpp new file mode 100644 index 0000000000000..09596c4e6f6a4 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/utils.hpp @@ -0,0 +1,228 @@ +#pragma once + +#include +#include +#include +#include + +#include "common.hpp" +#include "ggml-qnn.h" +#include "ggml.h" +#include "logger.hpp" +#include "QnnTypes.h" + +#define QNN_TENSOR_VER(x) ((x).v1) + +namespace qnn { + +using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; + +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & 
element_offser_out); + +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); +const char * get_ggml_type_name(ggml_type type); +const char * get_backend_name(backend_index_type device); +const char * get_backend_desc(backend_index_type device); +const char * get_chipset_desc(uint32_t soc_model); +const char * get_chipset_model(uint32_t soc_model); +const char * get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor); +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type); + +void * page_align_alloc(size_t size); +void align_free(void * ptr); + +const char * opname_from_ggmlop(enum ggml_op ggmlop); + +const char * get_qnn_error_string(Qnn_ErrorHandle_t error); + +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; + +inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; + } + return tensor; +} + +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).id; + } + + return 0u; +} + +inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).name; + } + return nullptr; +} + +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).rank; + } + return 0u; +} + +inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dimensions; + } + return nullptr; +} + +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memHandle; + } + return nullptr; +} + +inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).id = id; + } +} + +inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).name 
= name; + } +} + +inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).type = type; + } +} + +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataFormat = format; + } +} + +inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataType = dataType; + } +} + +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).quantizeParams = params; + } +} + +inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).rank = rank; + } +} + +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dimensions = dims; + } +} + +inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memType = mem_type; + } +} + +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).clientBuf = client_buf; + } +} + +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memHandle = handle; + } +} + +inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t & tensor, uint8_t * isDynamicDimensions) { + if (tensor.version == QNN_TENSOR_VERSION_2) { + tensor.v2.isDynamicDimensions = isDynamicDimensions; + } +} + +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); + +} // namespace qnn + +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define 
QNN_TENSOR_SET_RANK(tensor, value)          qnn::set_qnn_tensor_rank(tensor, value)
+#define QNN_TENSOR_SET_DIMENSIONS(tensor, value)    qnn::set_qnn_tensor_dimensions(tensor, value)
+#define QNN_TENSOR_SET_MEM_TYPE(tensor, value)      qnn::set_qnn_tensor_memtype(tensor, value)
+#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value)    qnn::set_qnn_tensor_clientbuf(tensor, value)
+#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value)    qnn::set_qnn_tensor_memhandle(tensor, value)
+#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value)
diff --git a/ggml/src/ggml-qnn/shared/CMakeLists.txt b/ggml/src/ggml-qnn/shared/CMakeLists.txt
new file mode 100644
index 0000000000000..b901e656b9ee0
--- /dev/null
+++ b/ggml/src/ggml-qnn/shared/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
+
+add_library(runtime-common STATIC
+    ${common_srcs}
+)
+
+target_include_directories(runtime-common PUBLIC
+    ${CMAKE_CURRENT_LIST_DIR}/
+    ${CMAKE_CURRENT_LIST_DIR}/../
+    ${CMAKE_CURRENT_LIST_DIR}/../../
+    ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
+)
+
+if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
+    if(DEFINED ENV{HEXAGON_SDK_ROOT})
+        set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
+        message("found HEXAGON_SDK_ROOT, setting to ${HEXAGON_SDK_ROOT}")
+    else()
+        message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
+    endif()
+
+    target_include_directories(runtime-common PUBLIC
+        ${HEXAGON_SDK_ROOT}/incs/
+        ${HEXAGON_SDK_ROOT}/incs/stddef/
+        ${HEXAGON_SDK_ROOT}/incs/HAP/
+        ${HEXAGON_SDK_ROOT}/rtos/qurt/
+        ${HEXAGON_SDK_ROOT}/utils/examples/
+    )
+    target_compile_definitions(runtime-common PRIVATE
+        GGML_QNN_ENABLE_HEXAGON_BACKEND
+    )
+else()
+    message("GGML_QNN_ENABLE_HEXAGON_BACKEND disabled, not adding Hexagon SDK include directories")
+endif()
diff --git a/ggml/src/ggml-qnn/shared/common.cpp b/ggml/src/ggml-qnn/shared/common.cpp
new file mode 100644
index 0000000000000..d89a31c20ef39
--- /dev/null
+++ b/ggml/src/ggml-qnn/shared/common.cpp
@@ -0,0 +1,146 @@
+
+#include "common.hpp"
+
+#include
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml-qnn.h"
+
+#ifdef _WIN32
+#    include <windows.h>
+#else
+#    include <sys/sysinfo.h>
+#    include <unistd.h>
+#endif
+
+namespace {
+
+struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
+    std::vector<backend_device_proxy_ptr> device_proxies;
+    std::vector<ggml_backend_device>      devices;
+
+    explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i backend_iface) {
+        context = this;
+        iface   = backend_iface;
+
+        LOG_INFO("backend registry init\n");
+        for (size_t i = 0; i < TOTAL_BACKEND_COUNT; i++) {
+            const auto device_enum =
+                (backend_index_type) (TOTAL_BACKEND_COUNT - 1 - i);  // init from the last device, i.e.
NPU
+
+            backend_device_proxy_ptr device_proxy;
+            if (device_enum < QNN_BACKEND_COUNT) {
+#ifndef GGML_HEXAGON_NPU_ONLY
+                device_proxy = create_qnn_backend_context(device_enum);
+#else
+                LOG_DEBUG("skip qnn device %d\n", (int) device_enum);
+                continue;
+#endif
+            } else {
+#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND
+                device_proxy = create_hexagon_backend_context(device_enum);
+#else
+                LOG_DEBUG("skip hexagon device %d\n", (int) device_enum);
+                continue;
+#endif
+            }
+
+            if (!device_proxy) {
+                LOG_DEBUG("skip device %d\n", (int) device_enum);
+                continue;
+            }
+
+            devices.emplace_back(ggml_backend_device{
+                /* iface   = */ device_proxy->get_iface(),
+                /* reg     = */ this,
+                /* context = */ device_proxy->get_context(),
+            });
+
+            device_proxies.emplace_back(device_proxy);
+        }
+    }
+};
+
+const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    // TODO: should we use a different name?
+    return "qualcomm";
+}
+
+size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) {
+    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
+    return ctx->devices.size();
+}
+
+ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return &(ctx->devices[index]);
+}
+
+const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
+    /* .get_name         = */ ggml_backend_qnn_reg_get_name,
+    /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_qnn_reg_get_device,
+    /* .get_proc_address = */ nullptr,
+};
+
+}  // namespace
+
+ggml_backend_reg_t ggml_backend_qnn_reg() {
+    static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
+    return &reg;
+}
+
+namespace common {
+
+#ifdef _WIN32
+
+size_t get_system_total_memory_in_bytes() {
+    MEMORYSTATUSEX mem = {};
+    mem.dwLength = sizeof(mem);
+    if (GlobalMemoryStatusEx(&mem)) {
+        return mem.ullTotalPhys;
+    }
+
+    return 0;
+}
+
+size_t get_system_free_memory_in_bytes() {
+    MEMORYSTATUSEX mem = {};
+    mem.dwLength = sizeof(mem);
+    if (GlobalMemoryStatusEx(&mem)) {
+        return mem.ullAvailPhys;
+    }
+
+    return 0;
+}
+
+#else
+
+size_t get_system_total_memory_in_bytes() {
+    struct sysinfo info = {};
+    if (sysinfo(&info) == 0) {
+        return (info.totalram + info.totalswap) * info.mem_unit;
+    }
+
+    auto pages     = (size_t) sysconf(_SC_PHYS_PAGES);
+    auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
+    return pages * page_size;
+}
+
+size_t get_system_free_memory_in_bytes() {
+    struct sysinfo info = {};
+    if (sysinfo(&info) == 0) {
+        return (info.freeram + info.freeswap) * info.mem_unit;
+    }
+
+    auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES);
+    auto page_size   = (size_t) sysconf(_SC_PAGE_SIZE);
+    return avail_pages * page_size;
+}
+
+#endif
+
+}  // namespace common
diff --git a/ggml/src/ggml-qnn/shared/common.hpp b/ggml/src/ggml-qnn/shared/common.hpp
new file mode 100644
index 0000000000000..4feb3365ce102
--- /dev/null
+++ b/ggml/src/ggml-qnn/shared/common.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include
+#include
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
+enum backend_index_type {
+    QNN_BACKEND_CPU = 0,
+    QNN_BACKEND_GPU,
+    QNN_BACKEND_NPU,
+
+    HEXAGON_BACKEND,
+
+    TOTAL_BACKEND_COUNT,
+    QNN_BACKEND_COUNT = HEXAGON_BACKEND,
+};
+
+class backend_device_proxy {
+  public:
+    virtual ~backend_device_proxy() = default;
+
+    virtual const ggml_backend_device_i & get_iface() const = 0;
+    virtual void *                        get_context()     = 0;
+};
+
+using backend_device_proxy_ptr
= std::shared_ptr; + +backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device); +backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device); + +namespace common { + +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); + +} // namespace common + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) +#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) +#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) + +#ifndef NDEBUG +# define LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) +#else +# define LOG_DEBUG(...) +#endif diff --git a/ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp b/ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp new file mode 100644 index 0000000000000..22cf8901f3cbc --- /dev/null +++ b/ggml/src/ggml-qnn/shared/dyn-lib-loader.hpp @@ -0,0 +1,76 @@ +#pragma once + +#ifdef __linux__ +# include +# include +#elif defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#include + +namespace common { + +#ifdef __linux__ +typedef void * dl_handler_t; + +inline dl_handler_t dl_load(const std::string & lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); +} + +inline void * dl_sym(dl_handler_t handle, const std::string & symbol) { + return dlsym(handle, symbol.c_str()); +} + +inline bool dl_unload(dl_handler_t handle) { + return dlclose(handle) == 0; +} + +inline const char * dl_error() { + return dlerror(); +} +#elif defined(_WIN32) +using dl_handler_t = HMODULE; + +inline dl_handler_t dl_load(const std::string & lib_path) { + // suppress error dialogs for missing DLLs + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths + + SetErrorMode(old_mode); + return handle; +} + +inline void * dl_sym(dl_handler_t handle, const std::string & symbol) { + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, symbol.c_str()); + + SetErrorMode(old_mode); + return p; +} + +inline bool dl_unload(dl_handler_t handle) { + FreeLibrary(handle); + return true; +} + +inline const char * dl_error() { + // TODO: implement dl_error for Windows + return nullptr; +} + +#endif + +template Fn dl_sym_typed(dl_handler_t handle, const std::string & function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +} // namespace common diff --git a/ggml/src/ggml-qnn/shared/rpc-interface.hpp b/ggml/src/ggml-qnn/shared/rpc-interface.hpp new file mode 100644 index 0000000000000..5a64a03646e67 --- /dev/null +++ b/ggml/src/ggml-qnn/shared/rpc-interface.hpp @@ -0,0 +1,223 @@ +#pragma once + +#include + +#include "common.hpp" +#include "dyn-lib-loader.hpp" +#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND +# include +#else +// TODO: remove this when not needed + +/** + * @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap + * @brief Types of maps with cache maintenance + */ +enum fastrpc_map_flags { + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Driver will clean cache when buffer passed in a FastRPC call. 
+ * Same remote virtual address will be assigned for subsequent + * FastRPC calls. + */ + FASTRPC_MAP_STATIC, + + /** Reserved for compatibility with deprecated flag */ + FASTRPC_MAP_RESERVED, + + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Mapping tagged with a file descriptor. User is responsible for + * maintenance of CPU and DSP caches for the buffer. Get virtual address + * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions. + */ + FASTRPC_MAP_FD, + + /** + * Mapping delayed until user calls HAP_mmap() and HAP_munmap() + * functions on DSP. User is responsible for maintenance of CPU and DSP + * caches for the buffer. Delayed mapping is useful for users to map + * buffer on DSP with other than default permissions and cache modes + * using HAP_mmap() and HAP_munmap() functions. + */ + FASTRPC_MAP_FD_DELAYED, + + /** Reserved for compatibility **/ + FASTRPC_MAP_RESERVED_4, + FASTRPC_MAP_RESERVED_5, + FASTRPC_MAP_RESERVED_6, + FASTRPC_MAP_RESERVED_7, + FASTRPC_MAP_RESERVED_8, + FASTRPC_MAP_RESERVED_9, + FASTRPC_MAP_RESERVED_10, + FASTRPC_MAP_RESERVED_11, + FASTRPC_MAP_RESERVED_12, + FASTRPC_MAP_RESERVED_13, + FASTRPC_MAP_RESERVED_14, + FASTRPC_MAP_RESERVED_15, + + /** + * This flag is used to skip CPU mapping, + * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag. + */ + FASTRPC_MAP_FD_NOMAP, + + /** Update FASTRPC_MAP_MAX when adding new value to this enum **/ +}; + +#endif + +namespace common { + +#ifdef _WIN32 +constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; +#else +constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; +#endif + +class rpc_interface { + using rpc_mem_init_t = void (*)(); + using rpc_mem_deinit_t = void (*)(); + using rpc_mem_alloc_t = void * (*) (int heapid, uint32_t flags, int size); + using rpc_mem_alloc2_t = void * (*) (int heapid, uint32_t flags, size_t size); + using rpc_mem_free_t = void (*)(void * po); + using rpc_mem_to_fd_t = int (*)(void * po); + using rpc_mem_fastrpc_mmap_t = int (*)(int domain, int fd, void * addr, int offset, size_t length, + enum fastrpc_map_flags flags); + using rpc_mem_fastrpc_munmap_t = int (*)(int domain, int fd, void * addr, size_t length); + using remote_handle_control_t = int (*)(uint32_t req, void * data, uint32_t datalen); + using remote_session_control_t = int (*)(uint32_t req, void * data, uint32_t datalen); + + public: + rpc_interface(const std::string & rpc_lib_path = kQnnRpcLibName) { + _rpc_lib_handle = dl_load(rpc_lib_path); + if (!_rpc_lib_handle) { + LOG_ERROR("failed to load %s, error: %s\n", rpc_lib_path.c_str(), dl_error()); + return; + } + + _rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + _rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _rpc_mem_alloc2 = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc2")); + _rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); + _rpc_mem_fastrpc_mmap = reinterpret_cast(dl_sym(_rpc_lib_handle, "fastrpc_mmap")); + _rpc_mem_fastrpc_munmap = reinterpret_cast(dl_sym(_rpc_lib_handle, "fastrpc_munmap")); + _remote_handle_control = + reinterpret_cast(dl_sym(_rpc_lib_handle, "remote_handle_control")); + _remote_session_control = + reinterpret_cast(dl_sym(_rpc_lib_handle, "remote_session_control")); + } + + bool is_valid() const { return _rpc_lib_handle != nullptr; } + + bool 
is_alloc2_available() const { return _rpc_mem_alloc2 != nullptr; } + + void rpcmem_init() { + if (_rpc_mem_init) { + _rpc_mem_init(); + } + } + + void rpcmem_deinit() { + if (_rpc_mem_deinit) { + _rpc_mem_deinit(); + } + } + + void * rpcmem_alloc(int heapid, uint32_t flags, int size) { + if (!is_valid()) { + return nullptr; + } + + return _rpc_mem_alloc(heapid, flags, size); + } + + void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) { + if (!is_valid()) { + return nullptr; + } + + return _rpc_mem_alloc2(heapid, flags, size); + } + + void rpcmem_free(void * buf) { + if (is_valid()) { + _rpc_mem_free(buf); + } + } + + int rpcmem_to_fd(void * buf) { + int mem_fd = -1; + if (is_valid()) { + mem_fd = _rpc_mem_to_fd(buf); + } + + return mem_fd; + } + + int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + if (!is_valid()) { + return -1; + } + + return _rpc_mem_fastrpc_mmap(domain, fd, addr, offset, length, flags); + } + + int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + if (!is_valid()) { + return -1; + } + + return _rpc_mem_fastrpc_munmap(domain, fd, addr, length); + } + + int remote_handle_control(uint32_t req, void * data, uint32_t datalen) { + if (!is_valid()) { + return -1; + } + + return _remote_handle_control(req, data, datalen); + } + + int remote_session_control(uint32_t req, void * data, uint32_t datalen) { + if (!is_valid()) { + return -1; + } + + return _remote_session_control(req, data, datalen); + } + + ~rpc_interface() { + if (_rpc_lib_handle) { + if (_rpc_mem_deinit) { + _rpc_mem_deinit(); + } + + dl_unload(_rpc_lib_handle); + } + } + + private: + dl_handler_t _rpc_lib_handle = nullptr; + rpc_mem_init_t _rpc_mem_init = nullptr; + rpc_mem_deinit_t _rpc_mem_deinit = nullptr; + rpc_mem_alloc_t _rpc_mem_alloc = nullptr; + rpc_mem_alloc2_t _rpc_mem_alloc2 = nullptr; + rpc_mem_free_t _rpc_mem_free = nullptr; + rpc_mem_to_fd_t _rpc_mem_to_fd = nullptr; + rpc_mem_fastrpc_mmap_t _rpc_mem_fastrpc_mmap = nullptr; + rpc_mem_fastrpc_munmap_t _rpc_mem_fastrpc_munmap = nullptr; + remote_handle_control_t _remote_handle_control = nullptr; + remote_session_control_t _remote_session_control = nullptr; + + rpc_interface(const rpc_interface &) = delete; + rpc_interface & operator=(const rpc_interface &) = delete; + rpc_interface(rpc_interface &&) = delete; + rpc_interface & operator=(rpc_interface &&) = delete; +}; + +using rpc_interface_ptr = std::shared_ptr; + +} // namespace common diff --git a/ggml/src/ggml-qnn/shared/rpc-mem.hpp b/ggml/src/ggml-qnn/shared/rpc-mem.hpp new file mode 100644 index 0000000000000..ba8449192b5dd --- /dev/null +++ b/ggml/src/ggml-qnn/shared/rpc-mem.hpp @@ -0,0 +1,129 @@ + +#pragma once + +#include +#include + +#include "common.hpp" +#include "dyn-lib-loader.hpp" +#include "rpc-interface.hpp" + +namespace common { + +class rpc_mem { + public: + rpc_mem() { + auto interface = std::make_shared(); + if (!interface->is_valid()) { + LOG_ERROR("failed to load rpcmem lib\n"); + return; + } + + interface->rpcmem_init(); + _rpc_interface = interface; + LOG_DEBUG("load rpcmem lib successfully\n"); + } + + explicit rpc_mem(rpc_interface_ptr interface) { + if (!interface->is_valid()) { + LOG_ERROR("failed to load rpcmem lib\n"); + return; + } + + interface->rpcmem_init(); + _rpc_interface = interface; + LOG_DEBUG("load rpcmem lib successfully\n"); + } + + ~rpc_mem() { + if (!is_valid()) { + LOG_DEBUG("rpc memory not initialized\n"); + return; + } + + if (_rpc_interface) { + 
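+            // balance the rpcmem_init() call made in the constructor before releasing the interface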
_rpc_interface->rpcmem_deinit(); + _rpc_interface.reset(); + } + + LOG_DEBUG("unload rpcmem lib successfully\n"); + } + + bool is_valid() const { return (bool) _rpc_interface; } + + void * alloc(int heapid, uint32_t flags, size_t size) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return nullptr; + } + + if (size > get_max_alloc_size()) { + LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, get_max_alloc_size()); + return nullptr; + } + + void * buf = nullptr; + if (_rpc_interface->is_alloc2_available()) { + buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size); + } else { + buf = _rpc_interface->rpcmem_alloc(heapid, flags, size); + } + + if (!buf) { + LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20))); + return nullptr; + } + + LOG_DEBUG("rpc buffer allocated, heapid: %d, flags: 0x%x, size: %zu\n", heapid, flags, size); + return buf; + } + + void free(void * buf) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + } else { + _rpc_interface->rpcmem_free(buf); + } + } + + int to_fd(void * buf) { + int mem_fd = -1; + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + } else { + mem_fd = _rpc_interface->rpcmem_to_fd(buf); + } + + return mem_fd; + } + + size_t get_max_alloc_size() { + return _rpc_interface->is_alloc2_available() ? std::numeric_limits::max() : + std::numeric_limits::max(); + } + + int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return -1; + } + + return _rpc_interface->fastrpc_mmap(domain, fd, addr, offset, length, flags); + } + + int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + if (!is_valid()) { + LOG_ERROR("rpc memory not initialized\n"); + return -1; + } + + return _rpc_interface->fastrpc_munmap(domain, fd, addr, length); + } + + private: + rpc_interface_ptr _rpc_interface; +}; + +using rpc_mem_ptr = std::shared_ptr; + +} // namespace common
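
Usage note for the shared helpers above: the following minimal sketch (not part of the patch) shows how the `common::rpc_mem` wrapper is intended to be driven. The heap id and flag constants are assumptions standing in for the Hexagon SDK's `rpcmem.h` macros; the example function name is likewise hypothetical.

// Illustrative sketch only -- constants and the example function are assumptions, not patch code.
#include <cstdint>
#include <cstdio>

#include "rpc-mem.hpp"

namespace {
// Assumed stand-ins for the Hexagon SDK's RPCMEM_HEAP_ID_SYSTEM / RPCMEM_DEFAULT_FLAGS.
constexpr int      kAssumedRpcHeapId = 25;
constexpr uint32_t kAssumedRpcFlags  = 1;
}  // namespace

int example_rpc_mem_roundtrip() {
    common::rpc_mem mem;  // loads libcdsprpc and calls rpcmem_init()
    if (!mem.is_valid()) {
        return -1;  // libcdsprpc not present (e.g. on a non-Qualcomm device)
    }

    void * buf = mem.alloc(kAssumedRpcHeapId, kAssumedRpcFlags, 4096);
    if (!buf) {
        return -1;
    }

    // The file descriptor identifies the shared buffer to the DSP/NPU side.
    int fd = mem.to_fd(buf);
    std::printf("rpc buffer fd: %d\n", fd);

    mem.free(buf);  // rpcmem_deinit() runs automatically when `mem` is destroyed
    return 0;
}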