2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,6 +1,6 @@
 [submodule "src/plugins/intel_cpu/thirdparty/onednn"]
 	path = src/plugins/intel_cpu/thirdparty/onednn
-	url = https://github.com/openvinotoolkit/oneDNN.git
+	url = https://github.com/azhai219/oneDNN.git
 	ignore = dirty
 [submodule "thirdparty/xbyak"]
 	path = thirdparty/xbyak

@@ -288,7 +288,7 @@ void BrgemmAMXKernelExecutor::execute_brgemm_copy_a_kernel(
     ctx.current_M_blk = M;
     ctx.zp_b_compensation_buffer_ptr = nullptr;
     ctx.zp_a_compensation_result_ptr = nullptr;
-    ctx.zp_b_neg_value_ptr = nullptr;
+    ctx.zp_b_neg_val_ptr = nullptr;
     ctx.zp_ab_comp_ptr = nullptr;
     ctx.src = src;
     ctx.tr_src = tr_src;

@@ -298,7 +298,7 @@ bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc& rhs, CmpMask c

 bool DnnlBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask) const {
     dnnl::impl::memory_desc_wrapper wrapped(desc.get());
-    return wrapped.extra().flags == dnnl_memory_extra_flag_none &&
+    return wrapped.extra().flags == dnnl::impl::memory_extra_flags_t::dnnl_memory_extra_flag_none &&
            BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask);
 }

@@ -470,11 +470,13 @@ static dnnl::memory::desc cloneDescWithNewDims(const dnnl::memory::desc& desc,
     dnnl::memory::desc clonedDesc(DnnlExtensionUtils::clone_desc(desc.get()));

     array_copy(clonedDesc.get()->dims, mklDims.data(), mklDims.size());
-    dnnl::memory::dims perm(convert_to_vector<dnnl::memory::dim, size_t>(order.data(), mklDims.size()));
+    std::vector<int> perm(convert_to_vector<int, size_t>(order.data(), mklDims.size()));
     auto innerBlks = clonedDesc.get_inner_blks();
     auto innerIdxs = clonedDesc.get_inner_idxs();
+    std::vector<int> innerBlksInt(innerBlks.begin(), innerBlks.end());
+    std::vector<int> innerIdxsInt(innerIdxs.begin(), innerIdxs.end());

-    auto retCode = dnnl::impl::fill_blocked(*clonedDesc.get(), perm, innerBlks, innerIdxs);
+    auto retCode = dnnl::impl::fill_blocked(*clonedDesc.get(), perm, innerBlksInt, innerIdxsInt);
     OPENVINO_ASSERT(retCode == dnnl::impl::status::success,
                     "Can not clone DnnlBlockedMemoryDesc with dims: ",
                     dims2str(dims));
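
Note: the perm/inner-block handling above adapts to a fill_blocked() signature in the forked oneDNN that takes std::vector<int> instead of dnnl::memory::dims (std::vector<int64_t>), so the arrays are copied element-wise into int vectors first. A minimal sketch of that narrowing step, with to_int_vector as an illustrative helper rather than code from this PR:

#include <cstdint>
#include <vector>

// Element-wise narrowing from oneDNN's 64-bit dims to the 32-bit vectors
// accepted by the updated fill_blocked() signature; the range constructor
// performs the per-element int64_t -> int conversion, mirroring what the
// diff does with innerBlksInt and innerIdxsInt.
std::vector<int> to_int_vector(const std::vector<std::int64_t>& dims) {
    return std::vector<int>(dims.begin(), dims.end());
}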

@@ -99,7 +99,7 @@ dnnl::memory::format_kind DnnlMemoryDesc::getFormatKind() const {

 bool DnnlMemoryDesc::hasEmptyExtraData() const {
     dnnl::impl::memory_desc_wrapper wrapped(desc.get());
-    return wrapped.extra().flags == dnnl_memory_extra_flag_none;
+    return wrapped.extra().flags == dnnl::impl::dnnl_memory_extra_flag_none;
 }

 bool DnnlMemoryDesc::canComputeMemSizeZeroDims() const {
34 changes: 17 additions & 17 deletions src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp

@@ -165,20 +165,20 @@ class jit_convert_array : public jit_kernel {
           _dst_size(sizeof(dst_t)) {
         const auto type = get_f8_type<src_t, dst_t>();
         if (type == f8_type::f8e4m3) {
-            f8_e4m3_emu_ = std::make_shared<fp8_emulation_e4m3_t>(this,
-                                                                  fp8_emu_reserv_1_,
-                                                                  fp8_emu_reserv_2_,
-                                                                  fp8_emu_reserv_3_,
-                                                                  fp8_emu_reserv_4_,
-                                                                  fp8_emu_reserv_5_,
-                                                                  fp8_emu_scratch_);
+            f8_e4m3_emu_ = std::make_shared<fp8_conversion_e4m3_t>(this,
+                                                                   fp8_emu_reserv_1_,
+                                                                   fp8_emu_reserv_2_,
+                                                                   fp8_emu_reserv_3_,
+                                                                   fp8_emu_reserv_4_,
+                                                                   fp8_emu_reserv_5_,
+                                                                   fp8_emu_scratch_);
         } else if (type == f8_type::f8e5m2) {
-            f8_e5m2_emu_ = std::make_shared<fp8_emulation_e5m2_t>(this,
-                                                                  fp8_emu_reserv_1_,
-                                                                  fp8_emu_reserv_2_,
-                                                                  fp8_emu_reserv_3_,
-                                                                  fp8_emu_kmask_aux_,
-                                                                  fp8_emu_scratch_);
+            f8_e5m2_emu_ = std::make_shared<fp8_conversion_e5m2_t>(this,
+                                                                   fp8_emu_reserv_1_,
+                                                                   fp8_emu_reserv_2_,
+                                                                   fp8_emu_reserv_3_,
+                                                                   fp8_emu_kmask_aux_,
+                                                                   fp8_emu_scratch_);
         }
         const bool is_dst_bf16 = std::is_same_v<dst_t, ov::intel_cpu::bfloat16_t>;
         if (is_dst_bf16 && mayiuse(cpu_isa_t::avx512_core)) {
@@ -196,11 +196,11 @@ class jit_convert_array : public jit_kernel {
         return nullptr;
     }

-    std::shared_ptr<fp8_emulation_e4m3_t> get_f8_e4m3_emu() const {
+    std::shared_ptr<fp8_conversion_e4m3_t> get_f8_e4m3_emu() const {
         return f8_e4m3_emu_;
     }

-    std::shared_ptr<fp8_emulation_e5m2_t> get_f8_e5m2_emu() const {
+    std::shared_ptr<fp8_conversion_e5m2_t> get_f8_e5m2_emu() const {
         return f8_e5m2_emu_;
     }
@@ -213,8 +213,8 @@ class jit_convert_array : public jit_kernel {
     size_t _src_size;
     size_t _dst_size;

-    std::shared_ptr<fp8_emulation_e4m3_t> f8_e4m3_emu_;
-    std::shared_ptr<fp8_emulation_e5m2_t> f8_e5m2_emu_;
+    std::shared_ptr<fp8_conversion_e4m3_t> f8_e4m3_emu_;
+    std::shared_ptr<fp8_conversion_e5m2_t> f8_e5m2_emu_;
     std::shared_ptr<jit_uni_vcvtneps2bf16> uni_vcvtneps2bf16_;

     const Reg64 fp8_emu_scratch_ = rax;

@@ -105,6 +105,7 @@ std::shared_ptr<DnnlFCPrimitive> DnnlFCPrimitive::create(const MemoryArgs& memor
                       dstDesc,
                       shapeAgnosticData->m_primAttrs.attr,
                       attrs.sparseWeights,
+                      attrs.sparseWeightsNonZeroSize,
                       attrs.modelType};

     auto builder = [&context](const Key& dnnlKey) {
@@ -305,6 +306,7 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons
                                           const dnnl::primitive_attr& attr,
                                           const dnnl::engine& engine,
                                           const bool useSparseWeights,
+                                          const size_t useSparseWeightsNonZeroSize,
                                           const bool useWeightsDecompression) {
     const auto normalizedInputDesc = normalizeDescriptor(inputDesc);
     const auto normalizedOutputDesc = normalizeDescriptor(outputDesc);
@@ -331,8 +333,9 @@
         wdt = memory::data_type::s8;
     }

+    // TODO: @Xiuchuan support the native sparse feature of stock oneDNN.
     const dnnl::memory::desc weightsDesc =
-        useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt)
+        useSparseWeights ? dnnl::memory::desc::packed(normalizedWeightDesc.get_dims(), wdt, useSparseWeightsNonZeroSize)
                          : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any);

     return {engine,
@@ -352,6 +355,7 @@ static primitive_desc createPrimitiveDesc(const dnnl::memory::desc& inputDesc,
                                           const dnnl::engine& engine,
                                           const std::vector<impl_desc_type>& implPriorities,
                                           const bool useSparseWeights,
+                                          const size_t useSparseWeightsNonZeroSize,
                                           const bool useWeightsDecompression) {
     auto prim_desc = createDescriptorInternal(inputDesc,
                                               weightDesc,
@@ -360,6 +364,7 @@ static primitive_desc createPrimitiveDesc(const dnnl::memory::desc& inputDesc,
                                               attr,
                                               engine,
                                               useSparseWeights,
+                                              useSparseWeightsNonZeroSize,
                                               useWeightsDecompression);
     OPENVINO_ASSERT(prim_desc, "Failed to create inner_product primitive descriptor");
     auto first_desc = dnnl::inner_product_forward::primitive_desc(prim_desc.get());
@@ -444,6 +449,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs&
     const dnnl::memory::desc biaDnnlDesc = MemoryDescUtils::convertToDnnlMemoryDesc(biasDesc)->getDnnlDesc();

     const auto useSparseWeights = attrs.sparseWeights;
+    const auto useSparseWeightsNonZeroSize = attrs.sparseWeightsNonZeroSize;
     const auto primDesc = createPrimitiveDesc(srcDnnlDesc,
                                               weiDnnlDesc,
                                               biaDnnlDesc,
@@ -452,6 +458,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs&
                                               context->getEngine(),
                                               context->getImplPriorities(),
                                               useSparseWeights,
+                                              useSparseWeightsNonZeroSize,
                                               useWeightsDecompression);

     const auto weightsDesc = DnnlExtensionUtils::makeDescriptor(primDesc.weights_desc());
@@ -474,7 +481,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs&
 static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) {
     const auto implType = parse_impl_name(primDesc.impl_info_str());
     if (implType == ov::intel_cpu::brgemm_avx512_amx &&
-        primDesc.weights_desc().get_format_kind() == memory::format_kind::sparsed) {
+        primDesc.weights_desc().get_format_kind() == memory::format_kind::sparse) {
         return ov::intel_cpu::brgemm_sparse_avx512_amx;
     }

@@ -495,6 +502,7 @@ DnnlFCPrimitive::DnnlFCPrimitive(const Key& key,
                                  engine,
                                  implPriorities,
                                  key.sparseWeights,
+                                 key.sparseWeightsNonZeroSize,
                                  useWeightsDecompressionImpl(key.src->getPrecision(), key.wei->getPrecision(), key.modelType))),
       m_implType(implTypeFromPrimDesc(m_primDesc)),
       m_srcDesc(DnnlExtensionUtils::makeDescriptor(m_primDesc.src_desc())),
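
Note: the weights-descriptor choice in createDescriptorInternal() maps onto oneDNN's experimental sparse memory API, where a packed descriptor is built from the dims, the data type, and the non-zero count collected while scanning the weights blob. A minimal sketch of that selection, assuming a oneDNN build with the experimental sparse API (memory::desc::packed) available; make_weights_desc is an illustrative helper, not code from this PR:

#include <oneapi/dnnl/dnnl.hpp>

using dnnl::memory;

// Packed sparse descriptor (carrying the non-zero count) when sparse weights
// decompression is enabled; otherwise a dense descriptor with format_tag::any
// so the primitive selects the blocked layout itself.
memory::desc make_weights_desc(const memory::dims& wei_dims,
                               memory::data_type wdt,
                               bool use_sparse,
                               memory::dim nnz) {
    return use_sparse ? memory::desc::packed(wei_dims, wdt, nnz)
                      : memory::desc(wei_dims, wdt, memory::format_tag::any);
}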

@@ -31,6 +31,7 @@ class DnnlFCPrimitive {
         DnnlMemoryDescCPtr dst;
         dnnl::primitive_attr attr;
         bool sparseWeights;
+        size_t sparseWeightsNonZeroSize;
         Config::ModelType modelType;

         [[nodiscard]] size_t hash() const;

@@ -577,7 +577,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const MatM
 static impl_desc_type implTypeFromPrimDesc(const dnnl::primitive_desc& primDesc) {
     const auto implType = parse_impl_name(primDesc.impl_info_str());
     if (implType == ov::intel_cpu::brgemm_avx512_amx &&
-        primDesc.weights_desc().get_format_kind() == memory::format_kind::sparsed) {
+        primDesc.weights_desc().get_format_kind() == memory::format_kind::sparse) {
         return ov::intel_cpu::brgemm_sparse_avx512_amx;
     }


@@ -16,6 +16,7 @@ namespace ov::intel_cpu {
 struct FCAttrs {
     bool weightsNonTransposed = false;
     bool sparseWeights = false;
+    size_t sparseWeightsNonZeroSize = 0;
     uint64_t dynamicQuantizationGroupSize = 0;
     bool constantWeights = true;

27 changes: 15 additions & 12 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp

@@ -502,35 +502,35 @@ const std::vector<impl_desc_type>& FullyConnected::getDefaultImplPriority() {
 }

 // @todo Should be moved to the transformations / optimization stages?
-static bool useSparseWeightsDecompression(const NodePtr& weightsInput,
-                                          const ov::element::Type inputType,
-                                          const float sparseWeiDecompressionRate) {
+static std::pair<bool, size_t> useSparseWeightsDecompression(const NodePtr& weightsInput,
+                                                             const ov::element::Type inputType,
+                                                             const float sparseWeiDecompressionRate) {
     const auto minSparseRate = sparseWeiDecompressionRate;

     if (minSparseRate == 1.F) {
-        return false;
+        return {false, 0};
     }

     if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
-        return false;
+        return {false, 0};
     }

     const auto constNode = std::dynamic_pointer_cast<Input>(weightsInput);
     if (!constNode) {
-        return false;
+        return {false, 0};
     }

     const auto weiMemory = constNode->getMemoryPtr();
     OPENVINO_ASSERT(weiMemory, "Cannot get const blob");

     const auto weiDims = weiMemory->getShape().getStaticDims();
     if (weiDims.size() != 2 || weiDims[0] % 64 != 0 || weiDims[1] % 64 != 0) {
-        return false;
+        return {false, 0};
     }

     const auto weightsType = weiMemory->getPrecision();
     if (none_of(inputType, u8, i8) || weightsType != i8) {
-        return false;
+        return {false, 0};
     }

     const auto* const weightsData = weiMemory->getDataAs<const int8_t>();
@@ -558,13 +558,16 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput,
                    "%, use sparse weights = ",
                    sparseRate >= minSparseRate);

-    return sparseRate >= minSparseRate;
+    return {sparseRate >= minSparseRate, elementsCount - zerosCount};
 }

 void FullyConnected::initSupportedPrimitiveDescriptors() {
-    attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(),
-                                                        getOriginalInputPrecisionAtPort(DATA),
-                                                        context->getConfig().fcSparseWeiDecompressionRate);
+    auto sparseAttr = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(),
+                                                    getOriginalInputPrecisionAtPort(DATA),
+                                                    context->getConfig().fcSparseWeiDecompressionRate);
+    attrs.sparseWeights = sparseAttr.first;
+    attrs.sparseWeightsNonZeroSize = sparseAttr.second;

     attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize;
     attrs.modelType = context->getConfig().modelType;
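
Note: the sparsity decision above reduces to counting zero elements in the int8 weights blob; the non-zero count is what the packed sparse descriptor later consumes. A self-contained sketch of that computation (sparse_decision is an illustrative name, not the node's actual helper):

#include <cstddef>
#include <cstdint>
#include <utility>

// Count zero elements in the int8 weights blob, derive the sparse rate, and
// return both the use-sparse verdict and the non-zero count, matching the
// {sparseRate >= minSparseRate, elementsCount - zerosCount} result above.
std::pair<bool, std::size_t> sparse_decision(const std::int8_t* weights,
                                             std::size_t elements_count,
                                             float min_sparse_rate) {
    if (elements_count == 0) {
        return {false, 0};  // avoid division by zero on an empty blob
    }
    std::size_t zeros_count = 0;
    for (std::size_t i = 0; i < elements_count; ++i) {
        zeros_count += (weights[i] == 0) ? 1 : 0;
    }
    const float sparse_rate =
        static_cast<float>(zeros_count) / static_cast<float>(elements_count);
    return {sparse_rate >= min_sparse_rate, elements_count - zeros_count};
}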

@@ -493,7 +493,7 @@ void BrgemmKernel::execute_without_scale(bool is_M_tail, void* a, void* b, void*
     ctx.current_M_blk = cur_M_blk;
     ctx.zp_b_compensation_buffer_ptr = nullptr;
     ctx.zp_a_compensation_result_ptr = nullptr;
-    ctx.zp_b_neg_value_ptr = nullptr;
+    ctx.zp_b_neg_val_ptr = nullptr;
     ctx.zp_ab_comp_ptr = nullptr;
     ctx.src = pCopyKernelIn;
     ctx.tr_src = pCopyKernelOut;
@@ -554,7 +554,7 @@ void BrgemmKernel::callBrgemm(brgemmCtx& ctx,
     }
     if (doPostops) {
         brgemm_post_ops_data_t post_ops_data;
-        post_ops_data.scales = bScale;
+        post_ops_data.wei_scales = bScale;
         brgemm_batch_element_t addr_batch;
         addr_batch.ptr.A = pin0;
         addr_batch.ptr.B = pin1;
@@ -620,7 +620,7 @@ void BrgemmKernelQuantized::executeGemm(bool is_M_tail,
     ctx.current_M_blk = cur_M_blk;
     ctx.zp_b_compensation_buffer_ptr = nullptr;
     ctx.zp_a_compensation_result_ptr = nullptr;
-    ctx.zp_b_neg_value_ptr = nullptr;
+    ctx.zp_b_neg_val_ptr = nullptr;
     ctx.zp_ab_comp_ptr = nullptr;
     ctx.src = pCopyKernelIn;
     ctx.tr_src = pCopyKernelOut;

@@ -390,6 +390,8 @@ inline RegistersPool::Ptr RegistersPool::create(dnnl::impl::cpu::x64::cpu_isa_t
     case dnnl::impl::cpu::x64::amx_fp16:
     case dnnl::impl::cpu::x64::avx512_core_amx_fp16:
     case dnnl::impl::cpu::x64::isa_all:
+    case dnnl::impl::cpu::x64::avx10_2_512:
+    case dnnl::impl::cpu::x64::avx10_2_512_amx_2:
         OPENVINO_THROW("Invalid isa argument in RegistersPool::create()");
     }
     OPENVINO_THROW("Invalid isa argument in RegistersPool::create()");

@@ -399,7 +399,7 @@ TEST(MakeUndefinedDnnlDesc, extraData) {
     const auto& [fmt, dims] = item;
     memory::desc origin(dims, dataType, fmt);

-    origin.get()->extra.flags = dnnl_memory_extra_flag_compensation_conv_s8s8;
+    origin.get()->extra.flags = dnnl::impl::dnnl_memory_extra_flag_compensation_conv_s8s8;
     origin.get()->extra.compensation_mask = 1;
     origin.get()->extra.scale_adjust = 2.0f;

2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/onednn
Submodule onednn updated 2599 files