BrgemmExternalRepackingAdjuster: fixed src offsets calculation

v-Golubev · v-Golubev · commit c734ebe4289b · 2025-11-17T17:35:38.000+01:00
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
@@ -182,28 +182,48 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
         }
 
         const auto& config = static_cast<const BrgemmCopyBKernelConfig&>(executor->get_config());
-        const auto desc = get_desc(planar_shape,
-                                   prc,
-                                   config.get_wei_K_blk(),
-                                   config.get_wei_N_blk(),
-                                   config.are_wei_blocked(),
-                                   config.is_transposed_B());
+        const auto dst_desc = get_desc(planar_shape,
+                                       prc,
+                                       config.get_wei_K_blk(),
+                                       config.get_wei_N_blk(),
+                                       config.are_wei_blocked(),
+                                       config.is_transposed_B());
+
+        auto process_src_offsets = [&config](const ov::snippets::VectorDims& cpu_config_offsets) {
+            const auto orig_size = dnnl_data_type_size(config.get_original_wei_dt());
+            const auto wei_size = dnnl_data_type_size(config.get_wei_dt());
+            if (orig_size == wei_size) {
+                return cpu_config_offsets;  // equal element sizes: offsets are returned unchanged
+            }
+            // Note: the original cpu config offsets are calculated for the repacked output.
+            // If the repacked output precision differs from the original weights precision, each offset is
+            // rescaled as offset / wei_size * orig_size (division first) to get its position in original memory.
+            ov::snippets::VectorDims recalculated_offsets(cpu_config_offsets.size());
+            std::transform(cpu_config_offsets.begin(),
+                           cpu_config_offsets.end(),
+                           recalculated_offsets.begin(),
+                           [orig_size, wei_size](size_t offset) {
+                               return offset / wei_size * orig_size;
+                           });
+            return recalculated_offsets;
+        };
 
         // Save original input offsets for input before repacking.
         // If the shape has not been changed, it means that we already created `InputRepacker` for this input
         // on previous pass call and now `cpu_config->io_data_offsets[i]` contains offsets not for original input -
         // they were updated for blocked shapes/zeroed for previous initialization and we cannot use them as original
         // offsets.
-        const auto in_offsets =
-            shape == cpu_config->latest_shapes[i] ? input_repacker.in_offsets() : cpu_config->io_data_offsets[i];
+        const auto in_offsets = shape == cpu_config->latest_shapes[i]
+                                    ? input_repacker.in_offsets()
+                                    : process_src_offsets(cpu_config->io_data_offsets[i]);
 
         // In parallel case Kernel should not add offsets to repacked inputs because
         // they will be applied during repacking in execution stage
         if (is_impl_parallel) {
             auto& offsets = cpu_config->io_data_offsets[i];
             std::fill(offsets.begin(), offsets.end(), 0);
         } else {
-            const auto blocked_dims = desc->getBlockDims();
+            const auto blocked_dims = dst_desc->getBlockDims();
             const auto inner_blocks_num = blocked_dims.size() - planar_shape.size();
             const auto rank = in_offsets.size() + inner_blocks_num;  // to align with src offsets rank
             OPENVINO_ASSERT(rank >= blocked_dims.size(), "Incorrect target rank for dst offsets");
@@ -218,7 +238,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
         }
         const auto out_offsets = cpu_config->io_data_offsets[i];
 
-        input_repacker = InputRepacker(p.second->get_kernel(), desc, in_offsets, out_offsets);
+        input_repacker = InputRepacker(p.second->get_kernel(), dst_desc, in_offsets, out_offsets);
     }
 
     return true;
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp
@@ -81,6 +81,17 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMul, MatMul,
                                  ::testing::Values(CPUTestUtils::empty_plugin_config)),
                          MatMul::getTestCaseName);
 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulEnforceBF16, MatMul,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(input_shapes),
+                                 ::testing::ValuesIn(precision_f32(2)), // f32 inputs; bf16 presumably set by config
+                                 ::testing::Values(MatMulType::MatMul),
+                                 ::testing::Values(1), // MatMul
+                                 ::testing::Values(1), // Tokenized MatMul
+                                 ::testing::Values(ov::test::utils::DEVICE_CPU),
+                                 ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)),
+                         MatMul::getTestCaseName);
+
 std::vector<std::vector<ov::test::InputShape>> transpose_b_shapes{
     { {{}, {{3, 3, 64, 64}}},   {{}, {{3, 3, 64, 64}}} },
     { {{}, {{1, 1, 32, 128}}},  {{}, {{1, 1, 64, 128}}} },
diff --git a/src/tests/functional/plugin/shared/src/snippets/matmul.cpp b/src/tests/functional/plugin/shared/src/snippets/matmul.cpp
@@ -55,8 +55,11 @@ void MatMulBase::generate_inputs(const std::vector<ov::Shape>& targetInputStatic
         const auto& model_input = model_inputs[i];
         ov::Tensor tensor;
         ov::test::utils::InputGenerateData in_data;
+        const bool bf16_precision =
+            configuration.at(ov::hint::inference_precision.name()).as<ov::element::Type>() == ov::element::bf16 ||
+            model_input.get_element_type() == ov::element::bf16;
         // To avoid big relative errors in the vicinity of zero, only positive values are generated for bf16 precision
-        in_data.start_from = model_input.get_element_type() == ov::element::bf16 ? 0 : -1;
+        in_data.start_from = bf16_precision ? 0 : -1;
         in_data.range = 5;
         in_data.resolution = 256;
         tensor =
@@ -65,7 +68,6 @@ void MatMulBase::generate_inputs(const std::vector<ov::Shape>& targetInputStatic
     }
 }
 
-
 void MatMul::SetUp() {
     const auto& [input_shapes,
                  elem_types,