WA: avoid precision enforcement if Brgemm A matrix has strided access

v-Golubev · chenhu-wang · commit 60c61addd7b0 · 2025-11-19T16:10:09.000+01:00
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -553,25 +553,37 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() {
 
     if (any_of(context->getConfig().inferencePrecision, ov::element::bf16, ov::element::f16) &&
         subgraph_attrs->snippet->has_domain_sensitive_ops()) {
-        // MatMul has to be decomposed to Brgemm operations,
-        // and transposes on inputs/outputs should be fused in the brgemm before enforcement
+        SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(
+            Place::After,
+            ov::snippets::pass::FuseTransposeBrgemm,
+            pass::EnforcePrecision,
+            element::f32,
+            context->getConfig().inferencePrecision,
+            [](const std::shared_ptr<ov::Node>& op) {
+                std::set<std::vector<ov::element::Type>> types;
+                if (ov::is_type<ov::snippets::op::Brgemm>(op)) {
+                    const auto& a_port =
+                        ov::snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(op->input(0));
+                    // WA: We can't perform precision enforcement in case of strided access to A matrix:
+                    // snippets eltwise loops for precision conversion are generated over the last 2 dims,
+                    // which are not [M, K] in case of strided access in Brgemm A.
+                    // There are no limitations for B matrix, since precision conversion is fused in BrgemmCopyB.
+                    if (ov::snippets::utils::is_planar_layout(a_port->get_layout())) {
+                        if (ov::intel_cpu::brgemm_utils::is_fp16_supported()) {
+                            types.insert({ov::element::f16, ov::element::f16});
+                        }
+                        if (ov::intel_cpu::brgemm_utils::is_bf16_supported()) {
+                            types.insert({ov::element::bf16, ov::element::bf16});
+                        }
+                    }
+                }
+                return types;
+            });
+        // Note: EnforcePrecision might also eliminate Convert pairs (e.g. bf16->f32->bf16),
+        // so FuseTransposeBrgemm has to be run after it as well
         SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After,
-                                               ov::snippets::pass::FuseTransposeBrgemm,
                                                pass::EnforcePrecision,
-                                               element::f32,
-                                               context->getConfig().inferencePrecision,
-                                               [](const std::shared_ptr<ov::Node>& op) {
-                                                   std::set<std::vector<ov::element::Type>> types;
-                                                   if (ov::is_type<ov::snippets::op::Brgemm>(op)) {
-                                                       if (ov::intel_cpu::brgemm_utils::is_fp16_supported()) {
-                                                           types.insert({ov::element::f16, ov::element::f16});
-                                                       }
-                                                       if (ov::intel_cpu::brgemm_utils::is_bf16_supported()) {
-                                                           types.insert({ov::element::bf16, ov::element::bf16});
-                                                       }
-                                                   }
-                                                   return types;
-                                               });
+                                               ov::snippets::pass::FuseTransposeBrgemm);
     }
 
     SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before,