pytorch · cyyever · Jul 13, 2025 · Jul 15, 2025 · Jul 17, 2025 · Jul 17, 2025
diff --git a/bench/RowwiseAdagradFusedBenchmark.cc b/bench/RowwiseAdagradFusedBenchmark.cc
@@ -19,7 +19,6 @@
 
 #include "./BenchUtils.h"
 #include "fbgemm/Fbgemm.h"
-#include "src/RefImplementations.h" // @manual
 
 using namespace std;
 using namespace fbgemm;

diff --git a/include/fbgemm/FbgemmFPCommon.h b/include/fbgemm/FbgemmFPCommon.h
@@ -10,7 +10,6 @@
 #pragma once
 
 #include <fbgemm/FbgemmPackMatrixB.h>
-#include <fbgemm/SimdUtils.h>
 #include <fbgemm/Types.h>
 #include <fbgemm/Utils.h>
 #include <array>

diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
@@ -211,7 +211,6 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
   assert(
       block.col_size <= ncol_per_group &&
       "ReQuantizeOutput should be called at most 1 group at a time.");
-  int g = block.col_start / ncol_per_group;
   if constexpr (
       instSet == inst_set_t::anyarch || !std::is_same_v<outT, float>) {
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
@@ -224,6 +223,7 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
         if constexpr (Q_GRAN == QuantizationGranularity::TENSOR) {
           Bq_zero_point_idx = 0;
         } else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
+          int g = block.col_start / ncol_per_group;
           Bq_zero_point_idx = g;
         } else if constexpr (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
           Bq_zero_point_idx = j;

diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
@@ -15,6 +15,8 @@
 #include <array>
 #include <cassert>
 #include <cmath>
+#include <iomanip>
+#include <iostream>
 #include <string>
 #include <type_traits>
 
@@ -96,16 +98,51 @@ FBGEMM_API int compare_buffers(
     float atol = 1e-3);
 
 /**
- * @brief Debugging helper.
+ * @brief Print the matrix op(inp) to std::cout.
+ *
+ * Integral elements are widened before streaming so that 8-bit types are
+ * printed as numbers rather than as characters.
+ *
+ * @param op Transpose type of the matrix.
+ * @param inp The matrix data; element (r, c) of op(inp) is inp[r * ld + c],
+ *            or inp[c * ld + r] when op is matrix_op_t::Transpose.
+ * @param R The height of the matrix.
+ * @param C The width of the matrix.
+ * @param ld The leading dimension of the matrix.
+ * @param name The prefix string before printing the matrix.
  */
 template <typename T>
 void printMatrix(
-    matrix_op_t trans,
+    matrix_op_t op,
     const T* inp,
     size_t R,
     size_t C,
     size_t ld,
-    std::string name);
+    const std::string& name) {
+  // R: number of rows in op(inp)
+  // C: number of cols in op(inp)
+  // ld: leading dimension in inp
+  std::cout << name << ":" << "[" << R << ", " << C << "]" << '\n';
+  const bool tr = (op == matrix_op_t::Transpose);
+  for (size_t r = 0; r < R; ++r) {
+    for (size_t c = 0; c < C; ++c) {
+      const T res = tr ? inp[c * ld + r] : inp[r * ld + c];
+      if constexpr (std::is_integral_v<T>) {
+        // Widen to the widest integer type of the same signedness: 8-bit
+        // types then print as numbers, and unsigned 64-bit values above
+        // INT64_MAX are not shown as negative.
+        using wide_t = std::conditional_t<
+            std::is_signed_v<T>,
+            long long,
+            unsigned long long>;
+        std::cout << std::setw(5) << static_cast<wide_t>(res) << " ";
+      } else {
+        std::cout << std::setw(5) << res << " ";
+      }
+    }
+    std::cout << '\n';
+  }
+}
 
 /**
  * @brief Transpose a matrix.

diff --git a/src/ExecuteKernelU8S8.h b/src/ExecuteKernelU8S8.h
@@ -7,7 +7,8 @@
  */
 
 #pragma once
-#include "./ExecuteKernel.h" // @manual
+#include <cstdint>
+#include "./ExecuteKernelGeneric.h" // @manual
 
 namespace fbgemm {
 

diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
@@ -9,9 +9,8 @@
 #define FBGEMM_EXPORTS
 #include "fbgemm/Fbgemm.h"
 #include <cpuinfo.h>
-#include <functional>
 #include <stdexcept>
-#include "./ExecuteKernel.h" // @manual
+#include "./ExecuteKernelU8S8.h" // @manual
 
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
 double packing_time = 0.0;

diff --git a/src/FbgemmBfloat16Convert.cc b/src/FbgemmBfloat16Convert.cc
@@ -9,7 +9,7 @@
 #define FBGEMM_EXPORTS
 #include "fbgemm/FbgemmConvert.h"
 
-#include "./RefImplementations.h" // @manual
+#include <stdexcept>
 
 #ifdef USE_MKL
 #include <mkl.h>

diff --git a/src/FbgemmFPCommon.cc b/src/FbgemmFPCommon.cc
@@ -300,6 +300,7 @@ partition_array_t partition_sve128 = {
   }
 };
 
+#ifdef FBGEMM_ENABLE_KLEIDIAI
 partition_array_t partition_neon = {
   // NOTE: clang-format wants to use a different formatting but the current
   // formatting should be easier to read.
@@ -427,6 +428,7 @@ partition_array_t partition_neon = {
     {{ { 8, 15 }, { 0, 0 } } }, // 120
   }
 };
+#endif
 
 
 partition_array_t partition_avx512 = {

diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc
@@ -9,8 +9,6 @@
 #define FBGEMM_EXPORTS
 #include "fbgemm/FbgemmConvert.h"
 
-#include "./RefImplementations.h" // @manual
-
 #ifdef USE_MKL
 #include <mkl.h>
 #endif

diff --git a/src/GenerateKernelU8S8S32ACC16Avx512.cc b/src/GenerateKernelU8S8S32ACC16Avx512.cc
@@ -7,7 +7,6 @@
  */
 
 #include <iostream>
-#include "./CodeGenHelpers.h" // @manual
 #include "./GenerateKernel.h" // @manual
 
 namespace fbgemm {

diff --git a/src/GroupwiseConv.cc b/src/GroupwiseConv.cc
@@ -15,8 +15,6 @@
 #include <tuple>
 #include <type_traits>
 #include "./CodeGenHelpers.h" // @manual
-#include "./RefImplementations.h" // @manual
-#include "./TransposeUtils.h" // @manual
 #include "fbgemm/Fbgemm.h"
 #include "fbgemm/QuantUtilsAvx512.h"
 #include "fbgemm/SimdUtils.h"

diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
@@ -9,7 +9,6 @@
 #define FBGEMM_EXPORTS
 #include "./CodeGenHelpers.h" // @manual
 #include "./GroupwiseConv.h" // @manual
-#include "fbgemm/Fbgemm.h"
 
 namespace fbgemm {
 

diff --git a/src/PackWeightsForDirectConv.cc b/src/PackWeightsForDirectConv.cc
@@ -15,17 +15,12 @@
 #include <cassert>
 
 #include "./DirectConv.h" // @manual
-#include "./ExecuteKernel.h" // @manual
-#include "./MaskAvx2.h" // @manual
 #include "fbgemm/ConvUtils.h"
 #include "fbgemm/Fbgemm.h"
 #include "fbgemm/FbgemmBuild.h"
 #include "fbgemm/UtilsAvx2.h"
 
-#include "./CodeGenHelpers.h" // @manual
 #include "./OptimizedKernelsAvx2.h" // @manual
-#include "./RefImplementations.h" // @manual
-#include "./TransposeUtils.h" // @manual
 namespace fbgemm {
 
 PackedDirectConvMatrix::PackedDirectConvMatrix(

diff --git a/src/Utils.cc b/src/Utils.cc
@@ -16,7 +16,6 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
-#include <iomanip>
 #include <iostream>
 #include <limits>
 #include <new>
@@ -84,39 +83,6 @@ int compare_buffers(
   return 0;
 }
 
-/**
- * @brief Print the matrix.
- * @param op Transpose type of the matrix.
- * @param R The height of the matrix.
- * @param C The width of the matrix.
- * @param ld The leading dimension of the matrix.
- * @param name The prefix string before printing the matrix.
- */
-template <typename T>
-void printMatrix(
-    matrix_op_t op,
-    const T* inp,
-    size_t R,
-    size_t C,
-    size_t ld,
-    const std::string& name) {
-  // R: number of rows in op(inp)
-  // C: number of cols in op(inp)
-  // ld: leading dimension in inp
-  std::cout << name << ":" << "[" << R << ", " << C << "]" << '\n';
-  bool tr = (op == matrix_op_t::Transpose);
-  for (size_t r = 0; r < R; ++r) {
-    for (size_t c = 0; c < C; ++c) {
-      T res = tr ? inp[c * ld + r] : inp[r * ld + c];
-      if constexpr (std::is_integral_v<T>) {
-        std::cout << std::setw(5) << static_cast<int64_t>(res) << " ";
-      } else {
-        std::cout << std::setw(5) << res << " ";
-      }
-    }
-    std::cout << '\n';
-  }
-}
 
 template int compare_buffers<float>(
     const float* ref,
@@ -154,35 +120,6 @@ template int compare_buffers<int64_t>(
     size_t max_mismatches_to_report,
     float atol);
 
-template void printMatrix<float>(
-    matrix_op_t op,
-    const float* inp,
-    size_t R,
-    size_t C,
-    size_t ld,
-    const std::string& name);
-template void printMatrix<int8_t>(
-    matrix_op_t op,
-    const int8_t* inp,
-    size_t R,
-    size_t C,
-    size_t ld,
-    const std::string& name);
-template void printMatrix<uint8_t>(
-    matrix_op_t op,
-    const uint8_t* inp,
-    size_t R,
-    size_t C,
-    size_t ld,
-    const std::string& name);
-template void printMatrix<int32_t>(
-    matrix_op_t op,
-    const int32_t* inp,
-    size_t R,
-    size_t C,
-    size_t ld,
-    const std::string& name);
-
 namespace {
 inst_set_t g_forced_isa = inst_set_t::anyarch;
 std::optional<bool> g_Avx512_Ymm_enabled{std::nullopt};