pytorch · mortzur · Feb 19, 2019 · Feb 21, 2019 · Feb 25, 2019 · Feb 26, 2019
diff --git a/src/q8gemm/4x8-neon.c b/src/q8gemm/4x8-neon.c
diff --git a/src/qnnpack/pack.h b/src/qnnpack/pack.h
@@ -48,6 +48,45 @@ static inline void pack_q8gemm_w(
   }
 }
 
+/*
+ * Packs the weights and biases of a Q8 GEMM with per-output-channel kernel
+ * zero points into the blocked layout written below.
+ *
+ * Output channels are processed in blocks of up to nr. For every block the
+ * function emits, in order:
+ *   1. nr int32 bias slots. Only the first nr_block_size slots are written;
+ *      the remaining slots are skipped, not cleared.
+ *   2. For every group of up to kr input channels, kr weight bytes for each
+ *      channel of the block. Bytes past kr_block_size are skipped, and so are
+ *      ((nr - nr_block_size) & (np - 1)) * kr bytes after each group.
+ * Skipped bytes keep whatever the caller stored in packed_w beforehand.
+ *
+ * The bias stored for output channel n is
+ *   b[n] + kc * izp * kzp[n] - izp * sum(k[n][0 .. kc-1])
+ * i.e. b[n] plus every term of sum((a - izp) * (k[n] - kzp[n])) that does not
+ * depend on the input values a.
+ *
+ * nc       - number of output channels (rows of k; entries read from kzp, b)
+ * kc       - number of input channels (row length of k)
+ * nr       - output channels per packed block
+ * np       - only used as the mask (np - 1) on the tail padding count;
+ *            presumably a power of two -- TODO confirm against callers
+ * kr       - input channels per packed group
+ * izp      - zero point subtracted from the inputs in the formula above
+ *            (the tester passes its input zero point)
+ * kzp      - kernel zero point of each output channel, nc entries, read-only
+ * k        - weights, row-major nc x kc
+ * b        - biases, nc entries
+ * packed_w - destination buffer; written through int32_t* and uint8_t* casts
+ */
+static inline void pack_q8gemm_w_per_channel(
+    size_t nc,         // num output channels
+    size_t kc,         // num input channels
+    uint32_t nr,       // kernel-n-block-size
+    uint32_t np,       // packed-n
+    uint32_t kr,
+    uint8_t izp,
+    const uint8_t* kzp,
+    const uint8_t* k,
+    const int32_t* b,
+    void* packed_w)
+{
+  /* Loop-invariant factors of the zero-point corrections. */
+  const int32_t izp_s32 = (int32_t) izp;
+  const int32_t kc_izp = (int32_t) kc * izp_s32;
+  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
+    const size_t nr_block_size = min(nc - nr_block_start, nr);
+    /* Bias slots of this block; adjusted again while the weights are packed. */
+    int32_t* packed_b = (int32_t*) packed_w;
+    for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+      const size_t n = nr_block_start + nr_block_offset;
+      *((int32_t*) packed_w) = b[n] + kc_izp * (int32_t) kzp[n];
+      packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
+    }
+    /* Skip the bias slots of channels missing from a partial block. */
+    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
+    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
+      const size_t kr_block_size = min(kc - kr_block_start, kr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
+        int32_t ksum = 0;
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
+          const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
+          ksum += (int32_t) kv;
+          *((uint8_t*) packed_w) = kv;
+          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
+        }
+        /* Fold -izp * sum(k) for this group into the channel's bias. */
+        packed_b[nr_block_offset] -= ksum * izp_s32;
+        /* Skip the unused tail of a partial kr group. */
+        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
+      }
+      /* Skip the weight rows of channels missing from a partial block. */
+      packed_w = (void*) ((uintptr_t) packed_w + ((nr - nr_block_size) & (np - 1)) * kr * sizeof(uint8_t));
+    }
+  }
+}
+
 static inline void pack_q8conv_w(
   size_t n,
   size_t ks,

diff --git a/src/qnnpack/params.h b/src/qnnpack/params.h
@@ -145,6 +145,9 @@ union qnnp_conv_quantization_params {
     int16_t output_zero_point;
     uint8_t output_max;
     uint8_t output_min;
+    uint8_t* kernel_zero_point_v;
+    int32_t* multiplier_v;
+    int32_t* right_shift_v;
   } neon;
 #endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
 #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
@@ -275,6 +278,18 @@ typedef void (*q8gemm_ukernel_function)(
     size_t c_stride,
     const union qnnp_conv_quantization_params* quantization_params);
 
+/*
+ * Function-pointer type for Q8 GEMM micro-kernels that take per-channel
+ * quantization parameters.
+ *
+ * The parameters end with quantization_params followed by one extra argument,
+ * kernel_quantization_params_offset.
+ * NOTE(review): the offset presumably selects the first channel inside the
+ * per-channel arrays referenced by quantization_params -- confirm against the
+ * kernel implementation, which is not part of this header.
+ */
+typedef void (*q8gemm_per_channel_ukernel_function)(
+    size_t mr,
+    size_t nr,
+    size_t k,
+    const uint8_t* a,
+    size_t a_stride,
+    const void* w,
+    uint8_t* c,
+    size_t c_stride,
+    const union qnnp_conv_quantization_params* quantization_params,
+    size_t kernel_quantization_params_offset);
+
 typedef void (*q8conv_ukernel_function)(
     size_t mr,
     size_t nr,

diff --git a/src/qnnpack/q8gemm.h b/src/qnnpack/q8gemm.h
@@ -43,6 +43,21 @@ DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_8x8__aarch64_neon)
 DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_2x4c8__sse2)
 DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_4x4c2__sse2)
 
+/*
+ * Declares a per-channel Q8 GEMM micro-kernel named fn_name, marked
+ * QNNP_INTERNAL. The parameter list is the one of the
+ * q8gemm_per_channel_ukernel_function pointer type. The expansion already
+ * ends in ';', so uses of the macro are written without a trailing semicolon.
+ */
+#define DECLARE_Q8GEMM_PER_CHANNEL_UKERNEL_FUNCTION(fn_name)                      \
+  QNNP_INTERNAL void fn_name(                                         \
+      size_t mr,                                                      \
+      size_t nr,                                                      \
+      size_t k,                                                       \
+      const uint8_t* a,                                               \
+      size_t a_stride,                                                \
+      const void* w,                                                  \
+      uint8_t* c,                                                     \
+      size_t c_stride,                                                \
+      const union qnnp_conv_quantization_params* quantization_params, \
+      size_t kernel_quantization_params_offset);
+
+DECLARE_Q8GEMM_PER_CHANNEL_UKERNEL_FUNCTION(q8gemm_per_channel_ukernel_4x8__neon)
+
 #define DECLARE_Q8GEMM_XZP_UKERNEL_FUNCTION(fn_name) \
   QNNP_INTERNAL void fn_name(                        \
       size_t mr,                                     \

diff --git a/src/qnnpack/requantization.h b/src/qnnpack/requantization.h
@@ -197,6 +197,110 @@ static inline union qnnp_conv_quantization_params qnnp_compute_conv_quantization
   return params;
 }
 
+/*
+ * Builds convolution quantization parameters for a kernel that is quantized
+ * per output channel.
+ *
+ * Every requantization scale is split into a Q31 multiplier in
+ * [0x40000000, 0x7FFFFF80] (the float mantissa with its implicit leading one,
+ * shifted left by 7) and a right shift in [0, 31] derived from the exponent.
+ * The asserts therefore require each scale used to lie in [2^-32, 1).
+ *
+ * ARM / ARM64 (NEON) build:
+ *   - the scalar fields are filled from scale_v[0] and kernel_zero_point_v[0];
+ *   - multiplier_v[i] and right_shift_v[i] are written for every
+ *     i < kernel_params_size, the shift being stored negated;
+ *   - the returned params hold the kernel_zero_point_v, multiplier_v and
+ *     right_shift_v pointers themselves, not copies of the arrays.
+ *
+ * NOTE: in the x86 / x86-64 (SSE2) and scalar builds only scale_v[0] and
+ * kernel_zero_point_v[0] are used; kernel_params_size, multiplier_v and
+ * right_shift_v are not touched there.
+ *
+ * scale_v[0] and kernel_zero_point_v[0] are read unconditionally, so both
+ * arrays must hold at least one element.
+ *
+ * input_zero_point    - zero point of the input
+ * kernel_params_size  - number of entries in the per-channel arrays
+ * kernel_zero_point_v - per-channel kernel zero points
+ * scale_v             - per-channel requantization scales
+ * multiplier_v        - output, per-channel multipliers (NEON build only)
+ * right_shift_v       - output, per-channel negated shifts (NEON build only)
+ * output_zero_point   - zero point of the output
+ * output_min/max      - output clamping bounds
+ */
+static inline union qnnp_conv_quantization_params qnnp_compute_conv_quantization_params_per_channel(
+  uint8_t input_zero_point,
+  size_t kernel_params_size, // should be identical to group_output_channels
+  uint8_t* kernel_zero_point_v,
+  const float* scale_v,
+  int32_t* multiplier_v,         // pre-allocated in operator-create
+  int32_t* right_shift_v,        // pre-allocated in operator-create
+  uint8_t output_zero_point,
+  uint8_t output_min,
+  uint8_t output_max)
+{
+  const float scale = *scale_v;
+  const uint8_t kernel_zero_point = *kernel_zero_point_v;
+  /* Compute requantization parameters */
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  /* Shift is in [0, 31] range */
+  const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  union qnnp_conv_quantization_params params;
+  #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
+      params.sse2.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
+    }
+    params.sse2.multiplier[0] = multiplier;
+    params.sse2.multiplier[1] = multiplier;
+    params.sse2.multiplier[2] = multiplier;
+    params.sse2.multiplier[3] = multiplier;
+    params.sse2.rounding[0] = UINT64_C(0x40000000);
+    params.sse2.rounding[1] = UINT64_C(0x40000000);
+    params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
+    params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
+    params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
+    params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
+    params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
+    params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
+    for (uint32_t i = 0; i < 8; i++) {
+      params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
+    }
+    for (uint32_t i = 0; i < 16; i++) {
+      params.sse2.output_max[i] = output_max;
+      params.sse2.output_min[i] = output_min;
+    }
+  #elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
+    params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
+    params.neon.multiplier = multiplier;
+    params.neon.right_shift = -shift;
+    params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
+    params.neon.output_max = output_max;
+    params.neon.output_min = output_min;
+    params.neon.kernel_zero_point_v = kernel_zero_point_v;
+    params.neon.multiplier_v = multiplier_v;
+    params.neon.right_shift_v = right_shift_v;
+    /* size_t index: matches the type of kernel_params_size. */
+    for (size_t i = 0; i < kernel_params_size; ++i) {
+      /* Compute requantization parameters */
+      const uint32_t sbits = fp32_to_bits(scale_v[i]);
+      /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
+      const int32_t m = (int32_t)(((sbits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+      assert(m >= INT32_C(0x40000000));
+      assert(m <= INT32_C(0x7FFFFF80));
+
+      /* Shift is in [0, 31] range */
+      const int32_t rs = 127 + 31 - 32 - (sbits >> 23);
+      assert(rs >= 0);
+      assert(rs < 32);
+      multiplier_v[i] = m;
+      /* Stored negated, like params.neon.right_shift above. */
+      right_shift_v[i] = -rs;
+    }
+
+  #else
+    const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+    const uint32_t remainder_threshold = remainder_mask >> 1;
+    params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
+    params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
+    params.scalar.multiplier = multiplier;
+    params.scalar.remainder_mask = (int32_t) remainder_mask;
+    params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+    params.scalar.shift = (uint32_t) shift;
+    params.scalar.output_min_less_zero_point =
+      (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_max_less_zero_point =
+      (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
+    params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
+  #endif
+  return params;
+}
+
 static inline union qnnp_avgpool_quantization_params qnnp_compute_avgpool_quantization_params(
   int32_t bias,
   float scale,

diff --git a/test/gemm-microkernel-tester.h b/test/gemm-microkernel-tester.h
@@ -275,6 +275,128 @@ class GemmMicrokernelTester {
     }
   }
 
+  // Checks a per-channel Q8 GEMM micro-kernel against a scalar reference.
+  //
+  // Each iteration generates random inputs, weights and biases, packs them
+  // with pack_q8gemm_w_per_channel, runs qgemm, and compares every output byte
+  // with qnnp_q31_requantize applied to a 32-bit reference accumulator using
+  // that output channel's requantization scale.
+  void test(q8gemm_per_channel_ukernel_function qgemm) const {
+    ASSERT_LE(m(), mr());
+    ASSERT_LE(n(), nr());
+    ASSERT_GE(k(), kr());
+
+    std::random_device randomDevice;
+    auto rng = std::mt19937(randomDevice());
+    auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
+    auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
+
+    // The 8 extra bytes account for the kernel input starting at a.data() + 8 (see aPtr).
+    std::vector<uint8_t> a((m() - 1) * aStride() + k() + 8);
+    std::vector<uint8_t> b(n() * k());
+    std::vector<int32_t> bias(n());
+    std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> packedW(packedN() * packedK() + biasN() * sizeof(uint32_t) / sizeof(uint8_t));
+    std::vector<uint8_t> c((m() - 1) * cStride() + n());
+    std::vector<int32_t> acc(m() * n());
+    std::vector<uint8_t> cRef(m() * n());
+
+    // Per-Channel quantization parameters
+    std::vector<uint8_t> kernelZeroPointPerChannel(nr());
+    std::vector<float> kernelAndInputScalePerChannel(nr());
+    std::vector<float> requantizationScalePerChannel(nr());
+    std::vector<int32_t> multiplierPerChannel(nr());
+    std::vector<int32_t> rightShiftPerChannel(nr());
+
+    // 1) Fill zero-point per-channel around bZeroPoint() as center value.
+    // 2) Fill kernel-and-input per-channel using linear interpolation between min and max values.
+    //    (Maintain: requantization_scale < 1 ;
+    //               requantization_scale := input_scale * kernel_scale / output_scale)
+    const float scale_min = 0.5f;
+    const float scale_max = 0.99999f;
+    for (size_t i = 0; i < nr(); ++i) {
+      // Zero point is clamped to [0, 255].
+      // NOTE(review): i - nr()/2 is presumably evaluated in an unsigned type and
+      // wraps for i < nr()/2; the (int) cast is what recovers the negative offset.
+      kernelZeroPointPerChannel[i] =
+        static_cast<uint8_t>(std::min(255, std::max(0, bZeroPoint() + (int)(i - nr()/2))));
+      // Dividing by nr() (not nr() - 1) means scale_max itself is never reached.
+      kernelAndInputScalePerChannel[i] = scale_min + i * (scale_max -  scale_min) / nr();
+    }
+
+    const uint8_t* aPtr = a.data() + 8;
+
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(a.begin(), a.end(), std::ref(u8rng));
+      std::generate(b.begin(), b.end(), std::ref(u8rng));
+      std::generate(bias.begin(), bias.end(), std::ref(s32rng));
+      // Sentinel value in the output buffer before the kernel runs.
+      std::fill(c.begin(), c.end(), 0xA5);
+
+      // Pre-fill the packed buffer so that bytes the packer skips have a defined value.
+      std::fill(packedW.begin(), packedW.end(), bZeroPoint());
+      pack_q8gemm_w_per_channel(n(), k(),
+        nr(), np(), kr(),
+        aZeroPoint(), kernelZeroPointPerChannel.data(),
+        b.data(), bias.data(), packedW.data());
+
+      // Reject degenerate random data where all inputs or all weights are equal.
+      ASSERT_NE(*std::max_element(a.cbegin(), a.cend()), *std::min_element(a.cbegin(), a.cend()));
+      ASSERT_NE(*std::max_element(b.cbegin(), b.cend()), *std::min_element(b.cbegin(), b.cend()));
+
+      /* Compute 32-bit results and output quantization arguments */
+      std::fill(acc.begin(), acc.end(), 0);
+      for (size_t mIndex = 0; mIndex < m(); mIndex++) {
+        for (size_t nIndex = 0; nIndex < n(); nIndex++) {
+          for (size_t kIndex = 0; kIndex < k(); kIndex++) {
+            ASSERT_LE(n(), packedN());
+            ASSERT_LT(mIndex * n() + nIndex, acc.size());
+            // NOTE(review): this bound uses k() as the row stride, while the read
+            // below goes through aPtr (a.data() + 8) with aStride(); it does not
+            // check the index that is actually accessed.
+            ASSERT_LT(mIndex * k() + kIndex, a.size());
+            // Reference product uses the zero point of output channel nIndex.
+            acc[mIndex * n() + nIndex] +=
+                (int32_t(aPtr[mIndex * aStride() + kIndex]) - int32_t(aZeroPoint())) *
+                (int32_t(b[nIndex * k() + kIndex]) - int32_t(kernelZeroPointPerChannel[nIndex]));
+          }
+          acc[mIndex * n() + nIndex] += bias[nIndex];
+        }
+      }
+
+      const int32_t accMin = *std::min_element(acc.cbegin(), acc.cend());
+      const int32_t accMax = *std::max_element(acc.cbegin(), acc.cend());
+      // With fewer than three accumulators an empty range is tolerated.
+      if (m() * n() >= 3) {
+        ASSERT_NE(accMax, accMin)
+            << "Mr x Nr x Kr = " << mr() << " x " << nr() << " x " << kr()
+            << ", M x N x K = " << m() << " x " << n() << " x " << k();
+      }
+
+      // Output scale spreads the accumulator range over 255 steps. The 1.00001
+      // fallback stays above 1, so every per-channel scale (at most 0.99999)
+      // divided by it remains below 1.
+      const double cScale = uint32_t(accMax - accMin) >= 256 ? double(uint32_t(accMax - accMin)) / 255.0 : 1.00001;
+      const uint8_t cZeroPoint = uint8_t(std::max(std::min(
+        lrint(127.5 - 0.5 * double(accMin + accMax) / cScale),
+        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
+
+      // Scales are computed for all nr() channels, including those at or beyond n().
+      for (size_t nIndex = 0; nIndex < nr(); nIndex++) {
+        requantizationScalePerChannel[nIndex] = kernelAndInputScalePerChannel[nIndex] / float(cScale);
+      }
+      const union qnnp_conv_quantization_params quantizationParams =
+        qnnp_compute_conv_quantization_params_per_channel(
+          aZeroPoint(), nr(), kernelZeroPointPerChannel.data(),
+          requantizationScalePerChannel.data(), multiplierPerChannel.data(), rightShiftPerChannel.data(),  cZeroPoint, qmin(), qmax());
+
+      // The trailing 0 is the kernel_quantization_params_offset argument.
+      qgemm(
+        m(), n(), k(),
+        aPtr, aStride() * sizeof(uint8_t),
+        packedW.data(),
+        c.data(), cStride() * sizeof(uint8_t),
+        &quantizationParams, 0);
+
+      // Scalar reference: requantize each accumulator with its own channel's scale.
+      for (size_t mIndex = 0; mIndex < m(); mIndex++) {
+        for (size_t nIndex = 0; nIndex < n(); nIndex++) {
+          const union qnnp_q31_requantization_params scalarRequantizationParams =
+            qnnp_compute_scalar_requantization_params(
+              requantizationScalePerChannel[nIndex], cZeroPoint, qmin(), qmax());
+          cRef[mIndex * n() + nIndex] = qnnp_q31_requantize(acc[mIndex * n() + nIndex], scalarRequantizationParams);
+        }
+      }
+
+      // Kernel output must stay within [qmin(), qmax()] and equal the reference exactly.
+      for (size_t mIndex = 0; mIndex < m(); mIndex++) {
+        for (size_t nIndex = 0; nIndex < n(); nIndex++) {
+          ASSERT_LE(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(qmax()));
+          ASSERT_GE(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(qmin()));
+          ASSERT_EQ(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(cRef[mIndex * n() + nIndex]))
+              << "at " << mIndex << ", " << nIndex << ": reference = " << (uint32_t) cRef[mIndex * n() + nIndex]
+              << " (accumulator = " << acc[mIndex * n() + nIndex]
+              << "), optimized = " << (uint32_t) c[mIndex * cStride() + nIndex] << ", Mr x Nr x Kr = " << mr() << " x "
+              << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
+              << ", requantization scale = " << requantizationScalePerChannel[nIndex] << ", output zero point = " << int32_t(cZeroPoint);
+        }
+      }
+    }
+  }
+
   void test(q8conv_ukernel_function qconv) const {
     ASSERT_LE(m(), mr());
     ASSERT_LE(n(), nr());