Skip to content
This repository was archived by the owner on Oct 1, 2020. It is now read-only.
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
356 changes: 356 additions & 0 deletions src/q8gemm/4x8-neon.c

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions src/qnnpack/pack.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,45 @@ static inline void pack_q8gemm_w(
}
}

/*
 * Packs GEMM weights and biases into the blocked layout, applying a separate
 * kernel zero point to every output channel.
 *
 * Output channels are handled in blocks of nr. Each block is emitted as:
 *   1. nr int32 bias slots, of which only the first nr_block_size are written;
 *   2. for every group of kr input channels, kr bytes per output channel of
 *      the block.
 * Padding (unused bias slots, the tail of a partial kr group, and the rows of
 * missing output channels) is skipped over, never written, so the caller must
 * pre-fill packed_w with whatever value the padding should hold.
 *
 * The stored bias for output channel n is
 *   b[n] + kc * izp * kzp[n] - izp * sum_k(k[n][k])
 * i.e. the bias with the zero-point terms that do not depend on the input
 * folded in.
 *
 * nc       - number of output channels
 * kc       - number of input channels
 * nr       - output-channel block size of the kernel
 * np       - packed-n granularity; (np - 1) is used as a bit mask, so this is
 *            presumably expected to be a power of two (confirm against callers)
 * kr       - input-channel block size of the kernel
 * izp      - input zero point
 * kzp      - kernel zero points, indexed by output channel (at least nc entries)
 * k        - weights in row-major [nc][kc] order
 * b        - biases, one per output channel
 * packed_w - destination buffer
 */
static inline void pack_q8gemm_w_per_channel(
    size_t nc, // num output channels
    size_t kc, // num input channels
    uint32_t nr, // kernel-n-block-size
    uint32_t np, // packed-n
    uint32_t kr,
    uint8_t izp,
    const uint8_t* kzp,
    const uint8_t* k,
    const int32_t* b,
    void* packed_w)
{
  uint8_t* out = (uint8_t*) packed_w;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);

    /* Bias slots of this block; they are adjusted again while packing weights. */
    int32_t* packed_b = (int32_t*) out;
    for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
      const size_t channel = nr_block_start + nr_block_offset;
      packed_b[nr_block_offset] =
          b[channel] + (int32_t) kc * (int32_t) izp * (int32_t) kzp[channel];
    }
    /* Always reserve nr bias slots, even for a partial trailing block. */
    out += nr * sizeof(int32_t);

    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        const uint8_t* k_row =
            k + (nr_block_start + nr_block_offset) * kc + kr_block_start;
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          const uint8_t kv = k_row[kr_block_offset];
          ksum += (int32_t) kv;
          *out++ = kv;
        }
        /* Fold the input-zero-point times weight-sum term into the bias. */
        packed_b[nr_block_offset] -= ksum * (int32_t) izp;
        /* Skip the unwritten tail of a partial kr group. */
        out += kr - kr_block_size;
      }
      /* Skip the rows of output channels missing from a partial block. */
      out += ((nr - nr_block_size) & (np - 1)) * kr;
    }
  }
}

static inline void pack_q8conv_w(
size_t n,
size_t ks,
Expand Down
15 changes: 15 additions & 0 deletions src/qnnpack/params.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ union qnnp_conv_quantization_params {
int16_t output_zero_point;
uint8_t output_max;
uint8_t output_min;
uint8_t* kernel_zero_point_v;
int32_t* multiplier_v;
int32_t* right_shift_v;
} neon;
#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */
#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
Expand Down Expand Up @@ -275,6 +278,18 @@ typedef void (*q8gemm_ukernel_function)(
size_t c_stride,
const union qnnp_conv_quantization_params* quantization_params);

/*
 * Pointer type for Q8 GEMM micro-kernels that use per-channel kernel
 * quantization parameters.
 *
 * NOTE(review): parameter meanings below are inferred from the names and
 * should be confirmed against a kernel implementation:
 *   mr, nr   - presumably the number of rows / columns of the output tile
 *   k        - presumably the reduction (input-channel) dimension
 *   a        - 8-bit input matrix; a_stride presumably in bytes
 *   w        - packed weights (opaque layout)
 *   c        - 8-bit output matrix; c_stride presumably in bytes
 *   quantization_params - quantization parameters shared by the whole call
 *   kernel_quantization_params_offset - presumably the index of the first
 *       output channel of this tile within the per-channel parameter arrays
 */
typedef void (*q8gemm_per_channel_ukernel_function)(
size_t mr,
size_t nr,
size_t k,
const uint8_t* a,
size_t a_stride,
const void* w,
uint8_t* c,
size_t c_stride,
const union qnnp_conv_quantization_params* quantization_params,
size_t kernel_quantization_params_offset);

typedef void (*q8conv_ukernel_function)(
size_t mr,
size_t nr,
Expand Down
15 changes: 15 additions & 0 deletions src/qnnpack/q8gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@ DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_8x8__aarch64_neon)
DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_2x4c8__sse2)
DECLARE_Q8GEMM_UKERNEL_FUNCTION(q8gemm_ukernel_4x4c2__sse2)

/*
 * Declares a prototype named fn_name for a per-channel Q8 GEMM micro-kernel,
 * marked with QNNP_INTERNAL. The parameter list ends with
 * kernel_quantization_params_offset after quantization_params.
 *
 * The expansion already ends with a semicolon, so invocations are written
 * without a trailing one.
 */
#define DECLARE_Q8GEMM_PER_CHANNEL_UKERNEL_FUNCTION(fn_name) \
QNNP_INTERNAL void fn_name( \
size_t mr, \
size_t nr, \
size_t k, \
const uint8_t* a, \
size_t a_stride, \
const void* w, \
uint8_t* c, \
size_t c_stride, \
const union qnnp_conv_quantization_params* quantization_params, \
size_t kernel_quantization_params_offset);

/* Per-channel micro-kernel declared by this header (4x8 NEON, going by its name). */
DECLARE_Q8GEMM_PER_CHANNEL_UKERNEL_FUNCTION(q8gemm_per_channel_ukernel_4x8__neon)

#define DECLARE_Q8GEMM_XZP_UKERNEL_FUNCTION(fn_name) \
QNNP_INTERNAL void fn_name( \
size_t mr, \
Expand Down
104 changes: 104 additions & 0 deletions src/qnnpack/requantization.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,110 @@ static inline union qnnp_conv_quantization_params qnnp_compute_conv_quantization
return params;
}

/*
 * Builds convolution quantization parameters with per-channel kernel zero
 * points and requantization scales.
 *
 * Every scale is split into a Q31 multiplier in [0x40000000, 0x7FFFFF80] and
 * a right shift in [0, 31]; the asserts therefore require each scale to lie
 * in [2^-32, 1).
 *
 * On ARM/ARM64 the caller-provided arrays multiplier_v and right_shift_v are
 * filled for all kernel_params_size channels (right shifts are stored
 * negated), and their pointers, together with kernel_zero_point_v, are stored
 * in the returned params. Those arrays must therefore outlive the returned
 * value.
 *
 * On x86 and in the scalar fallback only the first channel's scale and zero
 * point (scale_v[0], kernel_zero_point_v[0]) are used; the per-channel output
 * arrays are left untouched.
 *
 * input_zero_point    - zero point of the input tensor
 * kernel_params_size  - number of per-channel entries
 *                       (should be identical to group_output_channels)
 * kernel_zero_point_v - per-channel kernel zero points; index 0 is always read
 * scale_v             - per-channel requantization scales; index 0 is always read
 * multiplier_v        - output: per-channel multipliers (pre-allocated in operator-create)
 * right_shift_v       - output: per-channel negated shifts (pre-allocated in operator-create)
 * output_zero_point   - zero point of the output tensor
 * output_min/max      - clamping bounds of the output
 */
static inline union qnnp_conv_quantization_params qnnp_compute_conv_quantization_params_per_channel(
    uint8_t input_zero_point,
    size_t kernel_params_size, // should be identical to group_output_channels
    uint8_t* kernel_zero_point_v,
    const float* scale_v,
    int32_t* multiplier_v, // pre-allocated in operator-create
    int32_t* right_shift_v, // pre-allocated in operator-create
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  const float scale = *scale_v;
  const uint8_t kernel_zero_point = *kernel_zero_point_v;
  /* Compute requantization parameters */
  const uint32_t scale_bits = fp32_to_bits(scale);

  /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  /* Shift is in [0, 31] range */
  const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
  assert(shift >= 0);
  assert(shift < 32);

  union qnnp_conv_quantization_params params;
#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
  /* The SSE2 parameters have no per-channel fields: only channel 0 is used. */
  (void) kernel_params_size;
  (void) multiplier_v;
  (void) right_shift_v;
  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
  const uint32_t remainder_threshold = remainder_mask >> 1;
  for (uint32_t i = 0; i < 8; i++) {
    params.sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
    params.sse2.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
  }
  params.sse2.multiplier[0] = multiplier;
  params.sse2.multiplier[1] = multiplier;
  params.sse2.multiplier[2] = multiplier;
  params.sse2.multiplier[3] = multiplier;
  params.sse2.rounding[0] = UINT64_C(0x40000000);
  params.sse2.rounding[1] = UINT64_C(0x40000000);
  params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
  params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
  params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
  params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
  params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
  params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
  params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
  params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
  params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
  params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
  for (uint32_t i = 0; i < 8; i++) {
    params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
  }
  for (uint32_t i = 0; i < 16; i++) {
    params.sse2.output_max[i] = output_max;
    params.sse2.output_min[i] = output_min;
  }
#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
  /* Scalar fields carry channel 0, mirroring the per-tensor parameters. */
  params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
  params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
  params.neon.multiplier = multiplier;
  params.neon.right_shift = -shift;
  params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params.neon.output_max = output_max;
  params.neon.output_min = output_min;
  params.neon.kernel_zero_point_v = kernel_zero_point_v;
  params.neon.multiplier_v = multiplier_v;
  params.neon.right_shift_v = right_shift_v;
  /* size_t index: a narrower index could never reach a bound above UINT32_MAX. */
  for (size_t i = 0; i < kernel_params_size; ++i) {
    /* Compute requantization parameters */
    const uint32_t sbits = fp32_to_bits(scale_v[i]);
    /* Multiplier is in [0x40000000, 0x7FFFFF80] range */
    const int32_t m = (int32_t)(((sbits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
    assert(m >= INT32_C(0x40000000));
    assert(m <= INT32_C(0x7FFFFF80));

    /* Shift is in [0, 31] range */
    const int32_t rs = 127 + 31 - 32 - (sbits >> 23);
    assert(rs >= 0);
    assert(rs < 32);
    multiplier_v[i] = m;
    /* Stored negated, like params.neon.right_shift above. */
    right_shift_v[i] = -rs;
  }
#else
  /* The scalar parameters have no per-channel fields: only channel 0 is used. */
  (void) kernel_params_size;
  (void) multiplier_v;
  (void) right_shift_v;
  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
  const uint32_t remainder_threshold = remainder_mask >> 1;
  params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
  params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
  params.scalar.multiplier = multiplier;
  params.scalar.remainder_mask = (int32_t) remainder_mask;
  params.scalar.remainder_threshold = (int32_t) remainder_threshold;
  params.scalar.shift = (uint32_t) shift;
  params.scalar.output_min_less_zero_point =
      (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
  params.scalar.output_max_less_zero_point =
      (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
  params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
#endif
  return params;
}

static inline union qnnp_avgpool_quantization_params qnnp_compute_avgpool_quantization_params(
int32_t bias,
float scale,
Expand Down
122 changes: 122 additions & 0 deletions test/gemm-microkernel-tester.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,128 @@ class GemmMicrokernelTester {
}
}

// Checks a per-channel Q8 GEMM micro-kernel against a scalar reference.
//
// Each iteration:
//   1. fills the input, weights and bias with random values and packs the
//      weights/bias with pack_q8gemm_w_per_channel;
//   2. computes 32-bit reference accumulators using the per-channel kernel
//      zero points;
//   3. derives the output scale and zero point from the accumulator range;
//   4. runs the micro-kernel under test;
//   5. requantizes the reference accumulators per channel with
//      qnnp_q31_requantize and requires an exact match with the kernel output.
void test(q8gemm_per_channel_ukernel_function qgemm) const {
ASSERT_LE(m(), mr());
ASSERT_LE(n(), nr());
ASSERT_GE(k(), kr());

std::random_device randomDevice;
auto rng = std::mt19937(randomDevice());
auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);

// The input buffer has 8 extra bytes; the kernel input (aPtr below) starts
// after the first 8 bytes of the buffer.
std::vector<uint8_t> a((m() - 1) * aStride() + k() + 8);
std::vector<uint8_t> b(n() * k());
std::vector<int32_t> bias(n());
std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> packedW(packedN() * packedK() + biasN() * sizeof(uint32_t) / sizeof(uint8_t));
std::vector<uint8_t> c((m() - 1) * cStride() + n());
std::vector<int32_t> acc(m() * n());
std::vector<uint8_t> cRef(m() * n());

// Per-Channel quantization parameters (sized by nr(), which is >= n())
std::vector<uint8_t> kernelZeroPointPerChannel(nr());
std::vector<float> kernelAndInputScalePerChannel(nr());
std::vector<float> requantizationScalePerChannel(nr());
std::vector<int32_t> multiplierPerChannel(nr());
std::vector<int32_t> rightShiftPerChannel(nr());

// 1) Fill zero-point per-channel around bZeroPoint() as center value,
//    clamped to [0, 255].
// 2) Fill kernel-and-input per-channel using linear interpolation between min and max values.
// (Maintain: requantization_scale < 1 ;
// requantization_scale := input_scale * kernel_scale / output_scale)
const float scale_min = 0.5f;
const float scale_max = 0.99999f;
for (size_t i = 0; i < nr(); ++i) {
kernelZeroPointPerChannel[i] =
static_cast<uint8_t>(std::min(255, std::max(0, bZeroPoint() + (int)(i - nr()/2))));
kernelAndInputScalePerChannel[i] = scale_min + i * (scale_max - scale_min) / nr();
}

const uint8_t* aPtr = a.data() + 8;

for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), std::ref(u8rng));
std::generate(b.begin(), b.end(), std::ref(u8rng));
std::generate(bias.begin(), bias.end(), std::ref(s32rng));
// Pre-fill the output with a fixed byte pattern before the kernel runs.
std::fill(c.begin(), c.end(), 0xA5);

// Pre-fill the packed buffer: the packing routine leaves padding untouched.
std::fill(packedW.begin(), packedW.end(), bZeroPoint());
pack_q8gemm_w_per_channel(n(), k(),
nr(), np(), kr(),
aZeroPoint(), kernelZeroPointPerChannel.data(),
b.data(), bias.data(), packedW.data());

// Require the random inputs to contain at least two distinct values.
ASSERT_NE(*std::max_element(a.cbegin(), a.cend()), *std::min_element(a.cbegin(), a.cend()));
ASSERT_NE(*std::max_element(b.cbegin(), b.cend()), *std::min_element(b.cbegin(), b.cend()));

/* Compute 32-bit results and output quantization arguments */
std::fill(acc.begin(), acc.end(), 0);
for (size_t mIndex = 0; mIndex < m(); mIndex++) {
for (size_t nIndex = 0; nIndex < n(); nIndex++) {
for (size_t kIndex = 0; kIndex < k(); kIndex++) {
ASSERT_LE(n(), packedN());
ASSERT_LT(mIndex * n() + nIndex, acc.size());
ASSERT_LT(mIndex * k() + kIndex, a.size());
// Reference product uses the zero point of output channel nIndex.
acc[mIndex * n() + nIndex] +=
(int32_t(aPtr[mIndex * aStride() + kIndex]) - int32_t(aZeroPoint())) *
(int32_t(b[nIndex * k() + kIndex]) - int32_t(kernelZeroPointPerChannel[nIndex]));
}
acc[mIndex * n() + nIndex] += bias[nIndex];
}
}

const int32_t accMin = *std::min_element(acc.cbegin(), acc.cend());
const int32_t accMax = *std::max_element(acc.cbegin(), acc.cend());
if (m() * n() >= 3) {
ASSERT_NE(accMax, accMin)
<< "Mr x Nr x Kr = " << mr() << " x " << nr() << " x " << kr()
<< ", M x N x K = " << m() << " x " << n() << " x " << k();
}

// Output scale maps the accumulator range onto 255 steps (or is 1.00001
// when the range is below 256); the zero point is clamped to uint8_t.
const double cScale = uint32_t(accMax - accMin) >= 256 ? double(uint32_t(accMax - accMin)) / 255.0 : 1.00001;
const uint8_t cZeroPoint = uint8_t(std::max(std::min(
lrint(127.5 - 0.5 * double(accMin + accMax) / cScale),
long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

for (size_t nIndex = 0; nIndex < nr(); nIndex++) {
requantizationScalePerChannel[nIndex] = kernelAndInputScalePerChannel[nIndex] / float(cScale);
}
const union qnnp_conv_quantization_params quantizationParams =
qnnp_compute_conv_quantization_params_per_channel(
aZeroPoint(), nr(), kernelZeroPointPerChannel.data(),
requantizationScalePerChannel.data(), multiplierPerChannel.data(), rightShiftPerChannel.data(), cZeroPoint, qmin(), qmax());

// Run the kernel under test; the per-channel parameter offset is 0.
qgemm(
m(), n(), k(),
aPtr, aStride() * sizeof(uint8_t),
packedW.data(),
c.data(), cStride() * sizeof(uint8_t),
&quantizationParams, 0);

// Scalar reference: requantize every accumulator with its channel's scale.
for (size_t mIndex = 0; mIndex < m(); mIndex++) {
for (size_t nIndex = 0; nIndex < n(); nIndex++) {
const union qnnp_q31_requantization_params scalarRequantizationParams =
qnnp_compute_scalar_requantization_params(
requantizationScalePerChannel[nIndex], cZeroPoint, qmin(), qmax());
cRef[mIndex * n() + nIndex] = qnnp_q31_requantize(acc[mIndex * n() + nIndex], scalarRequantizationParams);
}
}

// The kernel output must respect [qmin, qmax] and equal the reference exactly.
for (size_t mIndex = 0; mIndex < m(); mIndex++) {
for (size_t nIndex = 0; nIndex < n(); nIndex++) {
ASSERT_LE(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(qmax()));
ASSERT_GE(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(qmin()));
ASSERT_EQ(uint32_t(c[mIndex * cStride() + nIndex]), uint32_t(cRef[mIndex * n() + nIndex]))
<< "at " << mIndex << ", " << nIndex << ": reference = " << (uint32_t) cRef[mIndex * n() + nIndex]
<< " (accumulator = " << acc[mIndex * n() + nIndex]
<< "), optimized = " << (uint32_t) c[mIndex * cStride() + nIndex] << ", Mr x Nr x Kr = " << mr() << " x "
<< nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n() << " x " << k()
<< ", requantization scale = " << requantizationScalePerChannel[nIndex] << ", output zero point = " << int32_t(cZeroPoint);
}
}
}
}

void test(q8conv_ukernel_function qconv) const {
ASSERT_LE(m(), mr());
ASSERT_LE(n(), nr());
Expand Down
Loading