diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp
index bcc38060e5c..5ee1bfe5d40 100644
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -4,6 +4,7 @@
 #include "gemm_vulkan.h"
 
 #include "layer_shader_type.h"
+#include "pipelinecache.h"
 
 namespace ncnn {
 
@@ -30,7 +31,9 @@ int Gemm_vulkan::load_param(const ParamDict& pd)
 
     if (int8_scale_term)
     {
+#ifndef NCNN_INT8
         support_vulkan = false;
+#endif
     }
 
     return ret;
@@ -69,6 +72,79 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
         C_data_packed = C_data;
     }
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        {
+            std::vector<vk_specialization_type> specializations(15);
+            specializations[0].f = alpha;
+            specializations[1].f = beta;
+            specializations[2].i = transA;
+            specializations[3].i = transB;
+            specializations[4].i = constantA;
+            specializations[5].i = constantB;
+            specializations[6].i = constantC;
+            specializations[7].i = constantM;
+            specializations[8].i = constantN;
+            specializations[9].i = constantK;
+            specializations[10].i = constant_broadcast_type_C;
+            specializations[11].i = output_N1M;
+            specializations[12].i = output_elempack;
+            specializations[13].i = output_elemtype;
+            specializations[14].i = output_transpose;
+
+            Mat local_size_xyz;
+            // if (shape_packed.dims == 2)
+            // {
+            //     local_size_xyz.w = std::min(8, shape_packed.w);
+            //     local_size_xyz.h = std::min(8, shape_packed.h);
+            //     local_size_xyz.c = 1;
+            // }
+
+            // pack1
+            // if (shape.dims == 0 || elempack == 1)
+            {
+                pipeline_gemm_int8 = new Pipeline(vkdev);
+                pipeline_gemm_int8->set_optimal_local_size_xyz(local_size_xyz);
+                if (opt.use_shader_local_memory)
+                {
+                    pipeline_gemm_int8->set_local_size_xyz(8, 8, 1);
+                }
+                pipeline_gemm_int8->create(LayerShaderType::gemm_int8, opt, specializations);
+            }
+        }
+        {
+            std::vector<vk_specialization_type> specializations(0);
+
+            Mat local_size_xyz;
+            {
+                pipeline_reduce_scale = new Pipeline(vkdev);
+                pipeline_reduce_scale->set_optimal_local_size_xyz(local_size_xyz);
+                pipeline_reduce_scale->create(LayerShaderType::gemm_reduce_scale, opt, specializations);
+            }
+        }
+        {
+            std::vector<vk_specialization_type> specializations(0);
+
+            Mat local_size_xyz;
+            {
+                pipeline_quantize = new Pipeline(vkdev);
+                pipeline_quantize->set_optimal_local_size_xyz(local_size_xyz);
+                pipeline_quantize->create(LayerShaderType::gemm_quantize, opt, specializations);
+            }
+        }
+
+        if (opt.lightmode)
+        {
+            A_data.release();
+            B_data.release();
+            C_data.release();
+        }
+
+        return 0;
+    }
+#endif
+
     use_cooperative_matrix = vkdev->info.support_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_fp16_storage || opt.use_fp16_packed);
 
     if (use_cooperative_matrix)
@@ -176,6 +252,20 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_gemm;
     pipeline_gemm = 0;
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        delete pipeline_gemm_int8;
+        pipeline_gemm_int8 = 0;
+
+        delete pipeline_reduce_scale;
+        pipeline_reduce_scale = 0;
+
+        delete pipeline_quantize;
+        pipeline_quantize = 0;
+    }
+#endif
+
     return 0;
 }
 
@@ -184,7 +274,6 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     if (constantA)
     {
         cmd.record_upload(A_data_packed, A_data_gpu, opt);
-
         A_data_packed.release();
     }
 
@@ -202,11 +291,41 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
         C_data_packed.release();
     }
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        Option opt_unpack_fp32 = opt;
+        opt_unpack_fp32.use_fp16_storage = false;
+        opt_unpack_fp32.use_fp16_packed = false;
+        opt_unpack_fp32.use_fp16_arithmetic = false;
+        opt_unpack_fp32.use_bf16_storage = false;
+
+        if (constantA == 1)
+        {
+            cmd.record_upload(A_data_int8_scales, A_data_int8_scales_gpu, opt_unpack_fp32);
+            A_data_int8_scales.release();
+        }
+
+        if (constantB == 1)
+        {
+            Mat B_data_int8_scales(1);
+            B_data_int8_scales[0] = B_data_int8_scale;
+            cmd.record_upload(B_data_int8_scales, B_data_int8_scales_gpu, opt_unpack_fp32);
+        }
+    }
+#endif
+
     return 0;
 }
 
 int Gemm_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blobs, top_blobs, cmd, opt);
+    }
+#endif
     const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
     const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
 
@@ -376,4 +495,284 @@ int Gemm_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
     return ret;
 }
 
+int Gemm_vulkan::forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
+    const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
+
+    VkMat A;
+    VkMat B;
+    vkdev->convert_packing(A0, A, 1, cmd, opt);
+    vkdev->convert_packing(B0, B, 1, cmd, opt);
+
+    const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h);
+    const int K = constantK ? constantK : transA ? (A.dims == 3 ? A.c : A.h) : A.w;
+    const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w;
+
+    VkMat C;
+    int broadcast_type_C = -1;
+    if (constantC && constant_broadcast_type_C != -1)
+    {
+        vkdev->convert_packing(C_data_gpu, C, 1, cmd, opt);
+        broadcast_type_C = constant_broadcast_type_C;
+    }
+    else
+    {
+        VkMat C0;
+        if (constantA && constantB)
+        {
+            C0 = bottom_blobs.size() == 1 ? bottom_blobs[0] : VkMat();
+        }
+        else if (constantA)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else if (constantB)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else
+        {
+            C0 = bottom_blobs.size() == 3 ? bottom_blobs[2] : VkMat();
+        }
+
+        if (!C0.empty())
+        {
+            vkdev->convert_packing(C0, C, 1, cmd, opt);
+
+            if (C.dims == 1 && C.w == 1)
+            {
+                // scalar
+                broadcast_type_C = 0;
+            }
+            if (C.dims == 1 && C.w == M)
+            {
+                // M
+                // auto broadcast from h to w is the ncnn-style convention
+                broadcast_type_C = 1;
+            }
+            if (C.dims == 1 && C.w == N)
+            {
+                // N
+                broadcast_type_C = 4;
+            }
+            if (C.dims == 2 && C.w == 1 && C.h == M)
+            {
+                // Mx1
+                broadcast_type_C = 2;
+            }
+            if (C.dims == 2 && C.w == N && C.h == M)
+            {
+                // MxN
+                broadcast_type_C = 3;
+            }
+            if (C.dims == 2 && C.w == N && C.h == 1)
+            {
+                // 1xN
+                broadcast_type_C = 4;
+            }
+        }
+    }
+
+    int elempack = A.elempack;
+    size_t elemsize = (opt.use_fp16_storage || opt.use_bf16_storage) ? 2 : 4;
+
+    VkMat& top_blob = top_blobs[0];
+    if (output_transpose)
+    {
+        if (output_N1M)
+            top_blob.create(M, 1, N, elemsize, opt.blob_vkallocator);
+        else
+            top_blob.create(M, N, elemsize, opt.blob_vkallocator);
+    }
+    else
+    {
+        if (output_N1M)
+            top_blob.create(N, 1, M, elemsize, opt.blob_vkallocator);
+        else
+            top_blob.create(N, M, elemsize, opt.blob_vkallocator);
+    }
+    if (top_blob.empty())
+        return -100;
+
+    VkMat A_int8, A_int8_scales;
+    VkMat B_int8, B_int8_scale;
+
+    {
+        if (constantA == 1)
+        {
+            A_int8 = A;
+            A_int8_scales = A_data_int8_scales_gpu;
+        }
+        else
+        {
+            if (A.dims == 2)
+                A_int8.create(A.w, A.h, 1u, 1, opt.blob_vkallocator);
+            else
+                A_int8.create(A.w, A.h, A.c, 1u, 1, opt.blob_vkallocator);
+            if (transA)
+                A_int8_scales.create(A.w, 4u, 1, opt.blob_vkallocator);
+            else
+                A_int8_scales.create(A.dims == 2 ? A.h : A.c, 4u, 1, opt.blob_vkallocator);
+
+            {
+                std::vector<VkMat> bindings(2);
+                bindings[0] = A;
+                bindings[1] = A_int8_scales;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = A.w;
+                constants[1].i = A.dims == 2 ? A.h : A.c;
+                constants[2].i = A.dims == 2 ? A.w : A.cstep;
+                constants[3].i = transA;
+                constants[4].i = 0; // A
+
+                {
+                    const Pipeline* pipeline = pipeline_reduce_scale;
+
+                    VkMat dispatcher;
+                    dispatcher.w = A_int8_scales.w;
+                    dispatcher.h = 1;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = A;
+                bindings[1] = A_int8;
+                bindings[2] = A_int8_scales;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = A.w;
+                constants[1].i = A.dims == 2 ? A.h : A.c;
+                constants[2].i = A.dims == 2 ? A.w : A.cstep;
+                constants[3].i = transA;
+                constants[4].i = 0; // A
+
+                {
+                    const Pipeline* pipeline = pipeline_quantize;
+
+                    VkMat dispatcher;
+                    dispatcher.w = A_int8.w;
+                    dispatcher.h = A.dims == 2 ? A.h : A.c;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+        }
+    }
+
+    {
+        if (constantB == 1)
+        {
+            B_int8 = B;
+            B_int8_scale = B_data_int8_scales_gpu;
+        }
+        else
+        {
+            if (B.dims == 2)
+                B_int8.create(B.w, B.h, 1u, 1, opt.blob_vkallocator);
+            else
+                B_int8.create(B.w, B.h, B.c, 1u, 1, opt.blob_vkallocator);
+            B_int8_scale.create(1, 4u, 1, opt.blob_vkallocator);
+
+            {
+                std::vector<VkMat> bindings(2);
+                bindings[0] = B;
+                bindings[1] = B_int8_scale;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = B.w;
+                constants[1].i = B.dims == 2 ? B.h : B.c;
+                constants[2].i = B.dims == 2 ? B.w : B.cstep;
+                constants[3].i = transB;
+                constants[4].i = 1; // B
+
+                {
+                    const Pipeline* pipeline = pipeline_reduce_scale;
+
+                    VkMat dispatcher;
+                    dispatcher.w = 1;
+                    dispatcher.h = 1;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = B;
+                bindings[1] = B_int8;
+                bindings[2] = B_int8_scale;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = B.w;
+                constants[1].i = B.dims == 2 ? B.h : B.c;
+                constants[2].i = B.dims == 2 ? B.w : B.cstep;
+                constants[3].i = transB;
+                constants[4].i = 1; // B
+
+                {
+                    const Pipeline* pipeline = pipeline_quantize;
+
+                    VkMat dispatcher;
+                    dispatcher.w = B_int8.w;
+                    dispatcher.h = B.dims == 2 ? B.h : B.c;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+        }
+    }
+
+    {
+        std::vector<VkMat> bindings(6);
+        bindings[0] = top_blob;
+        bindings[1] = A_int8;
+        bindings[2] = B_int8;
+        bindings[3] = C;
+        bindings[4] = A_int8_scales;
+        bindings[5] = B_int8_scale;
+
+        std::vector<vk_constant_type> constants(10);
+        constants[0].i = M;
+        constants[1].i = N;
+        constants[2].i = K;
+        constants[3].i = broadcast_type_C;
+        constants[4].i = A.dims;
+        constants[5].i = A.dims == 3 ? A.cstep : transA ? M : K;
+        constants[6].i = B.dims;
+        constants[7].i = B.dims == 3 ? B.cstep : transB ? K : N;
+        constants[8].i = top_blob.dims;
+        constants[9].i = top_blob.dims == 3 ? top_blob.cstep : top_blob.w;
+
+        {
+            const Pipeline* pipeline = pipeline_gemm_int8;
+
+            VkMat dispatcher;
+            dispatcher.w = (N + 1) / 2;
+            dispatcher.h = (M + 1) / 2;
+            dispatcher.c = 1;
+            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+        }
+    }
+
+    int out_elempack = 1;
+    {
+        int outh = output_transpose ? N : M;
+        out_elempack = outh % 4 == 0 ? 4 : 1;
+    }
+    if (output_elempack)
+        out_elempack = output_elempack;
+
+    if (out_elempack != 1)
+    {
+        VkMat top_blob0;
+        vkdev->convert_packing(top_blob, top_blob0, out_elempack, cmd, opt);
+        top_blobs[0] = top_blob0;
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h
index 2ea90162f08..e40d15f1a3b 100644
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -24,6 +24,10 @@ class Gemm_vulkan : public Gemm
     virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+#ifdef NCNN_INT8
+    int forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+#endif
+
 public:
     Mat A_data_packed;
     Mat B_data_packed;
@@ -35,6 +39,15 @@ class Gemm_vulkan : public Gemm
 
     Pipeline* pipeline_gemm;
 
+#ifdef NCNN_INT8
+    Pipeline* pipeline_gemm_int8;
+    Pipeline* pipeline_reduce_scale;
+    Pipeline* pipeline_quantize;
+
+    VkMat A_data_int8_scales_gpu;
+    VkMat B_data_int8_scales_gpu;
+#endif
+
     // cooperative matrix
     bool use_cooperative_matrix;
     int coopmat_M;
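Reviewer note (illustrative, not part of the patch): the three new compute shaders below implement a dynamic int8 GEMM. gemm_reduce_scale.comp derives per-row absmax scales for A and a single per-tensor scale for B, gemm_quantize.comp quantizes both operands to int8, and gemm_int8.comp accumulates in int32 and dequantizes while applying C, alpha and beta. A minimal CPU reference of the dequantization math, assuming row-major fp32 data and a hypothetical helper name gemm_int8_ref:

    // CPU reference of the scheme used by gemm_int8.comp (illustrative only).
    // A_q: M x K int8, B_q: K x N int8, both row-major.
    // A_scales[i] = 127 / absmax(row i of A), B_scale = 127 / absmax(B).
    // C may be null; when present it is an MxN bias already broadcast to full size.
    #include <cstdint>

    static void gemm_int8_ref(const int8_t* A_q, const int8_t* B_q, const float* C,
                              const float* A_scales, float B_scale,
                              int M, int N, int K, float alpha, float beta, float* out)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                int32_t sum = 0;
                for (int k = 0; k < K; k++)
                {
                    sum += int32_t(A_q[i * K + k]) * int32_t(B_q[k * N + j]);
                }

                // the int32 accumulator approximates A_scales[i] * B_scale * dot(A_i, B_j),
                // so dividing by the two scales recovers the fp32 product
                float v = float(sum) / (A_scales[i] * B_scale);

                if (C)
                    v += beta * C[i * N + j];

                out[i * N + j] = alpha * v;
            }
        }
    }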
diff --git a/src/layer/vulkan/shader/gemm_int8.comp b/src/layer/vulkan/shader/gemm_int8.comp
new file mode 100644
index 00000000000..5ab1cee02bf
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8.comp
@@ -0,0 +1,428 @@
+// Copyright 2025 pchar.cn
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+#extension GL_EXT_integer_dot_product : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (constant_id = 0) const float alpha = 1.f;
+layout (constant_id = 1) const float beta = 1.f;
+layout (constant_id = 2) const int transA = 0;
+layout (constant_id = 3) const int transB = 0;
+layout (constant_id = 4) const int constantA = 0;
+layout (constant_id = 5) const int constantB = 0;
+layout (constant_id = 6) const int constantC = 0;
+layout (constant_id = 7) const int M = 0;
+layout (constant_id = 8) const int N = 0;
+layout (constant_id = 9) const int K = 0;
+layout (constant_id = 10) const int constant_broadcast_type_C = 0;
+layout (constant_id = 11) const int output_N1M = 0;
+layout (constant_id = 12) const int output_elempack = 0;
+layout (constant_id = 13) const int output_elemtype = 0;
+layout (constant_id = 14) const int output_transpose = 0;
+
+// TODO psc more
+
+layout (binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 1) readonly buffer A_blob { sint8 A_blob_data[]; };
+layout (binding = 2) readonly buffer B_blob { sint8 B_blob_data[]; };
+layout (binding = 3) readonly buffer C_blob { sfp C_blob_data[]; };
+layout (binding = 4) readonly buffer A_scales { float A_scales_data[]; };
+layout (binding = 5) readonly buffer B_scales { float B_scales_data[]; };
+
+#if NCNN_shader_local_memory
+shared int8_t tmp_a[8][LOCAL_MEMORY_UNROLL_INCH][2];
+shared int8_t tmp_b[8][LOCAL_MEMORY_UNROLL_INCH][2];
+#endif
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int N;
+    int K;
+    int broadcast_type_C;
+    int A_dims;
+    int A_hstep;
+    int B_dims;
+    int B_hstep;
+    int outdims;
+    int outhstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x) * 2;
+    int gy = int(gl_GlobalInvocationID.y) * 2;
+    int gz = int(gl_GlobalInvocationID.z);
+
+#if !NCNN_shader_local_memory
+    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
+        return;
+#endif
+
+    int sum0 = 0;
+    int sum1 = 0;
+    int sum2 = 0;
+    int sum3 = 0;
+
+#if NCNN_shader_local_memory
+    const int NN = psc(K);
+
+    const int lx = int(gl_LocalInvocationID.x);
+    const int ly = int(gl_LocalInvocationID.y);
+
+    int k = 0;
+    for (; k + (LOCAL_MEMORY_UNROLL_INCH - 1) < NN; k += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        {
+            if (transA == 1)
+            {
+                const int ai = (k + lx) * p.A_hstep + gy;
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + 1];
+            }
+            else
+            {
+                const int ai = gy * p.A_hstep + (k + lx);
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + p.A_hstep];
+            }
+
+            if (transB == 1)
+            {
+                const int bi = gx * p.B_hstep + (k + ly);
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + p.B_hstep];
+            }
+            else
+            {
+                const int bi = (k + ly) * p.B_hstep + gx;
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + 1];
+            }
+        }
+
+        barrier();
+
+        int k4 = 0;
+        for (; k4 + 3 < LOCAL_MEMORY_UNROLL_INCH; k4+=4)
+        {
+            i8vec4 a0 = i8vec4(
+                tmp_a[ly][k4 + 0][0],
+                tmp_a[ly][k4 + 1][0],
+                tmp_a[ly][k4 + 2][0],
+                tmp_a[ly][k4 + 3][0]
+            );
+            i8vec4 a1 = i8vec4(
+                tmp_a[ly][k4 + 0][1],
+                tmp_a[ly][k4 + 1][1],
+                tmp_a[ly][k4 + 2][1],
+                tmp_a[ly][k4 + 3][1]
+            );
+            i8vec4 b0 = i8vec4(
+                tmp_b[lx][k4 + 0][0],
+                tmp_b[lx][k4 + 1][0],
+                tmp_b[lx][k4 + 2][0],
+                tmp_b[lx][k4 + 3][0]
+            );
+            i8vec4 b1 = i8vec4(
+                tmp_b[lx][k4 + 0][1],
+                tmp_b[lx][k4 + 1][1],
+                tmp_b[lx][k4 + 2][1],
+                tmp_b[lx][k4 + 3][1]
+            );
+
+            sum0 += dotEXT(a0,b0);
+            sum1 += dotEXT(a0,b1);
+            sum2 += dotEXT(a1,b0);
+            sum3 += dotEXT(a1,b1);
+        }
+        for (; k4 < LOCAL_MEMORY_UNROLL_INCH; k4++)
+        {
+            int a0 = tmp_a[ly][k4][0];
+            int a1 = tmp_a[ly][k4][1];
+            int b0 = tmp_b[lx][k4][0];
+            int b1 = tmp_b[lx][k4][1];
+
+            sum0 += a0 * b0;
+            sum1 += a0 * b1;
+            sum2 += a1 * b0;
+            sum3 += a1 * b1;
+        }
+        barrier();
+    }
+
+    if (k < NN)
+    {
+        const int remain = NN - k;
+
+        if (lx < remain)
+        {
+            if (transA == 1)
+            {
+                const int ai = (k + lx) * p.A_hstep + gy;
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + 1];
+            }
+            else
+            {
+                const int ai = gy * p.A_hstep + (k + lx);
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + p.A_hstep];
+            }
+        }
+
+        if (ly < remain)
+        {
+            if (transB == 1)
+            {
+                const int bi = gx * p.B_hstep + (k + ly);
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + p.B_hstep];
+            }
+            else
+            {
+                const int bi = (k + ly) * p.B_hstep + gx;
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + 1];
+            }
+        }
+
+        barrier();
+
+        for (int k4 = 0; k4 < remain; k4++)
+        {
+            int a0 = tmp_a[ly][k4][0];
+            int a1 = tmp_a[ly][k4][1];
+            int b0 = tmp_b[lx][k4][0];
+            int b1 = tmp_b[lx][k4][1];
+
+            sum0 += a0 * b0;
+            sum1 += a0 * b1;
+            sum2 += a1 * b0;
+            sum3 += a1 * b1;
+        }
+    }
+#else
+    int k = 0;
+
+    for (; k + 3 < psc(K); k+=4)
+    {
+        i8vec4 a0;
+        i8vec4 a1;
+        i8vec4 b0;
+        i8vec4 b1;
+        if (transA == 1)
+        {
+            const int ai = k * p.A_hstep + gy;
+            a0 = i8vec4(
+                A_blob_data[ai],
+                A_blob_data[ai + p.A_hstep],
+                A_blob_data[ai + p.A_hstep * 2],
+                A_blob_data[ai + p.A_hstep * 3]
+            );
+            // a0 = A_blob_data[ai];
+            a1 = i8vec4(
+                A_blob_data[ai + 1],
+                A_blob_data[ai + p.A_hstep + 1],
+                A_blob_data[ai + p.A_hstep * 2 + 1],
+                A_blob_data[ai + p.A_hstep * 3 + 1]
+            );
+            // a1 = A_blob_data[ai + 1];
+        }
+        else
+        {
+            const int ai = gy * p.A_hstep + k;
+            a0 = i8vec4(
+                A_blob_data[ai],
+                A_blob_data[ai + 1],
+                A_blob_data[ai + 2],
+                A_blob_data[ai + 3]
+            );
+            // a0 = A_blob_data[ai];
+            a1 = i8vec4(
+                A_blob_data[ai + p.A_hstep],
+                A_blob_data[ai + p.A_hstep + 1],
+                A_blob_data[ai + p.A_hstep + 2],
+                A_blob_data[ai + p.A_hstep + 3]
+            );
+            // a1 = A_blob_data[ai + p.A_hstep];
+        }
+
+        if (transB == 1)
+        {
+            const int bi = gx * p.B_hstep + k;
+            b0 = i8vec4(
+                B_blob_data[bi],
+                B_blob_data[bi + 1],
+                B_blob_data[bi + 2],
+                B_blob_data[bi + 3]
+            );
+            // b0 = B_blob_data[bi];
+            b1 = i8vec4(
+                B_blob_data[bi + p.B_hstep],
+                B_blob_data[bi + p.B_hstep + 1],
+                B_blob_data[bi + p.B_hstep + 2],
+                B_blob_data[bi + p.B_hstep + 3]
+            );
+            // b1 = B_blob_data[bi + p.B_hstep];
+        }
+        else
+        {
+            const int bi = k * p.B_hstep + gx;
+            b0 = i8vec4(
+                B_blob_data[bi],
+                B_blob_data[bi + p.B_hstep],
+                B_blob_data[bi + p.B_hstep * 2],
+                B_blob_data[bi + p.B_hstep * 3]
+            );
+            // b0 = B_blob_data[bi];
+            b1 = i8vec4(
+                B_blob_data[bi + 1],
+                B_blob_data[bi + p.B_hstep + 1],
+                B_blob_data[bi + p.B_hstep * 2 + 1],
+                B_blob_data[bi + p.B_hstep * 3 + 1]
+            );
+            // b1 = B_blob_data[bi + 1];
+        }
+
+        // https://github.com/KhronosGroup/GLSL/blob/main/extensions/ext/GLSL_EXT_integer_dot_product.txt
+        sum0 += dotEXT(a0,b0);
+        sum1 += dotEXT(a0,b1);
+        sum2 += dotEXT(a1,b0);
+        sum3 += dotEXT(a1,b1);
+    }
+
+    for (; k < psc(K); k++)
+    {
+        int a0;
+        int a1;
+        int b0;
+        int b1;
+        if (transA == 1)
+        {
+            const int ai = k * p.A_hstep + gy;
+            a0 = A_blob_data[ai];
+            a1 = A_blob_data[ai + 1];
+        }
+        else
+        {
+            const int ai = gy * p.A_hstep + k;
+            a0 = A_blob_data[ai];
+            a1 = A_blob_data[ai + p.A_hstep];
+        }
+
+        if (transB == 1)
+        {
+            const int bi = gx * p.B_hstep + k;
+            b0 = B_blob_data[bi];
+            b1 = B_blob_data[bi + p.B_hstep];
+        }
+        else
+        {
+            const int bi = k * p.B_hstep + gx;
+            b0 = B_blob_data[bi];
+            b1 = B_blob_data[bi + 1];
+        }
+
+        sum0 += a0 * b0;
+        sum1 += a0 * b1;
+        sum2 += a1 * b0;
+        sum3 += a1 * b1;
+    }
+#endif
+
+#if NCNN_shader_local_memory
+    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
+        return;
+#endif
+
+    afp fsum0 = afp(0.f);
+    afp fsum1 = afp(0.f);
+    afp fsum2 = afp(0.f);
+    afp fsum3 = afp(0.f);
+
+    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;
+
+    if (broadcast_type_C == 0)
+    {
+        fsum0 = buffer_ld1(C_blob_data, 0);
+        fsum1 = fsum0;
+        fsum2 = fsum0;
+        fsum3 = fsum0;
+    }
+    if (broadcast_type_C == 1 || broadcast_type_C == 2)
+    {
+        fsum0 = buffer_ld1(C_blob_data, gy);
+        fsum1 = fsum0;
+        fsum2 = buffer_ld1(C_blob_data, gy + 1);
+        fsum3 = fsum2;
+    }
+    if (broadcast_type_C == 3)
+    {
+        const int ci = gy * psc(N) + gx;
+        fsum0 = buffer_ld1(C_blob_data, ci);
+        fsum1 = buffer_ld1(C_blob_data, ci + 1);
+        fsum2 = buffer_ld1(C_blob_data, ci + psc(N));
+        fsum3 = buffer_ld1(C_blob_data, ci + psc(N) + 1);
+    }
+    if (broadcast_type_C == 4)
+    {
+        fsum0 = buffer_ld1(C_blob_data, gx);
+        fsum1 = buffer_ld1(C_blob_data, gx + 1);
+        fsum2 = fsum0;
+        fsum3 = fsum1;
+    }
+
+    fsum0 *= afp(beta);
+    fsum1 *= afp(beta);
+    fsum2 *= afp(beta);
+    fsum3 *= afp(beta);
+
+    float B_int8_scale = B_scales_data[0];
+    float A_int8_scale0 = A_scales_data[gy];
+    float A_int8_scale1 = A_scales_data[gy + 1];
+
+    float scale0 = A_int8_scale0 * B_int8_scale;
+    float scale1 = A_int8_scale1 * B_int8_scale;
+
+    fsum0 += afp(float(sum0) / scale0);
+    fsum1 += afp(float(sum1) / scale0);
+    fsum2 += afp(float(sum2) / scale1);
+    fsum3 += afp(float(sum3) / scale1);
+
+    fsum0 *= afp(alpha);
+    fsum1 *= afp(alpha);
+    fsum2 *= afp(alpha);
+    fsum3 *= afp(alpha);
+
+    if (output_transpose == 1)
+    {
+        const int gi = gx * p.outhstep + gy;
+
+        buffer_st1(top_blob_data, gi, fsum0);
+        if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + 1, fsum2);
+        if (gx + 1 < psc(N))
+        {
+            buffer_st1(top_blob_data, gi + p.outhstep, afp(fsum1));
+            if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + p.outhstep + 1, fsum3);
+        }
+    }
+    else
+    {
+        const int gi = gy * p.outhstep + gx;
+
+        buffer_st1(top_blob_data, gi, fsum0);
+        if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + 1, fsum1);
+        if (gy + 1 < psc(M))
+        {
+            buffer_st1(top_blob_data, gi + p.outhstep, fsum2);
+            if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + p.outhstep + 1, fsum3);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gemm_quantize.comp b/src/layer/vulkan/shader/gemm_quantize.comp
new file mode 100644
index 00000000000..1f15123ff3f
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize.comp
@@ -0,0 +1,31 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (binding = 0) readonly buffer in_blob { sfp in_blob_data[]; };
+layout (binding = 1) writeonly buffer out_blob { sint8 out_blob_data[]; };
+layout (binding = 2) readonly buffer scales_blob { float scales_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int hstep;
+    int trans;
+    int is_B;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+
+    if (gx >= p.w || gy >= p.h)
+        return;
+
+    afp v = buffer_ld1(in_blob_data, gy * p.hstep + gx);
+    float scale = scales_blob_data[p.is_B == 0 ? (p.trans == 0 ? gy : gx) : 0];
+    sint8 q = sint8(clamp(round(float(v) * scale), -127.f, 127.f));
+    i8buffer_st1(out_blob_data, gy * p.hstep + gx, q);
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gemm_reduce_scale.comp b/src/layer/vulkan/shader/gemm_reduce_scale.comp
new file mode 100644
index 00000000000..d94fc814882
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_reduce_scale.comp
@@ -0,0 +1,66 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (binding = 0) readonly buffer in_blob { sfp in_blob_data[]; };
+layout (binding = 1) writeonly buffer out_scales { float out_scales_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int hstep;
+    int trans;
+    int is_B;
+} p;
+
+void main()
+{
+    int gi = int(gl_GlobalInvocationID.x);
+
+    if ((gi >= p.h) && (p.is_B == 0) && (p.trans == 0))
+        return;
+    if ((gi >= p.w) && (p.is_B == 0) && (p.trans == 1))
+        return;
+    if ((p.is_B == 1) && (gi >= 1))
+        return;
+
+    float absmax = 0.f;
+
+    if (p.is_B == 0)
+    {
+        if (p.trans == 0)
+        {
+            for (int i = 0; i < p.w; i++)
+            {
+                int idx = gi * p.hstep + i;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+        else
+        {
+            for (int i = 0; i < p.h; i++)
+            {
+                int idx = i * p.hstep + gi;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+    }
+    else
+    {
+        // TODO: better reduction for B matrix
+        for (int i = 0; i < p.h; i++)
+        {
+            for (int j = 0; j < p.w; j++)
+            {
+                int idx = i * p.hstep + j;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+    }
+
+    out_scales_data[gi] = absmax < 1e-6f ? 1.f : 127.f / absmax;
+}
\ No newline at end of file
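Reviewer note (illustrative, not part of the patch): for comparison with gemm_reduce_scale.comp and gemm_quantize.comp, the per-row dynamic quantization they perform boils down to the following CPU sketch (hypothetical helper name quantize_row_ref; the 1e-6 guard mirrors the shader's fallback scale of 1.0 for all-zero rows):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one row of length w and return its scale (127 / absmax).
    static float quantize_row_ref(const float* row, int w, int8_t* q)
    {
        float absmax = 0.f;
        for (int i = 0; i < w; i++)
            absmax = std::max(absmax, std::fabs(row[i]));

        const float scale = absmax < 1e-6f ? 1.f : 127.f / absmax;

        for (int i = 0; i < w; i++)
        {
            // round-to-nearest, then clamp to the symmetric int8 range [-127, 127]
            float v = std::round(row[i] * scale);
            q[i] = (int8_t)std::min(std::max(v, -127.f), 127.f);
        }

        return scale;
    }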