diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp
index bcc38060e5c..5ee1bfe5d40 100644
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -4,6 +4,7 @@
 #include "gemm_vulkan.h"
 
 #include "layer_shader_type.h"
+#include "pipelinecache.h"
 
 namespace ncnn {
 
@@ -30,7 +31,9 @@ int Gemm_vulkan::load_param(const ParamDict& pd)
 
     if (int8_scale_term)
     {
+#ifndef NCNN_INT8
         support_vulkan = false;
+#endif
     }
 
     return ret;
@@ -69,6 +72,79 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
         C_data_packed = C_data;
     }
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        {
+            std::vector<vk_specialization_type> specializations(15);
+            specializations[0].f = alpha;
+            specializations[1].f = beta;
+            specializations[2].i = transA;
+            specializations[3].i = transB;
+            specializations[4].i = constantA;
+            specializations[5].i = constantB;
+            specializations[6].i = constantC;
+            specializations[7].i = constantM;
+            specializations[8].i = constantN;
+            specializations[9].i = constantK;
+            specializations[10].i = constant_broadcast_type_C;
+            specializations[11].i = output_N1M;
+            specializations[12].i = output_elempack;
+            specializations[13].i = output_elemtype;
+            specializations[14].i = output_transpose;
+
+            Mat local_size_xyz;
+            // if (shape_packed.dims == 2)
+            // {
+            //     local_size_xyz.w = std::min(8, shape_packed.w);
+            //     local_size_xyz.h = std::min(8, shape_packed.h);
+            //     local_size_xyz.c = 1;
+            // }
+
+            // pack1
+            // if (shape.dims == 0 || elempack == 1)
+            {
+                pipeline_gemm_int8 = new Pipeline(vkdev);
+                pipeline_gemm_int8->set_optimal_local_size_xyz(local_size_xyz);
+                if (opt.use_shader_local_memory)
+                {
+                    pipeline_gemm_int8->set_local_size_xyz(8, 8, 1);
+                }
+                pipeline_gemm_int8->create(LayerShaderType::gemm_int8, opt, specializations);
+            }
+        }
+        {
+            std::vector<vk_specialization_type> specializations(0);
+
+            Mat local_size_xyz;
+            {
+                pipeline_reduce_scale = new Pipeline(vkdev);
+                pipeline_reduce_scale->set_optimal_local_size_xyz(local_size_xyz);
+                pipeline_reduce_scale->create(LayerShaderType::gemm_reduce_scale, opt, specializations);
+            }
+        }
+        {
+            std::vector<vk_specialization_type> specializations(0);
+
+            Mat local_size_xyz;
+            {
+                pipeline_quantize = new Pipeline(vkdev);
+                pipeline_quantize->set_optimal_local_size_xyz(local_size_xyz);
+                pipeline_quantize->create(LayerShaderType::gemm_quantize, opt, specializations);
+            }
+        }
+
+        if (opt.lightmode)
+        {
+            A_data.release();
+            B_data.release();
+            C_data.release();
+        }
+
+        return 0;
+    }
+#endif
+
     use_cooperative_matrix = vkdev->info.support_cooperative_matrix() && opt.use_cooperative_matrix && (opt.use_fp16_storage || opt.use_fp16_packed);
 
     if (use_cooperative_matrix)
@@ -176,6 +252,20 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_gemm;
     pipeline_gemm = 0;
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        delete pipeline_gemm_int8;
+        pipeline_gemm_int8 = 0;
+
+        delete pipeline_reduce_scale;
+        pipeline_reduce_scale = 0;
+
+        delete pipeline_quantize;
+        pipeline_quantize = 0;
+    }
+#endif
+
     return 0;
 }
 
@@ -184,7 +274,6 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     if (constantA)
     {
         cmd.record_upload(A_data_packed, A_data_gpu, opt);
-
         A_data_packed.release();
     }
 
@@ -202,11 +291,41 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
         C_data_packed.release();
     }
 
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        Option opt_unpack_fp32 = opt;
+        opt_unpack_fp32.use_fp16_storage = false;
+        opt_unpack_fp32.use_fp16_packed = false;
+        opt_unpack_fp32.use_fp16_arithmetic = false;
+        opt_unpack_fp32.use_bf16_storage = false;
+
+        if (constantA == 1)
+        {
+            cmd.record_upload(A_data_int8_scales, A_data_int8_scales_gpu, opt_unpack_fp32);
+            A_data_int8_scales.release();
+        }
+
+        if (constantB == 1)
+        {
+            Mat B_data_int8_scales(1);
+            B_data_int8_scales[0] = B_data_int8_scale;
+            cmd.record_upload(B_data_int8_scales, B_data_int8_scales_gpu, opt_unpack_fp32);
+        }
+    }
+#endif
+
     return 0;
 }
 
 int Gemm_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
+#ifdef NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blobs, top_blobs, cmd, opt);
+    }
+#endif
     const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
     const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
 
@@ -376,4 +495,284 @@ int Gemm_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
     return ret;
 }
 
+int Gemm_vulkan::forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
+    const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
+
+    VkMat A;
+    VkMat B;
+    vkdev->convert_packing(A0, A, 1, cmd, opt);
+    vkdev->convert_packing(B0, B, 1, cmd, opt);
+
+    const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h);
+    const int K = constantK ? constantK : transA ? (A.dims == 3 ? A.c : A.h) : A.w;
+    const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w;
+
+    VkMat C;
+    int broadcast_type_C = -1;
+    if (constantC && constant_broadcast_type_C != -1)
+    {
+        vkdev->convert_packing(C_data_gpu, C, 1, cmd, opt);
+        broadcast_type_C = constant_broadcast_type_C;
+    }
+    else
+    {
+        VkMat C0;
+        if (constantA && constantB)
+        {
+            C0 = bottom_blobs.size() == 1 ? bottom_blobs[0] : VkMat();
+        }
+        else if (constantA)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else if (constantB)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else
+        {
+            C0 = bottom_blobs.size() == 3 ? bottom_blobs[2] : VkMat();
+        }
+
+        if (!C0.empty())
+        {
+            vkdev->convert_packing(C0, C, 1, cmd, opt);
+
+            if (C.dims == 1 && C.w == 1)
+            {
+                // scalar
+                broadcast_type_C = 0;
+            }
+            if (C.dims == 1 && C.w == M)
+            {
+                // M
+                // auto broadcast from h to w is the ncnn-style convention
+                broadcast_type_C = 1;
+            }
+            if (C.dims == 1 && C.w == N)
+            {
+                // N
+                broadcast_type_C = 4;
+            }
+            if (C.dims == 2 && C.w == 1 && C.h == M)
+            {
+                // Mx1
+                broadcast_type_C = 2;
+            }
+            if (C.dims == 2 && C.w == N && C.h == M)
+            {
+                // MxN
+                broadcast_type_C = 3;
+            }
+            if (C.dims == 2 && C.w == N && C.h == 1)
+            {
+                // 1xN
+                broadcast_type_C = 4;
+            }
+        }
+    }
+
+    int elempack = A.elempack;
+    size_t elemsize = (opt.use_fp16_storage || opt.use_bf16_storage) ? 2 : 4;
+
+    VkMat& top_blob = top_blobs[0];
+    if (output_transpose)
+    {
+        if (output_N1M)
+            top_blob.create(M, 1, N, elemsize, opt.blob_vkallocator);
+        else
+            top_blob.create(M, N, elemsize, opt.blob_vkallocator);
+    }
+    else
+    {
+        if (output_N1M)
+            top_blob.create(N, 1, M, elemsize, opt.blob_vkallocator);
+        else
+            top_blob.create(N, M, elemsize, opt.blob_vkallocator);
+    }
+    if (top_blob.empty())
+        return -100;
+
+    VkMat A_int8, A_int8_scales;
+    VkMat B_int8, B_int8_scale;
+
+    {
+        if (constantA == 1)
+        {
+            A_int8 = A;
+            A_int8_scales = A_data_int8_scales_gpu;
+        }
+        else
+        {
+            if (A.dims == 2)
+                A_int8.create(A.w, A.h, 1u, 1, opt.blob_vkallocator);
+            else
+                A_int8.create(A.w, A.h, A.c, 1u, 1, opt.blob_vkallocator);
+            if (transA)
+                A_int8_scales.create(A.w, 4u, 1, opt.blob_vkallocator);
+            else
+                A_int8_scales.create(A.dims == 2 ? A.h : A.c, 4u, 1, opt.blob_vkallocator);
+
+            {
+                std::vector<VkMat> bindings(2);
+                bindings[0] = A;
+                bindings[1] = A_int8_scales;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = A.w;
+                constants[1].i = A.dims == 2 ? A.h : A.c;
+                constants[2].i = A.dims == 2 ? A.w : A.cstep;
+                constants[3].i = transA;
+                constants[4].i = 0; // A
+
+                {
+                    const Pipeline* pipeline = pipeline_reduce_scale;
+
+                    VkMat dispatcher;
+                    dispatcher.w = A_int8_scales.w;
+                    dispatcher.h = 1;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = A;
+                bindings[1] = A_int8;
+                bindings[2] = A_int8_scales;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = A.w;
+                constants[1].i = A.dims == 2 ? A.h : A.c;
+                constants[2].i = A.dims == 2 ? A.w : A.cstep;
+                constants[3].i = transA;
+                constants[4].i = 0; // A
+
+                {
+                    const Pipeline* pipeline = pipeline_quantize;
+
+                    VkMat dispatcher;
+                    dispatcher.w = A_int8.w;
+                    dispatcher.h = A.dims == 2 ? A.h : A.c;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+        }
+    }
+
+    {
+        if (constantB == 1)
+        {
+            B_int8 = B;
+            B_int8_scale = B_data_int8_scales_gpu;
+        }
+        else
+        {
+            if (B.dims == 2)
+                B_int8.create(B.w, B.h, 1u, 1, opt.blob_vkallocator);
+            else
+                B_int8.create(B.w, B.h, B.c, 1u, 1, opt.blob_vkallocator);
+            B_int8_scale.create(1, 4u, 1, opt.blob_vkallocator);
+
+            {
+                std::vector<VkMat> bindings(2);
+                bindings[0] = B;
+                bindings[1] = B_int8_scale;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = B.w;
+                constants[1].i = B.dims == 2 ? B.h : B.c;
+                constants[2].i = B.dims == 2 ? B.w : B.cstep;
+                constants[3].i = transB;
+                constants[4].i = 1; // B
+
+                {
+                    const Pipeline* pipeline = pipeline_reduce_scale;
+
+                    VkMat dispatcher;
+                    dispatcher.w = 1;
+                    dispatcher.h = 1;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = B;
+                bindings[1] = B_int8;
+                bindings[2] = B_int8_scale;
+
+                std::vector<vk_constant_type> constants(5);
+                constants[0].i = B.w;
+                constants[1].i = B.dims == 2 ? B.h : B.c;
+                constants[2].i = B.dims == 2 ? B.w : B.cstep;
+                constants[3].i = transB;
+                constants[4].i = 1; // B
+
+                {
+                    const Pipeline* pipeline = pipeline_quantize;
+
+                    VkMat dispatcher;
+                    dispatcher.w = B_int8.w;
+                    dispatcher.h = B.dims == 2 ? B.h : B.c;
+                    dispatcher.c = 1;
+                    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+                }
+            }
+        }
+    }
+
+    {
+        std::vector<VkMat> bindings(6);
+        bindings[0] = top_blob;
+        bindings[1] = A_int8;
+        bindings[2] = B_int8;
+        bindings[3] = C;
+        bindings[4] = A_int8_scales;
+        bindings[5] = B_int8_scale;
+
+        std::vector<vk_constant_type> constants(10);
+        constants[0].i = M;
+        constants[1].i = N;
+        constants[2].i = K;
+        constants[3].i = broadcast_type_C;
+        constants[4].i = A.dims;
+        constants[5].i = A.dims == 3 ? A.cstep : transA ? M : K;
+        constants[6].i = B.dims;
+        constants[7].i = B.dims == 3 ? B.cstep : transB ? K : N;
+        constants[8].i = top_blob.dims;
+        constants[9].i = top_blob.dims == 3 ? top_blob.cstep : top_blob.w;
+
+        {
+            const Pipeline* pipeline = pipeline_gemm_int8;
+
+            VkMat dispatcher;
+            dispatcher.w = (N + 1) / 2;
+            dispatcher.h = (M + 1) / 2;
+            dispatcher.c = 1;
+            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+        }
+    }
+
+    int out_elempack = 1;
+    {
+        int outh = output_transpose ? N : M;
+        out_elempack = outh % 4 == 0 ? 4 : 1;
+    }
+    if (output_elempack)
+        out_elempack = output_elempack;
+
+    if (out_elempack != 1)
+    {
+        VkMat top_blob0;
+        vkdev->convert_packing(top_blob, top_blob0, out_elempack, cmd, opt);
+        top_blobs[0] = top_blob0;
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h
index 2ea90162f08..e40d15f1a3b 100644
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -24,6 +24,10 @@ class Gemm_vulkan : public Gemm
     virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+#ifdef NCNN_INT8
+    int forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+#endif
+
 public:
     Mat A_data_packed;
     Mat B_data_packed;
@@ -35,6 +39,15 @@ class Gemm_vulkan : public Gemm
 
     Pipeline* pipeline_gemm;
 
+#ifdef NCNN_INT8
+    Pipeline* pipeline_gemm_int8;
+    Pipeline* pipeline_reduce_scale;
+    Pipeline* pipeline_quantize;
+
+    VkMat A_data_int8_scales_gpu;
+    VkMat B_data_int8_scales_gpu;
+#endif
+
     // cooperative matrix
     bool use_cooperative_matrix;
     int coopmat_M;
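Reviewer note (illustrative, not part of the patch): the three new compute shaders below implement a dynamic int8 GEMM. gemm_reduce_scale.comp derives per-row absmax scales for A and a single per-tensor scale for B, gemm_quantize.comp quantizes both operands to int8, and gemm_int8.comp accumulates in int32 and dequantizes while applying C, alpha and beta. A minimal CPU reference of the dequantization math, assuming row-major fp32 data and a hypothetical helper name gemm_int8_ref:

    // CPU reference of the scheme used by gemm_int8.comp (illustrative only).
    // A_q: M x K int8, B_q: K x N int8, both row-major.
    // A_scales[i] = 127 / absmax(row i of A), B_scale = 127 / absmax(B).
    // C may be null; when present it is an MxN bias already broadcast to full size.
    #include <cstdint>

    static void gemm_int8_ref(const int8_t* A_q, const int8_t* B_q, const float* C,
                              const float* A_scales, float B_scale,
                              int M, int N, int K, float alpha, float beta, float* out)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                int32_t sum = 0;
                for (int k = 0; k < K; k++)
                {
                    sum += int32_t(A_q[i * K + k]) * int32_t(B_q[k * N + j]);
                }

                // the int32 accumulator approximates A_scales[i] * B_scale * dot(A_i, B_j),
                // so dividing by the two scales recovers the fp32 product
                float v = float(sum) / (A_scales[i] * B_scale);

                if (C)
                    v += beta * C[i * N + j];

                out[i * N + j] = alpha * v;
            }
        }
    }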
diff --git a/src/layer/vulkan/shader/gemm_int8.comp b/src/layer/vulkan/shader/gemm_int8.comp
new file mode 100644
index 00000000000..5ab1cee02bf
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8.comp
@@ -0,0 +1,428 @@
+// Copyright 2025 pchar.cn
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+#extension GL_EXT_integer_dot_product : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (constant_id = 0) const float alpha = 1.f;
+layout (constant_id = 1) const float beta = 1.f;
+layout (constant_id = 2) const int transA = 0;
+layout (constant_id = 3) const int transB = 0;
+layout (constant_id = 4) const int constantA = 0;
+layout (constant_id = 5) const int constantB = 0;
+layout (constant_id = 6) const int constantC = 0;
+layout (constant_id = 7) const int M = 0;
+layout (constant_id = 8) const int N = 0;
+layout (constant_id = 9) const int K = 0;
+layout (constant_id = 10) const int constant_broadcast_type_C = 0;
+layout (constant_id = 11) const int output_N1M = 0;
+layout (constant_id = 12) const int output_elempack = 0;
+layout (constant_id = 13) const int output_elemtype = 0;
+layout (constant_id = 14) const int output_transpose = 0;
+
+// TODO psc more
+
+layout (binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 1) readonly buffer A_blob { sint8 A_blob_data[]; };
+layout (binding = 2) readonly buffer B_blob { sint8 B_blob_data[]; };
+layout (binding = 3) readonly buffer C_blob { sfp C_blob_data[]; };
+layout (binding = 4) readonly buffer A_scales { float A_scales_data[]; };
+layout (binding = 5) readonly buffer B_scales { float B_scales_data[]; };
+
+#if NCNN_shader_local_memory
+shared int8_t tmp_a[8][LOCAL_MEMORY_UNROLL_INCH][2];
+shared int8_t tmp_b[8][LOCAL_MEMORY_UNROLL_INCH][2];
+#endif
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int N;
+    int K;
+    int broadcast_type_C;
+    int A_dims;
+    int A_hstep;
+    int B_dims;
+    int B_hstep;
+    int outdims;
+    int outhstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x) * 2;
+    int gy = int(gl_GlobalInvocationID.y) * 2;
+    int gz = int(gl_GlobalInvocationID.z);
+
+#if !NCNN_shader_local_memory
+    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
+        return;
+#endif
+
+    int sum0 = 0;
+    int sum1 = 0;
+    int sum2 = 0;
+    int sum3 = 0;
+
+#if NCNN_shader_local_memory
+    const int NN = psc(K);
+
+    const int lx = int(gl_LocalInvocationID.x);
+    const int ly = int(gl_LocalInvocationID.y);
+
+    int k = 0;
+    for (; k + (LOCAL_MEMORY_UNROLL_INCH - 1) < NN; k += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        {
+            if (transA == 1)
+            {
+                const int ai = (k + lx) * p.A_hstep + gy;
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + 1];
+            }
+            else
+            {
+                const int ai = gy * p.A_hstep + (k + lx);
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + p.A_hstep];
+            }
+
+            if (transB == 1)
+            {
+                const int bi = gx * p.B_hstep + (k + ly);
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + p.B_hstep];
+            }
+            else
+            {
+                const int bi = (k + ly) * p.B_hstep + gx;
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + 1];
+            }
+        }
+
+        barrier();
+
+        int k4 = 0;
+        for (; k4 + 3 < LOCAL_MEMORY_UNROLL_INCH; k4+=4)
+        {
+            i8vec4 a0 = i8vec4(
+                tmp_a[ly][k4 + 0][0],
+                tmp_a[ly][k4 + 1][0],
+                tmp_a[ly][k4 + 2][0],
+                tmp_a[ly][k4 + 3][0]
+            );
+            i8vec4 a1 = i8vec4(
+                tmp_a[ly][k4 + 0][1],
+                tmp_a[ly][k4 + 1][1],
+                tmp_a[ly][k4 + 2][1],
+                tmp_a[ly][k4 + 3][1]
+            );
+            i8vec4 b0 = i8vec4(
+                tmp_b[lx][k4 + 0][0],
+                tmp_b[lx][k4 + 1][0],
+                tmp_b[lx][k4 + 2][0],
+                tmp_b[lx][k4 + 3][0]
+            );
+            i8vec4 b1 = i8vec4(
+                tmp_b[lx][k4 + 0][1],
+                tmp_b[lx][k4 + 1][1],
+                tmp_b[lx][k4 + 2][1],
+                tmp_b[lx][k4 + 3][1]
+            );
+
+            sum0 += dotEXT(a0,b0);
+            sum1 += dotEXT(a0,b1);
+            sum2 += dotEXT(a1,b0);
+            sum3 += dotEXT(a1,b1);
+        }
+        for (; k4 < LOCAL_MEMORY_UNROLL_INCH; k4++)
+        {
+            int a0 = tmp_a[ly][k4][0];
+            int a1 = tmp_a[ly][k4][1];
+            int b0 = tmp_b[lx][k4][0];
+            int b1 = tmp_b[lx][k4][1];
+
+            sum0 += a0 * b0;
+            sum1 += a0 * b1;
+            sum2 += a1 * b0;
+            sum3 += a1 * b1;
+        }
+        barrier();
+    }
+
+    if (k < NN)
+    {
+        const int remain = NN - k;
+
+        if (lx < remain)
+        {
+            if (transA == 1)
+            {
+                const int ai = (k + lx) * p.A_hstep + gy;
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + 1];
+            }
+            else
+            {
+                const int ai = gy * p.A_hstep + (k + lx);
+                tmp_a[ly][lx][0] = A_blob_data[ai];
+                tmp_a[ly][lx][1] = A_blob_data[ai + p.A_hstep];
+            }
+        }
+
+        if (ly < remain)
+        {
+            if (transB == 1)
+            {
+                const int bi = gx * p.B_hstep + (k + ly);
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + p.B_hstep];
+            }
+            else
+            {
+                const int bi = (k + ly) * p.B_hstep + gx;
+                tmp_b[lx][ly][0] = B_blob_data[bi];
+                tmp_b[lx][ly][1] = B_blob_data[bi + 1];
+            }
+        }
+
+        barrier();
+
+        for (int k4 = 0; k4 < remain; k4++)
+        {
+            int a0 = tmp_a[ly][k4][0];
+            int a1 = tmp_a[ly][k4][1];
+            int b0 = tmp_b[lx][k4][0];
+            int b1 = tmp_b[lx][k4][1];
+
+            sum0 += a0 * b0;
+            sum1 += a0 * b1;
+            sum2 += a1 * b0;
+            sum3 += a1 * b1;
+        }
+    }
+#else
+    int k = 0;
+
+    for (; k + 3 < psc(K); k+=4)
+    {
+        i8vec4 a0;
+        i8vec4 a1;
+        i8vec4 b0;
+        i8vec4 b1;
+        if (transA == 1)
+        {
+            const int ai = k * p.A_hstep + gy;
+            a0 = i8vec4(
+                A_blob_data[ai],
+                A_blob_data[ai + p.A_hstep],
+                A_blob_data[ai + p.A_hstep * 2],
+                A_blob_data[ai + p.A_hstep * 3]
+            );
+            // a0 = A_blob_data[ai];
+            a1 = i8vec4(
+                A_blob_data[ai + 1],
+                A_blob_data[ai + p.A_hstep + 1],
+                A_blob_data[ai + p.A_hstep * 2 + 1],
+                A_blob_data[ai + p.A_hstep * 3 + 1]
+            );
+            // a1 = A_blob_data[ai + 1];
+        }
+        else
+        {
+            const int ai = gy * p.A_hstep + k;
+            a0 = i8vec4(
+                A_blob_data[ai],
+                A_blob_data[ai + 1],
+                A_blob_data[ai + 2],
+                A_blob_data[ai + 3]
+            );
+            // a0 = A_blob_data[ai];
+            a1 = i8vec4(
+                A_blob_data[ai + p.A_hstep],
+                A_blob_data[ai + p.A_hstep + 1],
+                A_blob_data[ai + p.A_hstep + 2],
+                A_blob_data[ai + p.A_hstep + 3]
+            );
+            // a1 = A_blob_data[ai + p.A_hstep];
+        }
+
+        if (transB == 1)
+        {
+            const int bi = gx * p.B_hstep + k;
+            b0 = i8vec4(
+                B_blob_data[bi],
+                B_blob_data[bi + 1],
+                B_blob_data[bi + 2],
+                B_blob_data[bi + 3]
+            );
+            // b0 = B_blob_data[bi];
+            b1 = i8vec4(
+                B_blob_data[bi + p.B_hstep],
+                B_blob_data[bi + p.B_hstep + 1],
+                B_blob_data[bi + p.B_hstep + 2],
+                B_blob_data[bi + p.B_hstep + 3]
+            );
+            // b1 = B_blob_data[bi + p.B_hstep];
+        }
+        else
+        {
+            const int bi = k * p.B_hstep + gx;
+            b0 = i8vec4(
+                B_blob_data[bi],
+                B_blob_data[bi + p.B_hstep],
+                B_blob_data[bi + p.B_hstep * 2],
+                B_blob_data[bi + p.B_hstep * 3]
+            );
+            // b0 = B_blob_data[bi];
+            b1 = i8vec4(
+                B_blob_data[bi + 1],
+                B_blob_data[bi + p.B_hstep + 1],
+                B_blob_data[bi + p.B_hstep * 2 + 1],
+                B_blob_data[bi + p.B_hstep * 3 + 1]
+            );
+            // b1 = B_blob_data[bi + 1];
+        }
+
+        // https://github.com/KhronosGroup/GLSL/blob/main/extensions/ext/GLSL_EXT_integer_dot_product.txt
+        sum0 += dotEXT(a0,b0);
+        sum1 += dotEXT(a0,b1);
+        sum2 += dotEXT(a1,b0);
+        sum3 += dotEXT(a1,b1);
+    }
+
+    for (; k < psc(K); k++)
+    {
+        int a0;
+        int a1;
+        int b0;
+        int b1;
+        if (transA == 1)
+        {
+            const int ai = k * p.A_hstep + gy;
+            a0 = A_blob_data[ai];
+            a1 = A_blob_data[ai + 1];
+        }
+        else
+        {
+            const int ai = gy * p.A_hstep + k;
+            a0 = A_blob_data[ai];
+            a1 = A_blob_data[ai + p.A_hstep];
+        }
+
+        if (transB == 1)
+        {
+            const int bi = gx * p.B_hstep + k;
+            b0 = B_blob_data[bi];
+            b1 = B_blob_data[bi + p.B_hstep];
+        }
+        else
+        {
+            const int bi = k * p.B_hstep + gx;
+            b0 = B_blob_data[bi];
+            b1 = B_blob_data[bi + 1];
+        }
+
+        sum0 += a0 * b0;
+        sum1 += a0 * b1;
+        sum2 += a1 * b0;
+        sum3 += a1 * b1;
+    }
+#endif
+
+#if NCNN_shader_local_memory
+    if (gx >= psc(N) || gy >= psc(M) || gz >= 1)
+        return;
+#endif
+
+    afp fsum0 = afp(0.f);
+    afp fsum1 = afp(0.f);
+    afp fsum2 = afp(0.f);
+    afp fsum3 = afp(0.f);
+
+    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;
+
+    if (broadcast_type_C == 0)
+    {
+        fsum0 = buffer_ld1(C_blob_data, 0);
+        fsum1 = fsum0;
+        fsum2 = fsum0;
+        fsum3 = fsum0;
+    }
+    if (broadcast_type_C == 1 || broadcast_type_C == 2)
+    {
+        fsum0 = buffer_ld1(C_blob_data, gy);
+        fsum1 = fsum0;
+        fsum2 = buffer_ld1(C_blob_data, gy + 1);
+        fsum3 = fsum2;
+    }
+    if (broadcast_type_C == 3)
+    {
+        const int ci = gy * psc(N) + gx;
+        fsum0 = buffer_ld1(C_blob_data, ci);
+        fsum1 = buffer_ld1(C_blob_data, ci + 1);
+        fsum2 = buffer_ld1(C_blob_data, ci + psc(N));
+        fsum3 = buffer_ld1(C_blob_data, ci + psc(N) + 1);
+    }
+    if (broadcast_type_C == 4)
+    {
+        fsum0 = buffer_ld1(C_blob_data, gx);
+        fsum1 = buffer_ld1(C_blob_data, gx + 1);
+        fsum2 = fsum0;
+        fsum3 = fsum1;
+    }
+
+    fsum0 *= afp(beta);
+    fsum1 *= afp(beta);
+    fsum2 *= afp(beta);
+    fsum3 *= afp(beta);
+
+    float B_int8_scale = B_scales_data[0];
+    float A_int8_scale0 = A_scales_data[gy];
+    float A_int8_scale1 = A_scales_data[gy + 1];
+
+    float scale0 = A_int8_scale0 * B_int8_scale;
+    float scale1 = A_int8_scale1 * B_int8_scale;
+
+    fsum0 += afp(float(sum0) / scale0);
+    fsum1 += afp(float(sum1) / scale0);
+    fsum2 += afp(float(sum2) / scale1);
+    fsum3 += afp(float(sum3) / scale1);
+
+    fsum0 *= afp(alpha);
+    fsum1 *= afp(alpha);
+    fsum2 *= afp(alpha);
+    fsum3 *= afp(alpha);
+
+    if (output_transpose == 1)
+    {
+        const int gi = gx * p.outhstep + gy;
+
+        buffer_st1(top_blob_data, gi, fsum0);
+        if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + 1, fsum2);
+        if (gx + 1 < psc(N))
+        {
+            buffer_st1(top_blob_data, gi + p.outhstep, afp(fsum1));
+            if (gy + 1 < psc(M)) buffer_st1(top_blob_data, gi + p.outhstep + 1, fsum3);
+        }
+    }
+    else
+    {
+        const int gi = gy * p.outhstep + gx;
+
+        buffer_st1(top_blob_data, gi, fsum0);
+        if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + 1, fsum1);
+        if (gy + 1 < psc(M))
+        {
+            buffer_st1(top_blob_data, gi + p.outhstep, fsum2);
+            if (gx + 1 < psc(N)) buffer_st1(top_blob_data, gi + p.outhstep + 1, fsum3);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gemm_quantize.comp b/src/layer/vulkan/shader/gemm_quantize.comp
new file mode 100644
index 00000000000..1f15123ff3f
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize.comp
@@ -0,0 +1,31 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (binding = 0) readonly buffer in_blob { sfp in_blob_data[]; };
+layout (binding = 1) writeonly buffer out_blob { sint8 out_blob_data[]; };
+layout (binding = 2) readonly buffer scales_blob { float scales_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int hstep;
+    int trans;
+    int is_B;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+
+    if (gx >= p.w || gy >= p.h)
+        return;
+
+    afp v = buffer_ld1(in_blob_data, gy * p.hstep + gx);
+    float scale = scales_blob_data[p.is_B == 0 ? (p.trans == 0 ? gy : gx) : 0];
+    sint8 q = sint8(clamp(round(float(v) * scale), -127.f, 127.f));
+    i8buffer_st1(out_blob_data, gy * p.hstep + gx, q);
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gemm_reduce_scale.comp b/src/layer/vulkan/shader/gemm_reduce_scale.comp
new file mode 100644
index 00000000000..d94fc814882
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_reduce_scale.comp
@@ -0,0 +1,66 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable
+
+layout (binding = 0) readonly buffer in_blob { sfp in_blob_data[]; };
+layout (binding = 1) writeonly buffer out_scales { float out_scales_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int hstep;
+    int trans;
+    int is_B;
+} p;
+
+void main()
+{
+    int gi = int(gl_GlobalInvocationID.x);
+
+    if ((gi >= p.h) && (p.is_B == 0) && (p.trans == 0))
+        return;
+    if ((gi >= p.w) && (p.is_B == 0) && (p.trans == 1))
+        return;
+    if ((p.is_B == 1) && (gi >= 1))
+        return;
+
+    float absmax = 0.f;
+
+    if (p.is_B == 0)
+    {
+        if (p.trans == 0)
+        {
+            for (int i = 0; i < p.w; i++)
+            {
+                int idx = gi * p.hstep + i;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+        else
+        {
+            for (int i = 0; i < p.h; i++)
+            {
+                int idx = i * p.hstep + gi;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+    }
+    else
+    {
+        // TODO: better reduction for B matrix
+        for (int i = 0; i < p.h; i++)
+        {
+            for (int j = 0; j < p.w; j++)
+            {
+                int idx = i * p.hstep + j;
+                float v = buffer_ld1(in_blob_data, idx);
+                absmax = max(absmax, abs(v));
+            }
+        }
+    }
+
+    out_scales_data[gi] = absmax < 1e-6f ? 1.f : 127.f / absmax;
+}
\ No newline at end of file
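Reviewer note (illustrative, not part of the patch): for comparison with gemm_reduce_scale.comp and gemm_quantize.comp, the per-row dynamic quantization they perform boils down to the following CPU sketch (hypothetical helper name quantize_row_ref; the 1e-6 guard mirrors the shader's fallback scale of 1.0 for all-zero rows):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one row of length w and return its scale (127 / absmax).
    static float quantize_row_ref(const float* row, int w, int8_t* q)
    {
        float absmax = 0.f;
        for (int i = 0; i < w; i++)
            absmax = std::max(absmax, std::fabs(row[i]));

        const float scale = absmax < 1e-6f ? 1.f : 127.f / absmax;

        for (int i = 0; i < w; i++)
        {
            // round-to-nearest, then clamp to the symmetric int8 range [-127, 127]
            float v = std::round(row[i] * scale);
            q[i] = (int8_t)std::min(std::max(v, -127.f), 127.f);
        }

        return scale;
    }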