1+ #pragma once
2+
/// Dequantize eight packed 4-bit unsigned integers (one 32-bit word) into
/// eight fp16 values, returned as a uint4 holding four packed half2 registers.
///
/// Nibble -> output mapping (bit positions within `source`, per the masks and
/// the single >>8 shift below):
///   h[0] <- bits [ 3: 0], [19:16]   (BOTTOM_MASK of i4s)
///   h[1] <- bits [ 7: 4], [23:20]   (TOP_MASK of i4s, rescaled by 1/16)
///   h[2] <- bits [11: 8], [27:24]   (BOTTOM_MASK of i4s >> 8)
///   h[3] <- bits [15:12], [31:28]   (TOP_MASK of i4s >> 8)
///
/// Values are treated as unsigned [0, 15] (no -8 offset; see the commented-out
/// magic numbers below). Requires SM75+; asserts (traps) on older arch.
__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
  // lop3.b32 with immLut plus f16x2 sub/fma as used below need Turing or newer.
  assert(false);
#else
  uint4 result;

  // View the four half2 outputs as raw 32-bit registers for the inline PTX.
  uint32_t* h = reinterpret_cast<uint32_t*>(&result);
  uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);

  // LOP3 truth-table immediate for (a & b) | c: mask out one nibble pair and
  // merge in the fp16 exponent bits in a single instruction.
  static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
  static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
  static constexpr uint32_t TOP_MASK = 0x00f000f0;
  // 0x6400 is fp16 1024.0; ORing a 4-bit value into its mantissa produces the
  // fp16 number (1024 + value), which the sub/fma below undoes.
  static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;

  // Note that the entire sequence only requires 1 shift instruction. This is
  // thanks to the register packing format and the fact that we force our
  // integers to be unsigned, and account for this in the fp16 subtractions. In
  // addition, I exploit the fact that sub and fma have the same throughput in
  // order to convert elt_23 and elt_67 to fp16 without having to shift them to
  // the bottom bits before hand.

  // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW
  // dependency if we issue immediately before required.
  const uint32_t top_i4s = i4s >> 8;
  // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[0])
               : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
                 "n"(immLut));
  // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[1])
               : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
                 "n"(immLut));
  // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[2])
               : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
                 "n"(immLut));
  // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(h[3])
               : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
                 "n"(immLut));

  // I use inline PTX below because I am not sure if the compiler will emit
  // float2half instructions if I use the half2 ctor. In this case, I chose
  // performance reliability over code readability.

  // This is the half2 {1032, 1032} represented as an integer.
  // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
  // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
  static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
  // This is the half2 {1 / 16, 1 / 16} represented as an integer.
  static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
  // This is the half2 {-72, -72} represented as an integer.
  // static constexpr uint32_t NEG_72 = 0xd480d480;
  // Haotian: Let's use {-64, -64}.
  static constexpr uint32_t NEG_64 = 0xd400d400;

  // Finally, we construct the output numbers.
  // Convert elt_01: (1024 + v) - 1024 = v.
  asm volatile("sub.f16x2 %0, %1, %2;\n"
               : "=r"(h[0])
               : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
  // Convert elt_23: (1024 + 16*v) * 1/16 - 64 = v (nibbles sat in bits 7:4).
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
               : "=r"(h[1])
               : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
  // Convert elt_45
  asm volatile("sub.f16x2 %0, %1, %2;\n"
               : "=r"(h[2])
               : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
  // Convert elt_67
  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
               : "=r"(h[3])
               : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));

  return result;
#endif
  __builtin_unreachable();  // Suppress missing return statement warning
}