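Resolve review comments: fix typos in the w8a8 docs, drop the `n_bits` argument of `per_channel_quant` and `**kwargs` from the RMSNorm builders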

RunningLeon · RunningLeon · commit 2dba1a725521 · 2024-12-27T15:28:12.000+08:00
diff --git a/docs/zh_cn/quantization/w8a8.md b/docs/zh_cn/quantization/w8a8.md
@@ -13,12 +13,12 @@ pip install lmdeploy[all]
 进行 8-bit 权重量化需要经历以下三步：
 
 1. **权重平滑**：首先对语言模型的权重进行平滑处理，以便更好地进行量化。
-2. **模块替换**：使用 `QRSMNorm` 和 `QLinear` 模块替换原模型 `DecoderLayer` 中的 `RSMNorm` 模块和 `nn.Linear` 模块。`lmdeploy/pytorch/models/q_modules.py` 文件中定义了这些量化模块。
+2. **模块替换**：使用 `QRMSNorm` 和 `QLinear` 模块替换原模型 `DecoderLayer` 中的 `RMSNorm` 模块和 `nn.Linear` 模块。`lmdeploy/pytorch/models/q_modules.py` 文件中定义了这些量化模块。
 3. **保存量化模型**：完成上述必要的替换后，我们即可保存新的量化模型。
 
 lmdeploy 提供了命令行工具 `lmdeploy lite smooth_quant` 实现了以上三个步骤。并且其中命令行参数 `--quant-dtype` 可以用来控制是进行8-bit整数还是浮点数类型的量化。更多命令行工具使用方式，请执行 `lmdeploy lite smooth_quant --help` 查看。
 
-以下示例演示了进行in8或fp8的量化命令。
+以下示例演示了进行 int8 或 fp8 的量化命令。
 
 - int8
 
diff --git a/lmdeploy/pytorch/backends/cuda/norm.py b/lmdeploy/pytorch/backends/cuda/norm.py
@@ -9,7 +9,7 @@
 class TritonRMSNormImpl(RMSNormImpl):
     """triton RMS norm implementation."""
 
-    def __init__(self, hidden_size: int, eps: float = 1e-6, **kwargs):
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
         self.hidden_size = hidden_size
         self.eps = eps
 
@@ -30,6 +30,6 @@ class TritonRMSNormBuilder(RMSNormBuilder):
     """triton RMS norm implementation builder."""
 
     @staticmethod
-    def build(weight: torch.Tensor, eps: float = 1e-6, **kwargs):
+    def build(weight: torch.Tensor, eps: float = 1e-6):
         """build."""
-        return TritonRMSNormImpl(weight, eps, **kwargs)
+        return TritonRMSNormImpl(weight, eps)
diff --git a/lmdeploy/pytorch/backends/norm.py b/lmdeploy/pytorch/backends/norm.py
@@ -21,7 +21,7 @@ class RMSNormBuilder(ABC):
 
     @staticmethod
     @abstractmethod
-    def build(hidden_size: int, eps: float = 1e-6, **kwargs):
+    def build(hidden_size: int, eps: float = 1e-6):
         """build."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
@@ -14,14 +14,13 @@
     tl_round = tl.math.round
 
 
-def per_channel_quant(x: torch.Tensor, n_bits: int, dtype: torch.dtype):
+def per_channel_quant(x: torch.Tensor, dtype: torch.dtype):
     """Quantize the input tensor 'x' channel-wise using the given number of
     bits.
 
     Args:
         x (torch.Tensor): The input tensor to be quantized. Must be a
             2-dimensional tensor.
-        n_bits (int): The number of bits to use for quantization.
         dtype (torch.dtype): The data type to which the quantized tensor should
             be converted.
 
@@ -527,7 +526,7 @@ def linear_torch(x, b):
         return F.linear(x, b)
 
     linear_weight_quant, linear_scale = per_channel_quant(
-        linear_weight, 8, quant_dtype)
+        linear_weight, quant_dtype)
 
     rms_out, rms_scale = rms_norm_dynamic_quant(x,
                                                 rms_weight,
@@ -627,7 +626,7 @@ def y_fwd():
             quant_dtype = torch.float8_e5m2
 
         linear_weight_quant, linear_scale = per_channel_quant(
-            linear_weight, 8, quant_dtype)
+            linear_weight, quant_dtype)
 
         alpha = max(x.max().abs(), x.min().abs())
         if quant_dtype.is_floating_point:
diff --git a/lmdeploy/pytorch/models/q_modules.py b/lmdeploy/pytorch/models/q_modules.py
@@ -130,7 +130,7 @@ def from_float(cls,
                     quant_dtype=quant_dtype)
 
         if initialization:
-            weight_quant, scale = per_channel_quant(mod.weight.detach(), 8,
+            weight_quant, scale = per_channel_quant(mod.weight.detach(),
                                                     quant_dtype)
             q_mod.weight.data = weight_quant
             q_mod.scale = scale
diff --git a/lmdeploy/pytorch/nn/norm.py b/lmdeploy/pytorch/nn/norm.py
@@ -39,7 +39,12 @@ def __init__(self,
             builder = backend.get_layer_impl_builder(OpType.RMSNorm)
         self.register_parameter('weight',
                                 self.create_weight(hidden_size, dtype, device))
-        self.impl = builder.build(hidden_size, eps, quant_dtype=quant_dtype)
+        if w8a8_flag:
+            self.impl = builder.build(hidden_size,
+                                      eps,
+                                      quant_dtype=quant_dtype)
+        else:
+            self.impl = builder.build(hidden_size, eps)
 
     @staticmethod
     def create_weight(hidden_size: int,