Skip to content

Commit

Permalink
fix(server): fix quantization python requirements (huggingface#708)
Browse files — view the repository at this point in history
OlivierDehaene authored Jul 27, 2023
1 parent e64a658 commit 8bd0adb
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 21 deletions.
14 changes: 13 additions & 1 deletion server/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions server/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ tokenizers = "0.13.3"
huggingface-hub = "^0.14.1"
transformers = "4.29.2"
einops = "^0.6.1"
texttable = "^1.6.7"

[tool.poetry.extras]
accelerate = ["accelerate"]
Expand Down
1 change: 1 addition & 0 deletions server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ requests==2.31.0 ; python_version >= "3.9" and python_version < "4.0"
safetensors==0.3.1 ; python_version >= "3.9" and python_version < "4.0"
sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "4.0"
setuptools==68.0.0 ; python_version >= "3.9" and python_version < "4.0"
texttable==1.6.7 ; python_version >= "3.9" and python_version < "4.0"
tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "4.0"
tqdm==4.65.0 ; python_version >= "3.9" and python_version < "4.0"
transformers==4.29.2 ; python_version >= "3.9" and python_version < "4.0"
Expand Down
1 change: 0 additions & 1 deletion server/text_generation_server/models/flash_rw.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __init__(
if config.quantize == "gptq":
weights._set_gptq_params(model_id)


model = FlashRWForCausalLM(config, weights)

torch.distributed.barrier(group=self.process_group)
Expand Down
26 changes: 13 additions & 13 deletions server/text_generation_server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,21 @@ async def Decode(self, request, context):


def serve(
model_id: str,
revision: Optional[str],
sharded: bool,
quantize: Optional[str],
dtype: Optional[str],
trust_remote_code: bool,
uds_path: Path,
model_id: str,
revision: Optional[str],
sharded: bool,
quantize: Optional[str],
dtype: Optional[str],
trust_remote_code: bool,
uds_path: Path,
):
async def serve_inner(
model_id: str,
revision: Optional[str],
sharded: bool = False,
quantize: Optional[str] = None,
dtype: Optional[str] = None,
trust_remote_code: bool = False,
model_id: str,
revision: Optional[str],
sharded: bool = False,
quantize: Optional[str] = None,
dtype: Optional[str] = None,
trust_remote_code: bool = False,
):
unix_socket_template = "unix://{}-{}"
if sharded:
Expand Down
8 changes: 2 additions & 6 deletions server/text_generation_server/utils/gptq/quantize.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
import argparse
import time
import numpy as np
import torch
import torch.nn as nn
import math
import json
import os
import torch
import transformers

from texttable import Texttable
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import transformers
from huggingface_hub import HfApi
import numpy as np
import torch
from accelerate import init_empty_weights
from text_generation_server.utils import initialize_torch_distributed, Weights
from text_generation_server.utils.hub import weight_files
Expand Down

0 comments on commit 8bd0adb

Please sign in to comment.