From 18873d324efa53eee76dc711d87e80474df0af43 Mon Sep 17 00:00:00 2001
From: Marcin Antas <antas.marcin@gmail.com>
Date: Sat, 5 Oct 2024 21:07:14 +0200
Subject: [PATCH] Add support for nomic-ai/nomic-embed-text-v1.5 model

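nomic-ai/nomic-embed-text-v1.5 ships custom modeling code, so it has
to be downloaded and loaded through sentence-transformers with
trust_remote_code enabled. To support this:

- add TRUST_REMOTE_CODE and USE_SENTENCE_TRANSFORMERS_VECTORIZER as
  Docker build args and thread them through the CI matrix and the
  cicd scripts (both default to false),
- persist the model name, the trust_remote_code flag and the model
  config in the model directory at download time, so app.py restores
  the same settings at startup,
- pass trust_remote_code explicitly through Meta, Vectorizer and the
  HFModel hierarchy instead of reading a module-level constant,
- make pooling_strategy optional and accept an optional task_type in
  the vector input config, covering it in the smoke test.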
---
 .github/workflows/main.yaml |  5 +++
 Dockerfile                  |  2 +
 app.py                      | 24 ++++++++++--
 cicd/build.sh               |  4 ++
 cicd/docker_push.sh         |  5 +++
 download.py                 | 23 ++++++++++--
 meta.py                     | 15 +++++---
 smoke_test.py               | 10 ++++-
 vectorizer.py               | 74 +++++++++++++++++++++++++------------
 9 files changed, 124 insertions(+), 38 deletions(-)
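
Reviewer note, not part of the applied patch: a minimal sketch of
exercising the new optional task_type config against a running
container, in the style of smoke_test.py. Host and port are
assumptions (the container serves on 8080 by default); the response
shape is the one app.py returns.

    import requests

    # "search_document" is one of the task types the nomic model
    # card documents; omitting "config" keeps the old behaviour
    req_body = {
        "text": "The London Eye is a ferris wheel at the River Thames.",
        "config": {"task_type": "search_document"},
    }
    res = requests.post("http://localhost:8080/vectors", json=req_body)
    res_body = res.json()
    print(res_body["dim"], res_body["vector"][:3])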

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 39c3ed9..564449f 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -133,6 +133,10 @@ jobs:
           - model_name: Snowflake/snowflake-arctic-embed-m-v1.5
             model_tag_name: snowflake-snowflake-arctic-embed-m-v1.5
             onnx_runtime: true
+          - model_name: nomic-ai/nomic-embed-text-v1.5
+            model_tag_name: nomic-ai-nomic-embed-text-v1.5
+            use_sentence_transformers_vectorizer: true
+            trust_remote_code: true
     env:
       LOCAL_REPO: transformers-inference
       REMOTE_REPO: semitechnologies/transformers-inference
@@ -140,6 +144,7 @@ jobs:
       MODEL_TAG_NAME: ${{matrix.model_tag_name}}
       ONNX_RUNTIME: ${{matrix.onnx_runtime}}
       USE_SENTENCE_TRANSFORMERS_VECTORIZER: ${{matrix.use_sentence_transformers_vectorizer}}
+      TRUST_REMOTE_CODE: ${{matrix.trust_remote_code}}
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
diff --git a/Dockerfile b/Dockerfile
index 0fb30f5..2857bba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,8 @@ ARG TARGETARCH
 ARG MODEL_NAME
 ARG ONNX_RUNTIME
 ENV ONNX_CPU=${TARGETARCH}
+ARG TRUST_REMOTE_CODE
+ARG USE_SENTENCE_TRANSFORMERS_VECTORIZER
 RUN mkdir nltk_data
 COPY download.py .
 RUN ./download.py
diff --git a/app.py b/app.py
index ed593d4..94373df 100644
--- a/app.py
+++ b/app.py
@@ -1,6 +1,8 @@
 import os
 from logging import getLogger
 from fastapi import FastAPI, Response, status
+from typing import Tuple
+from config import TRUST_REMOTE_CODE
 from vectorizer import Vectorizer, VectorInput
 from meta import Meta
 
@@ -55,7 +57,7 @@ def startup_event():
 
     model_dir = "./models/model"
 
-    def get_model_directory() -> (str, bool):
+    def get_model_name() -> Tuple[str, bool]:
         if os.path.exists(f"{model_dir}/model_name"):
             with open(f"{model_dir}/model_name", "r") as f:
                 model_name = f.read()
@@ -70,6 +72,13 @@ def get_onnx_runtime() -> bool:
                 return onnx_runtime == "true"
         return False
 
+    def get_trust_remote_code() -> bool:
+        if os.path.exists(f"{model_dir}/trust_remote_code"):
+            with open(f"{model_dir}/trust_remote_code", "r") as f:
+                trust_remote_code = f.read()
+                return trust_remote_code == "true"
+        return TRUST_REMOTE_CODE
+
     def log_info_about_onnx(onnx_runtime: bool):
         if onnx_runtime:
             onnx_quantization_info = "missing"
@@ -80,11 +89,17 @@ def log_info_about_onnx(onnx_runtime: bool):
                 f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}"
             )
 
-    model_name, use_sentence_transformer_vectorizer = get_model_directory()
+    model_name, use_sentence_transformer_vectorizer = get_model_name()
     onnx_runtime = get_onnx_runtime()
+    trust_remote_code = get_trust_remote_code()
     log_info_about_onnx(onnx_runtime)
 
-    meta_config = Meta(model_dir, model_name, use_sentence_transformer_vectorizer)
+    meta_config = Meta(
+        model_dir,
+        model_name,
+        use_sentence_transformer_vectorizer,
+        trust_remote_code,
+    )
     vec = Vectorizer(
         model_dir,
         cuda_support,
@@ -96,6 +111,7 @@ def log_info_about_onnx(onnx_runtime: bool):
         onnx_runtime,
         use_sentence_transformer_vectorizer,
         model_name,
+        trust_remote_code,
     )
 
 
@@ -112,7 +128,7 @@ def meta():
 
 @app.post("/vectors")
 @app.post("/vectors/")
-async def read_item(item: VectorInput, response: Response):
+async def vectorize(item: VectorInput, response: Response):
     try:
         vector = await vec.vectorize(item.text, item.config)
         return {"text": item.text, "vector": vector.tolist(), "dim": len(vector)}
diff --git a/cicd/build.sh b/cicd/build.sh
index c705643..e4f7681 100755
--- a/cicd/build.sh
+++ b/cicd/build.sh
@@ -5,8 +5,12 @@ set -eou pipefail
 local_repo=${LOCAL_REPO?Variable LOCAL_REPO is required}
 model_name=${MODEL_NAME?Variable MODEL_NAME is required}
 onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required}
+trust_remote_code=${TRUST_REMOTE_CODE:-false}
+use_sentence_transformers_vectorizer=${USE_SENTENCE_TRANSFORMERS_VECTORIZER:-false}
 
 docker build \
   --build-arg "MODEL_NAME=$model_name" \
   --build-arg "ONNX_RUNTIME=$onnx_runtime" \
+  --build-arg "TRUST_REMOTE_CODE=$trust_remote_code" \
+  --build-arg "USE_SENTENCE_TRANSFORMERS_VECTORIZER=$use_sentence_transformers_vectorizer" \
   -t "$local_repo" .
diff --git a/cicd/docker_push.sh b/cicd/docker_push.sh
index 48535c6..c44156b 100755
--- a/cicd/docker_push.sh
+++ b/cicd/docker_push.sh
@@ -7,6 +7,8 @@ model_name=${MODEL_NAME?Variable MODEL_NAME is required}
 docker_username=${DOCKER_USERNAME?Variable DOCKER_USERNAME is required}
 docker_password=${DOCKER_PASSWORD?Variable DOCKER_PASSWORD is required}
 onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required}
+trust_remote_code=${TRUST_REMOTE_CODE:-false}
+use_sentence_transformers_vectorizer=${USE_SENTENCE_TRANSFORMERS_VECTORIZER:-false}
 original_model_name=$model_name
 git_tag=$GITHUB_REF_NAME
 
@@ -16,6 +18,7 @@ function main() {
   echo "git ref name is $GITHUB_REF_NAME"
   echo "git tag is $git_tag"
   echo "onnx_runtime is $onnx_runtime"
+  echo "trust_remote_code is $trust_remote_code"
   push_tag
 }
 
@@ -46,6 +49,8 @@ function push_tag() {
     docker buildx build --platform=linux/arm64,linux/amd64 \
       --build-arg "MODEL_NAME=$original_model_name" \
       --build-arg "ONNX_RUNTIME=$onnx_runtime" \
+      --build-arg "TRUST_REMOTE_CODE=$trust_remote_code" \
+      --build-arg "USE_SENTENCE_TRANSFORMERS_VECTORIZER=$use_sentence_transformers_vectorizer" \
       --push \
       --tag "$tag_git" \
       --tag "$tag_latest" \
diff --git a/download.py b/download.py
index 0b025a6..a7458da 100755
--- a/download.py
+++ b/download.py
@@ -3,6 +3,7 @@
 import os
 import sys
 import nltk
+import json
 from transformers import (
     AutoModel,
     AutoTokenizer,
@@ -98,6 +99,18 @@ def quantization_config(onnx_cpu_arch: str):
 
 
 def download_model(model_name: str, model_dir: str, trust_remote_code: bool = False):
+    def save_model_name(model_name: str):
+        with open(f"{model_dir}/model_name", "w") as f:
+            f.write(model_name)
+
+    def save_trust_remote_code(trust_remote_code: bool):
+        with open(f"{model_dir}/trust_remote_code", "w") as f:
+            f.write(f"{trust_remote_code}")
+
+    def save_model_config(model_config):
+        with open(f"{model_dir}/model_config", "w") as f:
+            f.write(json.dumps(model_config))
+
     print(
         f"Downloading model {model_name} from huggingface model hub ({trust_remote_code=})"
     )
@@ -107,9 +120,11 @@ def download_model(model_name: str, model_dir: str, trust_remote_code: bool = Fa
     if (
         model_type is not None and model_type == "t5"
     ) or use_sentence_transformers_vectorizer.lower() == "true":
-        SentenceTransformer(model_name, cache_folder=model_dir)
-        with open(f"{model_dir}/model_name", "w") as f:
-            f.write(model_name)
+        SentenceTransformer(
+            model_name, cache_folder=model_dir, trust_remote_code=trust_remote_code
+        )
+        save_model_name(model_name)
+        save_model_config(config.to_dict())
     else:
         if config.architectures and not force_automodel:
             print(f"Using class {config.architectures[0]} to load model weights")
@@ -136,6 +151,8 @@ def download_model(model_name: str, model_dir: str, trust_remote_code: bool = Fa
         model.save_pretrained(model_dir)
         tokenizer.save_pretrained(model_dir)
 
+    save_trust_remote_code(trust_remote_code)
+
     nltk.download("punkt", download_dir=nltk_dir)
     nltk.download("punkt_tab", download_dir=nltk_dir)
 
diff --git a/meta.py b/meta.py
index 1998866..cb67896 100644
--- a/meta.py
+++ b/meta.py
@@ -1,22 +1,25 @@
+import json
+import os
 from transformers import AutoConfig
 
-from config import TRUST_REMOTE_CODE
-
 
 class Meta:
-    config: AutoConfig
-
     def __init__(
         self,
         model_path: str,
         model_name: str,
         use_sentence_transformer_vectorizer: bool,
+        trust_remote_code: bool,
     ):
         if use_sentence_transformer_vectorizer:
-            self.config = {"model_name": model_name, "model_type": None}
+            if os.path.exists(f"{model_path}/model_config"):
+                with open(f"{model_path}/model_config", "r") as f:
+                    self.config = json.loads(f.read())
+            else:
+                self.config = {"model_name": model_name, "model_type": None}
         else:
             self.config = AutoConfig.from_pretrained(
-                model_path, trust_remote_code=TRUST_REMOTE_CODE
+                model_path, trust_remote_code=trust_remote_code
             ).to_dict()
 
     def get(self):
diff --git a/smoke_test.py b/smoke_test.py
index 46f57a4..f7d8f21 100755
--- a/smoke_test.py
+++ b/smoke_test.py
@@ -37,9 +37,15 @@ def test_meta(self):
         self.assertIsInstance(res.json(), dict)
 
     def test_vectorizing(self):
-        def try_to_vectorize(url):
-            print(f"url: {url}")
+        def get_req_body(task_type: str = ""):
             req_body = {"text": "The London Eye is a ferris wheel at the River Thames."}
+            if task_type != "":
+                req_body["config"] = {"task_type": task_type}
+            return req_body
+
+        def try_to_vectorize(url, task_type: str = ""):
+            print(f"url: {url}")
+            req_body = get_req_body(task_type)
 
             res = requests.post(url, json=req_body)
             resBody = res.json()
diff --git a/vectorizer.py b/vectorizer.py
index b284bde..b6888d6 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -20,7 +20,6 @@
     T5Tokenizer,
 )
 
-from config import TRUST_REMOTE_CODE
 
 # limit transformer batch size to limit parallel inference, otherwise we run
 # into memory problems
@@ -29,7 +28,8 @@
 
 
 class VectorInputConfig(BaseModel):
-    pooling_strategy: str
+    pooling_strategy: Optional[str] = None
+    task_type: Optional[str] = None
 
 
 class VectorInput(BaseModel):
@@ -52,14 +52,15 @@ def __init__(
         onnx_runtime: bool,
         use_sentence_transformer_vectorizer: bool,
         model_name: str,
+        trust_remote_code: bool,
     ):
         self.executor = ThreadPoolExecutor()
         if onnx_runtime:
-            self.vectorizer = ONNXVectorizer(model_path)
+            self.vectorizer = ONNXVectorizer(model_path, trust_remote_code)
         else:
             if model_type == "t5" or use_sentence_transformer_vectorizer:
                 self.vectorizer = SentenceTransformerVectorizer(
-                    model_path, model_name, cuda_core
+                    model_path, model_name, cuda_core, trust_remote_code
                 )
             else:
                 self.vectorizer = HuggingFaceVectorizer(
@@ -70,6 +71,7 @@ def __init__(
                     model_type,
                     architecture,
                     direct_tokenize,
+                    trust_remote_code,
                 )
 
     async def vectorize(self, text: str, config: VectorInputConfig):
@@ -82,10 +84,18 @@ class SentenceTransformerVectorizer:
     model: SentenceTransformer
     cuda_core: str
 
-    def __init__(self, model_path: str, model_name: str, cuda_core: str):
+    def __init__(
+        self, model_path: str, model_name: str, cuda_core: str, trust_remote_code: bool
+    ):
         self.cuda_core = cuda_core
+        print(
+            f"model_name={model_name}, cache_folder={model_path} device:{self.get_device()} trust_remote_code:{trust_remote_code}"
+        )
         self.model = SentenceTransformer(
-            model_name, cache_folder=model_path, device=self.get_device()
+            model_name,
+            cache_folder=model_path,
+            device=self.get_device(),
+            trust_remote_code=trust_remote_code,
         )
         self.model.eval()  # make sure we're in inference mode, not training
 
@@ -108,15 +118,15 @@ class ONNXVectorizer:
     model: ORTModelForFeatureExtraction
     tokenizer: AutoTokenizer
 
-    def __init__(self, model_path) -> None:
+    def __init__(self, model_path, trust_remote_code: bool) -> None:
         onnx_path = Path(model_path)
         self.model = ORTModelForFeatureExtraction.from_pretrained(
             onnx_path,
             file_name="model_quantized.onnx",
-            trust_remote_code=TRUST_REMOTE_CODE,
+            trust_remote_code=trust_remote_code,
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
-            onnx_path, trust_remote_code=TRUST_REMOTE_CODE
+            onnx_path, trust_remote_code=trust_remote_code
         )
 
     def mean_pooling(self, model_output, attention_mask):
@@ -155,6 +165,7 @@ class HuggingFaceVectorizer:
     cuda_core: str
     model_type: str
     direct_tokenize: bool
+    trust_remote_code: bool
 
     def __init__(
         self,
@@ -165,15 +176,17 @@ def __init__(
         model_type: str,
         architecture: str,
         direct_tokenize: bool,
+        trust_remote_code: bool,
     ):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
         self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type
         self.direct_tokenize = direct_tokenize
+        self.trust_remote_code = trust_remote_code
 
         self.model_delegate: HFModel = ModelFactory.model(
-            model_type, architecture, cuda_support, cuda_core
+            model_type, architecture, cuda_support, cuda_core, trust_remote_code
         )
         self.model = self.model_delegate.create_model(model_path)
 
@@ -246,22 +259,23 @@ def vectorize(self, text: str, config: VectorInputConfig):
 
 class HFModel:
 
-    def __init__(self, cuda_support: bool, cuda_core: str):
+    def __init__(self, cuda_support: bool, cuda_core: str, trust_remote_code: bool = False):
         super().__init__()
         self.model = None
         self.tokenizer = None
         self.cuda = cuda_support
         self.cuda_core = cuda_core
+        self.trust_remote_code = trust_remote_code
 
     def create_tokenizer(self, model_path):
         self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path, trust_remote_code=TRUST_REMOTE_CODE
+            model_path, trust_remote_code=self.trust_remote_code
         )
         return self.tokenizer
 
     def create_model(self, model_path):
         self.model = AutoModel.from_pretrained(
-            model_path, trust_remote_code=TRUST_REMOTE_CODE
+            model_path, trust_remote_code=self.trust_remote_code
         )
         return self.model
 
@@ -318,19 +332,26 @@ def pool_sum(self, embeddings, attention_mask):
 
 class DPRModel(HFModel):
 
-    def __init__(self, architecture: str, cuda_support: bool, cuda_core: str):
+    def __init__(
+        self,
+        architecture: str,
+        cuda_support: bool,
+        cuda_core: str,
+        trust_remote_code: bool,
+    ):
         super().__init__(cuda_support, cuda_core)
         self.model = None
         self.architecture = architecture
+        self.trust_remote_code = trust_remote_code
 
     def create_model(self, model_path):
         if self.architecture == "DPRQuestionEncoder":
             self.model = DPRQuestionEncoder.from_pretrained(
-                model_path, trust_remote_code=TRUST_REMOTE_CODE
+                model_path, trust_remote_code=self.trust_remote_code
             )
         else:
             self.model = DPRContextEncoder.from_pretrained(
-                model_path, trust_remote_code=TRUST_REMOTE_CODE
+                model_path, trust_remote_code=self.trust_remote_code
             )
         return self.model
 
@@ -344,22 +365,23 @@ def pool_embedding(self, batch_results, tokens, config: VectorInputConfig):
 
 class T5Model(HFModel):
 
-    def __init__(self, cuda_support: bool, cuda_core: str):
+    def __init__(self, cuda_support: bool, cuda_core: str, trust_remote_code: bool):
         super().__init__(cuda_support, cuda_core)
         self.model = None
         self.tokenizer = None
         self.cuda = cuda_support
         self.cuda_core = cuda_core
+        self.trust_remote_code = trust_remote_code
 
     def create_model(self, model_path):
         self.model = T5ForConditionalGeneration.from_pretrained(
-            model_path, trust_remote_code=TRUST_REMOTE_CODE
+            model_path, trust_remote_code=self.trust_remote_code
         )
         return self.model
 
     def create_tokenizer(self, model_path):
         self.tokenizer = T5Tokenizer.from_pretrained(
-            model_path, trust_remote_code=TRUST_REMOTE_CODE
+            model_path, trust_remote_code=self.trust_remote_code
         )
         return self.tokenizer
 
@@ -386,10 +408,16 @@ def get_batch_results(self, tokens, text):
 class ModelFactory:
 
     @staticmethod
-    def model(model_type, architecture, cuda_support: bool, cuda_core: str):
+    def model(
+        model_type,
+        architecture,
+        cuda_support: bool,
+        cuda_core: str,
+        trust_remote_code: bool,
+    ):
         if model_type == "t5":
-            return T5Model(cuda_support, cuda_core)
+            return T5Model(cuda_support, cuda_core, trust_remote_code)
         elif model_type == "dpr":
-            return DPRModel(architecture, cuda_support, cuda_core)
+            return DPRModel(architecture, cuda_support, cuda_core, trust_remote_code)
         else:
-            return HFModel(cuda_support, cuda_core)
+            return HFModel(cuda_support, cuda_core, trust_remote_code)
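
-- 
Reviewer note, not part of the applied patch: a standalone sketch of
the load path this change enables. The "search_document: " text
prefix follows the nomic model card and is not code in this repo.

    from sentence_transformers import SentenceTransformer

    # nomic-ai/nomic-embed-text-v1.5 ships custom modeling code, so
    # loading it requires trust_remote_code=True, which download.py
    # and SentenceTransformerVectorizer now pass through
    model = SentenceTransformer(
        "nomic-ai/nomic-embed-text-v1.5",
        trust_remote_code=True,
    )
    # v1.5 expects task-prefixed inputs at encode time
    emb = model.encode(["search_document: The London Eye is a ferris wheel."])
    print(emb.shape)  # (1, 768) for this model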