# # Fast inference with Infinity (mixedbread-ai/mxbai-rerank-large-v1)

import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "mixedbread-ai/mxbai-rerank-large-v1"

# ## Define a container image


# We want to create a Modal image with the model weights pre-saved to a directory. The benefit
# is that the container no longer has to re-download the model from Hugging Face - instead, it
# takes advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected, and the download step will not re-run.
def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()

# ### Image definition
# We'll start from a recommended Docker Hub CUDA image and install Infinity (`infinity_emb`).
# Then we'll use `run_function` to run the function defined above, ensuring the weights of
# the model are saved within the container image.
image = (
    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .pip_install(
        "wheel==0.43.0",
        "huggingface_hub==0.22.2",
        "hf-transfer==0.1.6",
        "torch==2.2.1",
        "sentence-transformers==2.7.0",
    )
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/monotykamary/infinity.git",
        "cd infinity/libs/infinity_emb && git checkout c8121b9e19fcd7658aa87aea2457979b07c9fd25 && pip install .[all]",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)

stub = Stub("infinity-mxbai-rerank-large-v1", image=image)
GPU_CONFIG = gpu.T4(count=1)


# Run the Infinity server on its default port, 7997, and expose it with Modal's `web_server`.
@stub.function(
    allow_concurrent_inputs=100,
    container_idle_timeout=60,
    gpu=GPU_CONFIG,
    secrets=[
        Secret.from_name("huggingface"),
        Secret.from_dotenv(),
    ],
)
@web_server(7997, startup_timeout=300)
def infinity_embeddings_server():
    # Launch Infinity in the background; `web_server` waits for port 7997 to accept connections.
    cmd = f"infinity_emb --model-name-or-path {BASE_MODEL}"
    subprocess.Popen(cmd, shell=True)
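
# ## Calling the reranker
# After `modal deploy`, Modal prints a public URL for `infinity_embeddings_server`. As a minimal
# sketch, the client below posts to Infinity's `/rerank` route - the URL is a placeholder you
# should replace with your own deployment's URL, and the exact response shape depends on the
# Infinity version pinned above:
#
#     import requests
#
#     resp = requests.post(
#         "https://<your-workspace>--infinity-mxbai-rerank-large-v1-infinity-embeddings-server.modal.run/rerank",
#         json={
#             "query": "Where can I get a good croissant?",
#             "documents": [
#                 "The bakery on 5th Street sells fresh croissants every morning.",
#                 "T4 GPUs are a cost-effective option for small models.",
#             ],
#         },
#     )
#     print(resp.json())  # relevance scores, one per document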