
Commit 7139575

feat: add mxbai and set api keys from dotenv
committed, 1 parent 349bb85

9 files changed: +186 -31 lines

.env.example

+2
@@ -0,0 +1,2 @@
+INFINITY_API_KEY=""
+VLLM_API_KEY=""

devbox.json

+1 -1

@@ -7,7 +7,7 @@
   "shell": {
     "init_hook": [
       ". $VENV_DIR/bin/activate",
-      "pip install modal",
+      "pip install modal python-dotenv",
       "modal profile activate default"
     ],
     "scripts": {

infinity_mxbai_embed_large_v1.py

+80
@@ -0,0 +1,80 @@
+# # Fast inference with Infinity (mixedbread-ai/mxbai-embed-large-v1)
+
+import os
+import subprocess
+import secrets
+
+from modal import Image, Secret, Stub, enter, gpu, method, web_server
+
+MODEL_DIR = "/model"
+BASE_MODEL = "mixedbread-ai/mxbai-embed-large-v1"
+
+# ## Define a container image
+
+
+# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
+# is that the container no longer has to re-download the model from Huggingface - instead, it will take
+# advantage of Modal's internal filesystem for faster cold starts.
+#
+# ### Download the weights
+# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
+#
+# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
+def download_model_to_folder():
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    os.makedirs(MODEL_DIR, exist_ok=True)
+
+    snapshot_download(
+        BASE_MODEL,
+        local_dir=MODEL_DIR,
+        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
+    )
+    move_cache()
+
+
+# ### Image definition
+# We'll start from a recommended Docker Hub image and install Infinity.
+# Then we'll use `run_function` to run the function defined above to ensure the weights of
+# the model are saved within the container image.
+image = (
+    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
+    .pip_install(
+        "wheel==0.43.0",
+        "huggingface_hub==0.22.2",
+        "hf-transfer==0.1.6",
+        "torch==2.2.1",
+    )
+    .apt_install("git")
+    .run_commands(
+        "git clone https://github.com/monotykamary/infinity.git",
+        "cd infinity/libs/infinity_emb && git checkout c8121b9e19fcd7658aa87aea2457979b07c9fd25 && pip install .[all]",
+    )
+    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .run_function(
+        download_model_to_folder,
+        secrets=[Secret.from_name("huggingface")],
+        timeout=60 * 20,
+    )
+)
+
+stub = Stub("infinity-mxbai-embed-large-v1", image=image)
+GPU_CONFIG = gpu.T4(count=1)
+
+
+# Run a web server on port 7997 and expose Infinity's OpenAI-compatible embeddings server
+@stub.function(
+    allow_concurrent_inputs=100,
+    container_idle_timeout=60,
+    gpu=GPU_CONFIG,
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
+)
+@web_server(7997, startup_timeout=300)
+def infinity_embeddings_server():
+    cmd = f"infinity_emb --model-name-or-path {BASE_MODEL}"
+    subprocess.Popen(cmd, shell=True)
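
Once deployed, the Infinity server speaks the OpenAI-compatible embeddings protocol on port 7997. A hedged client sketch follows: the URL is a placeholder for whatever Modal prints on deploy, and the bearer auth assumes the server enforces the `INFINITY_API_KEY` set via `.env`.

import os

import requests

# Placeholder URL; substitute the one Modal prints on deploy.
URL = "https://<workspace>--infinity-mxbai-embed-large-v1.modal.run/embeddings"

resp = requests.post(
    URL,
    headers={"Authorization": f"Bearer {os.environ['INFINITY_API_KEY']}"},
    json={
        "model": "mixedbread-ai/mxbai-embed-large-v1",
        "input": ["Modal makes GPU deployments simple."],
    },
)
resp.raise_for_status()
print(resp.json()["data"][0]["embedding"][:8])  # first few dimensions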

infinity_mxbai_rerank_large_v1.py

+81
@@ -0,0 +1,81 @@
+# # Fast inference with Infinity (mixedbread-ai/mxbai-rerank-large-v1)
+
+import os
+import subprocess
+import secrets
+
+from modal import Image, Secret, Stub, enter, gpu, method, web_server
+
+MODEL_DIR = "/model"
+BASE_MODEL = "mixedbread-ai/mxbai-rerank-large-v1"
+
+# ## Define a container image
+
+
+# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
+# is that the container no longer has to re-download the model from Huggingface - instead, it will take
+# advantage of Modal's internal filesystem for faster cold starts.
+#
+# ### Download the weights
+# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
+#
+# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
+def download_model_to_folder():
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    os.makedirs(MODEL_DIR, exist_ok=True)
+
+    snapshot_download(
+        BASE_MODEL,
+        local_dir=MODEL_DIR,
+        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
+    )
+    move_cache()
+
+
+# ### Image definition
+# We'll start from a recommended Docker Hub image and install Infinity.
+# Then we'll use `run_function` to run the function defined above to ensure the weights of
+# the model are saved within the container image.
+image = (
+    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
+    .pip_install(
+        "wheel==0.43.0",
+        "huggingface_hub==0.22.2",
+        "hf-transfer==0.1.6",
+        "torch==2.2.1",
+        "sentence-transformers==2.7.0",
+    )
+    .apt_install("git")
+    .run_commands(
+        "git clone https://github.com/monotykamary/infinity.git",
+        "cd infinity/libs/infinity_emb && git checkout c8121b9e19fcd7658aa87aea2457979b07c9fd25 && pip install .[all]",
+    )
+    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .run_function(
+        download_model_to_folder,
+        secrets=[Secret.from_name("huggingface")],
+        timeout=60 * 20,
+    )
+)
+
+stub = Stub("infinity-mxbai-rerank-large-v1", image=image)
+GPU_CONFIG = gpu.T4(count=1)
+
+
+# Run a web server on port 7997 and expose Infinity's OpenAI-compatible server
+@stub.function(
+    allow_concurrent_inputs=100,
+    container_idle_timeout=60,
+    gpu=GPU_CONFIG,
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
+)
+@web_server(7997, startup_timeout=300)
+def infinity_embeddings_server():
+    cmd = f"infinity_emb --model-name-or-path {BASE_MODEL}"
+    subprocess.Popen(cmd, shell=True)
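
The rerank deployment follows the same shape but is queried through Infinity's rerank route. A sketch under the same assumptions: placeholder URL, and request field names that may vary across Infinity versions.

import os

import requests

# Placeholder URL; substitute the one Modal prints on deploy.
URL = "https://<workspace>--infinity-mxbai-rerank-large-v1.modal.run/rerank"

resp = requests.post(
    URL,
    headers={"Authorization": f"Bearer {os.environ['INFINITY_API_KEY']}"},
    json={
        "query": "What does Modal do?",
        "documents": [
            "Modal runs containers and GPUs in the cloud on demand.",
            "Rerankers reorder candidate passages by relevance to a query.",
        ],
    },
)
resp.raise_for_status()
print(resp.json())  # documents scored by relevance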

infinity_snowflake_arctic_embed_l_335m.py

+6 -6

@@ -9,8 +9,6 @@
 MODEL_DIR = "/model"
 BASE_MODEL = "Snowflake/snowflake-arctic-embed-l"
 
-api_key = secrets.token_urlsafe()
-
 # ## Define a container image
 
 
@@ -51,7 +49,7 @@ def download_model_to_folder():
     .apt_install("git")
     .run_commands(
         "git clone https://github.com/monotykamary/infinity.git",
-        "cd infinity/libs/infinity_emb && git checkout e63545da1c5c0607831b94ce75707d6dcf9f5474 && pip install .[all]",
+        "cd infinity/libs/infinity_emb && git checkout c8121b9e19fcd7658aa87aea2457979b07c9fd25 && pip install .[all]",
     )
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@@ -71,10 +69,12 @@ def download_model_to_folder():
     allow_concurrent_inputs=100,
     container_idle_timeout=60,
     gpu=GPU_CONFIG,
-    secrets=[Secret.from_name("huggingface")],
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
 )
 @web_server(7997, startup_timeout=300)
 def infinity_embeddings_server():
-    api_key = os.getenv("INFINITY_API_KEY")
-    cmd = f"INFINITY_API_KEY={api_key} infinity_emb --model-name-or-path {BASE_MODEL}"
+    cmd = f"infinity_emb --model-name-or-path {BASE_MODEL}"
     subprocess.Popen(cmd, shell=True)
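
The net effect of this diff: the throwaway `secrets.token_urlsafe()` key is gone, and the `INFINITY_API_KEY={api_key}` prefix on the launch command is no longer needed, because `Secret.from_dotenv()` already exports the key into the container environment and `subprocess.Popen` inherits that environment by default. A sketch of the resulting behaviour, illustrative and not part of the commit:

import os
import subprocess

# Secret.from_dotenv() has already placed INFINITY_API_KEY in os.environ;
# Popen passes the parent environment through unless env= overrides it.
cmd = "infinity_emb --model-name-or-path Snowflake/snowflake-arctic-embed-l"
subprocess.Popen(cmd, shell=True, env={**os.environ})  # explicit here; the default does the same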

vllm_deepseek_coder_33b.py

+4 -6

@@ -13,8 +13,6 @@
 MODEL_DIR = "/model"
 BASE_MODEL = "TheBloke/deepseek-coder-33B-instruct-AWQ"
 
-api_key = secrets.token_urlsafe()
-
 # ## Define a container image
 
 
@@ -67,9 +65,6 @@ def download_model_to_folder():
         secrets=[Secret.from_name("huggingface")],
         timeout=60 * 20,
     )
-    .run_commands(
-        f"export VLLM_API_KEY={api_key}",
-    )
 )
 
 stub = Stub("vllm-deepseek-coder-33b", image=image)
@@ -81,7 +76,10 @@ def download_model_to_folder():
     allow_concurrent_inputs=100,
     container_idle_timeout=60,
     gpu=GPU_CONFIG,
-    secrets=[Secret.from_name("huggingface")],
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
 )
 @web_server(8000, startup_timeout=300)
 def openai_compatible_server():
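
The four vLLM files get the same treatment: the `export VLLM_API_KEY=...` baked into the image build is dropped (an `export` inside `run_commands` only affects that build shell, so it never reached the serving container anyway) and replaced by `Secret.from_dotenv()` at serve time. With the key in the serving environment, the vLLM OpenAI-compatible server can enforce it, and a client passes it back as a bearer token. A sketch with a placeholder URL:

import os

from openai import OpenAI

client = OpenAI(
    base_url="https://<workspace>--vllm-deepseek-coder-33b.modal.run/v1",  # placeholder
    api_key=os.environ["VLLM_API_KEY"],
)
resp = client.chat.completions.create(
    model="TheBloke/deepseek-coder-33B-instruct-AWQ",
    messages=[{"role": "user", "content": "Write a function that reverses a linked list."}],
)
print(resp.choices[0].message.content)

The same pattern applies to the llama3, seallm, and sqlcoder deployments below.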

vllm_llama3_8b.py

+4 -6

@@ -13,8 +13,6 @@
 MODEL_DIR = "/model"
 BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
 
-api_key = secrets.token_urlsafe()
-
 # ## Define a container image
 
 
@@ -67,9 +65,6 @@ def download_model_to_folder():
         secrets=[Secret.from_name("huggingface")],
         timeout=60 * 20,
     )
-    .run_commands(
-        f"export VLLM_API_KEY={api_key}",
-    )
 )
 
 stub = Stub("vllm-llama3-8b", image=image)
@@ -81,7 +76,10 @@ def download_model_to_folder():
     allow_concurrent_inputs=100,
     container_idle_timeout=60,
     gpu=GPU_CONFIG,
-    secrets=[Secret.from_name("huggingface")],
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
 )
 @web_server(8000, startup_timeout=300)
 def openai_compatible_server():

vllm_seallm_7b_v2_5.py

+4 -6

@@ -13,8 +13,6 @@
 MODEL_DIR = "/model"
 BASE_MODEL = "SeaLLMs/SeaLLM-7B-v2.5"
 
-api_key = secrets.token_urlsafe()
-
 # ## Define a container image
 
 
@@ -67,9 +65,6 @@ def download_model_to_folder():
         secrets=[Secret.from_name("huggingface")],
         timeout=60 * 20,
     )
-    .run_commands(
-        f"export VLLM_API_KEY={api_key}",
-    )
 )
 
 stub = Stub("vllm-seallm-7b-v2.5", image=image)
@@ -81,7 +76,10 @@ def download_model_to_folder():
     allow_concurrent_inputs=100,
     container_idle_timeout=60,
     gpu=GPU_CONFIG,
-    secrets=[Secret.from_name("huggingface")],
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
 )
 @web_server(8000, startup_timeout=300)
 def openai_compatible_server():

vllm_sqlcoder_7b_2.py

+4 -6

@@ -13,8 +13,6 @@
 MODEL_DIR = "/model"
 BASE_MODEL = "defog/sqlcoder-7b-2"
 
-api_key = secrets.token_urlsafe()
-
 # ## Define a container image
 
 
@@ -67,9 +65,6 @@ def download_model_to_folder():
         secrets=[Secret.from_name("huggingface")],
         timeout=60 * 20,
     )
-    .run_commands(
-        f"export VLLM_API_KEY={api_key}",
-    )
 )
 
 stub = Stub("vllm-defog-sqlcoder-7b-2", image=image)
@@ -81,7 +76,10 @@ def download_model_to_folder():
     allow_concurrent_inputs=100,
     container_idle_timeout=60,
     gpu=GPU_CONFIG,
-    secrets=[Secret.from_name("huggingface")],
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
 )
 @web_server(8000, startup_timeout=300)
 def openai_compatible_server():
