# # Fast inference with Infinity (mixedbread-ai/mxbai-rerank-large-v1)

import os
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "mixedbread-ai/mxbai-rerank-large-v1"

# ## Define a container image


# We want to create a Modal image with the model weights pre-saved to a directory. The benefit
# is that the container no longer has to re-download the model from Hugging Face - instead, it
# takes advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected, and the download step will not re-run.
def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()

# ### Image definition
# We'll start from a recommended Docker Hub CUDA image and install Infinity (`infinity_emb`).
# Then we'll use `run_function` to run the function defined above, ensuring the weights of
# the model are saved within the container image.
image = (
    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .pip_install(
        "wheel==0.43.0",
        "huggingface_hub==0.22.2",
        "hf-transfer==0.1.6",
        "torch==2.2.1",
        "sentence-transformers==2.7.0",
    )
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/monotykamary/infinity.git",
        "cd infinity/libs/infinity_emb && git checkout c8121b9e19fcd7658aa87aea2457979b07c9fd25 && pip install .[all]",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)

stub = Stub("infinity-mxbai-rerank-large-v1", image=image)
GPU_CONFIG = gpu.T4(count=1)


# Run the Infinity server on its default port, 7997, and expose it with Modal's `web_server`.
@stub.function(
    allow_concurrent_inputs=100,
    container_idle_timeout=60,
    gpu=GPU_CONFIG,
    secrets=[
        Secret.from_name("huggingface"),
        Secret.from_dotenv(),
    ],
)
@web_server(7997, startup_timeout=300)
def infinity_embeddings_server():
    # Launch Infinity in the background; `web_server` waits for port 7997 to accept connections.
    cmd = f"infinity_emb --model-name-or-path {BASE_MODEL}"
    subprocess.Popen(cmd, shell=True)
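
# ## Calling the reranker
# After `modal deploy`, Modal prints a public URL for `infinity_embeddings_server`. As a minimal
# sketch, the client below posts to Infinity's `/rerank` route - the URL is a placeholder you
# should replace with your own deployment's URL, and the exact response shape depends on the
# Infinity version pinned above:
#
#     import requests
#
#     resp = requests.post(
#         "https://<your-workspace>--infinity-mxbai-rerank-large-v1-infinity-embeddings-server.modal.run/rerank",
#         json={
#             "query": "Where can I get a good croissant?",
#             "documents": [
#                 "The bakery on 5th Street sells fresh croissants every morning.",
#                 "T4 GPUs are a cost-effective option for small models.",
#             ],
#         },
#     )
#     print(resp.json())  # relevance scores, one per document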