[wip] deepseek r1 distilled llama 70B #1050

Closed
wants to merge 1 commit into from
5 changes: 3 additions & 2 deletions 06_gpu_and_ml/llm-serving/download_llama.py
@@ -1,12 +1,13 @@
# ---
# args: ["--force-download"]
# ---

import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
DEFAULT_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
DEFAULT_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
DEFAULT_REVISION = "07a264a567ba0863a4ab34fdb3c2b8a54e0bb494"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

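The rest of download_llama.py is collapsed in this diff. For orientation, a minimal sketch of how a download step like this is commonly wired up on Modal follows, assuming the script uses huggingface_hub to pull the weights into the shared volume; the function name, image, and timeout here are illustrative assumptions, not code from this PR.

import modal

MODELS_DIR = "/llamas"
DEFAULT_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
DEFAULT_REVISION = "07a264a567ba0863a4ab34fdb3c2b8a54e0bb494"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

# hypothetical image; the real script's dependencies are not visible in this diff
image = modal.Image.debian_slim(python_version="3.12").pip_install(
    "huggingface_hub[hf_transfer]"
)

app = modal.App("download-deepseek-r1-distill", image=image)


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * 60 * 60)
def download_model(model_name: str = DEFAULT_NAME, model_revision: str = DEFAULT_REVISION):
    from huggingface_hub import snapshot_download

    # pull the weights into the volume so the serving Function can read them from MODELS_DIR
    snapshot_download(
        model_name,
        revision=model_revision,
        local_dir=f"{MODELS_DIR}/{model_name}",
        ignore_patterns=["*.pt", "*.bin"],  # prefer safetensors over torch pickles
    )
    volume.commit()  # persist the files for other Modal Functions

A script shaped like this would be invoked locally with `modal run download_llama.py`; the `--force-download` arg in the file's header suggests the real script also supports re-downloading over an existing snapshot.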
14 changes: 7 additions & 7 deletions 06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -1,8 +1,6 @@
# ---
# deploy: true
# cmd: ["modal", "serve", "06_gpu_and_ml/llm-serving/vllm_inference.py"]
# pytest: false
# tags: ["use-case-lm-inference", "featured"]
# ---

# # Run an OpenAI-Compatible vLLM Server
@@ -34,7 +32,7 @@
import modal

vllm_image = modal.Image.debian_slim(python_version="3.12").pip_install(
"vllm==0.6.3post1", "fastapi[standard]==0.115.4"
"vllm==v0.6.6.post1", "fastapi[standard]==0.115.4"
)

# ## Download the model weights
@@ -47,8 +45,8 @@
# [here](https://neuralmagic.com/blog/introducing-machete-a-mixed-input-gemm-kernel-optimized-for-nvidia-hopper-gpus/).

MODELS_DIR = "/llamas"
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
MODEL_REVISION = "07a264a567ba0863a4ab34fdb3c2b8a54e0bb494"

# We need to make the weights of that model available to our Modal Functions.

@@ -80,9 +78,9 @@
# After attaching that engine to the FastAPI app via the `api_server` module of the vLLM library, we return the FastAPI app
# so it can be served on Modal.

app = modal.App("example-vllm-openai-compatible")
app = modal.App("deepseek-r1")

-N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
+N_GPU = 2 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = "super-secret-token" # auth token. for production use, replace with a modal.Secret

MINUTES = 60 # seconds
@@ -96,6 +94,7 @@
timeout=24 * HOURS,
allow_concurrent_inputs=1000,
volumes={MODELS_DIR: volume},
+keep_warm=1,
)
@modal.asgi_app()
def serve():
@@ -179,6 +178,7 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
lora_modules=[],
prompt_adapters=[],
request_logger=request_logger,
+chat_template_content_format="raw",
)
api_server.completion = lambda s: OpenAIServingCompletion(
engine,
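Beyond the diff, it may help to see how the deployed endpoint would be called. The server exposes an OpenAI-compatible API guarded by the bearer token above, so a client request looks roughly like this sketch; the base_url is a placeholder for the actual Modal deployment URL, and the model name is assumed to match the Hugging Face repo id (the collapsed server code may register it under a different served name).

from openai import OpenAI

client = OpenAI(
    base_url="https://your-workspace--deepseek-r1-serve.modal.run/v1",  # placeholder URL
    api_key="super-secret-token",  # matches the TOKEN constant; use a modal.Secret in production
)

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",  # assumed served model name
    messages=[
        {"role": "user", "content": "Briefly explain what a distilled model is."}
    ],
    max_tokens=512,
)
print(response.choices[0].message.content)

Note that the R1-distilled models typically emit their chain of thought between <think> tags, so client code may want to strip that block before displaying the final answer.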