Update server_vllm.py #253

Merged · 3 commits · Aug 22, 2024
31 changes: 21 additions & 10 deletions server_vllm.py
@@ -18,9 +18,9 @@
 import argparse
 import asyncio
 import json
+import logging
 import re
 from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Tuple, Union
-import logging

 import fastapi
 import uvicorn
@@ -36,22 +36,31 @@

 TIMEOUT_KEEP_ALIVE = 5  # seconds

-#logger = init_logger(__name__)
+# logger = init_logger(__name__)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 logger.addHandler(logging.StreamHandler())


-served_model = None
+served_model = []
 app = fastapi.FastAPI()


 @app.get("/v1/models")
 async def show_available_models():
-    """Show available models. Right now we only have one model."""
-    model_cards = [
-        ModelCard(id=served_model, root=served_model, permission=[ModelPermission()])
-    ]
+    """Show available models."""
+    model_cards = []
+    if isinstance(served_model, list):
+        for model in served_model:
+            model_cards.append(
+                ModelCard(id=model, root=model, permission=[ModelPermission()])
+            )
+    else:
+        model_cards.append(
+            ModelCard(
+                id=served_model, root=served_model, permission=[ModelPermission()]
+            )
+        )
     return ModelList(data=model_cards)


@@ -130,9 +139,11 @@ async def create_chat_completion(raw_request: Request):
     logger.info(f"args: {args}")

     if args.served_model_name is not None:
-        served_model = args.served_model_name
-    else:
-        served_model = args.model
+        logger.info(
+            "args.served_model_name is not used in this service and will be ignored. Served model will consist of args.model only."
+        )
+
+    served_model = [args.model]

     engine_args = AsyncEngineArgs.from_cli_args(args)
     # A separate tokenizer to map token IDs to strings.
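With this change, served_model is a list of model IDs rather than a single string, so /v1/models returns one ModelCard per served model. Below is a minimal sketch of querying the updated endpoint; the base URL assumes a local deployment on port 8000 and is illustrative, not something specified in this PR.

# Minimal sketch: list the models exposed by the updated /v1/models endpoint.
# The base URL is an assumed local deployment, not part of this PR.
import requests

response = requests.get("http://localhost:8000/v1/models")
response.raise_for_status()
for card in response.json()["data"]:
    # served_model is now a list such as [args.model], so one card is returned per entry.
    print(card["id"], card["root"])

Because the handler keeps the isinstance(served_model, list) check with a single-string fallback, any older code path that still assigns a bare string continues to work.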
2 changes: 1 addition & 1 deletion tests/test_prompt_creation.py
@@ -42,7 +42,7 @@ def __init__(self, *args, **kwargs):
             "meetkai/functionary-small-v2.4",
             "meetkai/functionary-small-v2.5",
             "meetkai/functionary-medium-v3.0",
-            "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "meetkai/functionary-small-v3.1",
         ]

     def read_example_data(self, template_version: str):
2 changes: 1 addition & 1 deletion tests/test_request_handling.py
@@ -208,7 +208,7 @@ def __init__(self, *args, **kwargs):
             PromptTemplateV2: "meetkai/functionary-small-v2.4",
             Llama3Template: "meetkai/functionary-small-v2.5",
             Llama3TemplateV3: "meetkai/functionary-medium-v3.0",
-            Llama31Template: "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            Llama31Template: "meetkai/functionary-small-v3.1",
             LlavaLlama: "lmms-lab/llama3-llava-next-8b",
         }
         self.default_text_str = "Normal text generation"
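Both test files swap the Llama31Template reference model from meta-llama/Meta-Llama-3.1-8B-Instruct to meetkai/functionary-small-v3.1. A quick way to run just the affected test modules, sketched under the assumption that pytest is installed and the repository root is the working directory:

# Run only the two test modules touched by this PR (paths as shown in the diffs above).
import sys

import pytest

sys.exit(pytest.main(["tests/test_prompt_creation.py", "tests/test_request_handling.py", "-q"]))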