docs/docs/references/python_sdk_reference/index.md (1 change: 0 additions & 1 deletion)

@@ -217,7 +217,6 @@ from llama_stack_client.types import (
 Methods:
 
 - <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
-- <code title="post /v1/inference/completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
 - <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
 
 ## VectorIo
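For callers of the removed `completion` method, the OpenAI-compatible endpoint used throughout this PR is the replacement. A minimal sketch, assuming a Llama Stack server running locally on the default port:

```python
from llama_stack_client import LlamaStackClient

# Assumption: a Llama Stack server is reachable on the default local port.
client = LlamaStackClient(base_url="http://localhost:8321")

# OpenAI-compatible replacement for the removed client.inference.completion()
response = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    max_tokens=20,
)
print(response.choices[0].text)
```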
docs/getting_started.ipynb (14 changes: 4 additions & 10 deletions)

@@ -824,16 +824,10 @@
"\n",
"\n",
"user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
"response = client.inference.completion(\n",
" model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" content=user_input,\n",
" stream=False,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"greedy\",\n",
" },\n",
" \"max_tokens\": 50,\n",
" },\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" messages=[{\"role\": \"user\", \"content\": user_input}],\n",
" max_tokens=50,\n",
" response_format={\n",
" \"type\": \"json_schema\",\n",
" \"json_schema\": Output.model_json_schema(),\n",
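Reassembled from the hunk above, the migrated notebook cell reads roughly as follows. `client` is created earlier in the notebook, and the fields of the Pydantic `Output` model here are illustrative assumptions rather than the notebook's actual schema:

```python
from pydantic import BaseModel


class Output(BaseModel):
    # Illustrative fields; the notebook defines its own schema earlier.
    name: str
    year_born: str
    year_retired: str


user_input = (
    "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. "
    "He retired in 2003. Extract this information into JSON for me. "
)
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": user_input}],
    max_tokens=50,
    response_format={
        "type": "json_schema",
        "json_schema": Output.model_json_schema(),
    },
)
# The hunk ends at response_format; printing the reply is one way to inspect it.
print(response.choices[0].message.content)
```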
(file header not captured)

@@ -706,20 +706,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=CUSTOMIZED_MODEL_DIR,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=CUSTOMIZED_MODEL_DIR,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
@@ -1233,20 +1228,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=customized_chat_model_dir,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=customized_chat_model_dir,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
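Both hunks make the same migration: the nested `sampling_params` dictionary becomes flat keyword arguments, and the response text moves from `response.content` to `response.choices[0].text`. Reassembled, the updated cell reads as below (a sketch; `client` and `CUSTOMIZED_MODEL_DIR` are defined earlier in the notebook):

```python
response = client.completions.create(
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    model=CUSTOMIZED_MODEL_DIR,
    temperature=0.7,  # was sampling_params["strategy"]["temperature"]
    top_p=0.9,        # was sampling_params["strategy"]["top_p"]
    max_tokens=20,    # was sampling_params["max_tokens"]
)
print(f"Inference response: {response.choices[0].text}")
```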
llama_stack/providers/remote/inference/nvidia/NVIDIA.md (19 changes: 0 additions & 19 deletions)

@@ -39,25 +39,6 @@ client = LlamaStackAsLibraryClient("nvidia")
 client.initialize()
 ```
 
-### Create Completion
-
-The following example shows how to create a completion for an NVIDIA NIM.
-
-> [!NOTE]
-> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
-
-```python
-response = client.inference.completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
-    content="Complete the sentence using one word: Roses are red, violets are :",
-    stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
-)
-print(f"Response: {response.content}")
-```
-
 ### Create Chat Completion
 
 The following example shows how to create a chat completion for an NVIDIA NIM.
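Since the hosted NIMs never supported the removed `completion` method, the retained "Create Chat Completion" section covers both hosted and locally deployed NIMs. A minimal chat-completion equivalent of the deleted example, assuming the response shape of the OpenAI-compatible API used elsewhere in this PR:

```python
# Chat-completion equivalent of the deleted completion example (sketch).
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "Complete the sentence using one word: Roses are red, violets are :",
        }
    ],
    max_tokens=50,
)
print(f"Response: {response.choices[0].message.content}")
```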
llama_stack/providers/remote/post_training/nvidia/README.md (12 changes: 5 additions & 7 deletions)

@@ -140,13 +140,11 @@ client.models.register(
 #### 2. Inference with the fine-tuned model
 
 ```python
-response = client.inference.completion(
-    content="Complete the sentence using one word: Roses are red, violets are ",
+response = client.completions.create(
+    prompt="Complete the sentence using one word: Roses are red, violets are ",
     stream=False,
-    model_id="test-example-model@v1",
-    sampling_params={
-        "max_tokens": 50,
-    },
+    model="test-example-model@v1",
+    max_tokens=50,
 )
-print(response.content)
+print(response.choices[0].text)
 ```
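With the diff applied, the README snippet reads as one piece (every line below comes from the hunk above):

```python
response = client.completions.create(
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    model="test-example-model@v1",
    max_tokens=50,
)
print(response.choices[0].text)
```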
tests/integration/README.md (6 changes: 3 additions & 3 deletions)

@@ -178,10 +178,10 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.

 ### Basic Test Pattern
 ```python
-def test_basic_completion(llama_stack_client, text_model_id):
-    response = llama_stack_client.inference.completion(
+def test_basic_chat_completion(llama_stack_client, text_model_id):
+    response = llama_stack_client.inference.chat_completion(
         model_id=text_model_id,
-        content=CompletionMessage(role="user", content="Hello"),
+        messages=[{"role": "user", "content": "Hello"}],
     )
 
     # Test structure, not AI output quality
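With the diff applied, the test pattern reads as below. The hunk ends before the test body's assertions, so the final check here is an illustrative placeholder:

```python
def test_basic_chat_completion(llama_stack_client, text_model_id):
    response = llama_stack_client.inference.chat_completion(
        model_id=text_model_id,
        messages=[{"role": "user", "content": "Hello"}],
    )

    # Test structure, not AI output quality
    assert response is not None  # placeholder; the real test asserts more
```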