Commit 262eed8

add video url support to chat.completions (#256)
* add support for video_url in chat.completions
* update documentation
* bump version
1 parent 9e6120f commit 262eed8

File tree: 4 files changed, +139 -23 lines changed


README.md

Lines changed: 92 additions & 22 deletions
@@ -52,25 +52,101 @@ This repo contains both a Python Library and a CLI. We'll demonstrate how to use
 ### Chat Completions
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
+# Simple text message
 response = client.chat.completions.create(
     model="mistralai/Mixtral-8x7B-Instruct-v0.1",
     messages=[{"role": "user", "content": "tell me about new york"}],
 )
 print(response.choices[0].message.content)
+
+# Multi-modal message with text and image
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
+                }
+            }
+        ]
+    }]
+)
+print(response.choices[0].message.content)
+
+# Multi-modal message with multiple images
+response = client.chat.completions.create(
+    model="Qwen/Qwen2.5-VL-72B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "Compare these two images."
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
+                }
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/slack.png"
+                }
+            }
+        ]
+    }]
+)
+print(response.choices[0].message.content)
+
+# Multi-modal message with text and video
+response = client.chat.completions.create(
+    model="Qwen/Qwen2.5-VL-72B-Instruct",
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's happening in this video?"
+            },
+            {
+                "type": "video_url",
+                "video_url": {
+                    "url": "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+                }
+            }
+        ]
+    }]
+)
+print(response.choices[0].message.content)
 ```
 
+The chat completions API supports three types of content:
+- Plain text messages using the `content` field directly
+- Multi-modal messages with images using `type: "image_url"`
+- Multi-modal messages with videos using `type: "video_url"`
+
+When using multi-modal content, the `content` field becomes an array of content objects, each with its own type and corresponding data.
+
 #### Streaming
 
 ```python
 import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 stream = client.chat.completions.create(
     model="mistralai/Mixtral-8x7B-Instruct-v0.1",
     messages=[{"role": "user", "content": "tell me about new york"}],
@@ -84,17 +160,17 @@ for chunk in stream:
 #### Async usage
 
 ```python
-import os, asyncio
+import asyncio
 from together import AsyncTogether
 
-async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))
+async_client = AsyncTogether()
 messages = [
     "What are the top things to do in San Francisco?",
     "What country is Paris in?",
 ]
 
 async def async_chat_completion(messages):
-    async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))
+    async_client = AsyncTogether()
     tasks = [
         async_client.chat.completions.create(
             model="mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -115,10 +191,9 @@ asyncio.run(async_chat_completion(messages))
 Completions are for code and language models shown [here](https://docs.together.ai/docs/inference-models). Below, a code model example is shown.
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 response = client.completions.create(
     model="codellama/CodeLlama-34b-Python-hf",
@@ -131,10 +206,9 @@ print(response.choices[0].text)
 #### Streaming
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 stream = client.completions.create(
     model="codellama/CodeLlama-34b-Python-hf",
     prompt="Write a Next.js component with TailwindCSS for a header component.",
@@ -148,10 +222,10 @@ for chunk in stream:
 #### Async usage
 
 ```python
-import os, asyncio
+import asyncio
 from together import AsyncTogether
 
-async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))
+async_client = AsyncTogether()
 prompts = [
     "Write a Next.js component with TailwindCSS for a header component.",
     "Write a python function for the fibonacci sequence",
@@ -176,10 +250,9 @@ asyncio.run(async_chat_completion(prompts))
 ### Image generation
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 response = client.images.generate(
     prompt="space robots",
@@ -196,7 +269,7 @@ print(response.data[0].b64_json)
 from typing import List
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 def get_embeddings(texts: List[str], model: str) -> List[List[float]]:
     texts = [text.replace("\n", " ") for text in texts]
@@ -215,7 +288,7 @@ print(embeddings)
 from typing import List
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 def get_reranked_documents(query: str, documents: List[str], model: str, top_n: int = 3) -> List[str]:
     outputs = client.rerank.create(model=model, query=query, documents=documents, top_n=top_n)
@@ -237,10 +310,9 @@ Read more about Reranking [here](https://docs.together.ai/docs/rerank-overview).
 The files API is used for fine-tuning and allows developers to upload data to fine-tune on. It also has several methods to list all files, retrive files, and delete files. Please refer to our fine-tuning docs [here](https://docs.together.ai/docs/fine-tuning-python).
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 client.files.upload(file="somedata.jsonl") # uploads a file
 client.files.list() # lists all uploaded files
@@ -254,10 +326,9 @@ client.files.delete(id="file-d0d318cb-b7d9-493a-bd70-1cfe089d3815") # deletes a
 The finetune API is used for fine-tuning and allows developers to create finetuning jobs. It also has several methods to list all jobs, retrive statuses and get checkpoints. Please refer to our fine-tuning docs [here](https://docs.together.ai/docs/fine-tuning-python).
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 client.fine_tuning.create(
     training_file = 'file-d0d318cb-b7d9-493a-bd70-1cfe089d3815',
@@ -281,10 +352,9 @@ client.fine_tuning.download(id="ft-c66a5c18-1d6d-43c9-94bd-32d756425b4b") # down
 This lists all the models that Together supports.
 
 ```python
-import os
 from together import Together
 
-client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+client = Together()
 
 models = client.models.list()

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.4.1"
+version = "1.4.2"
 authors = [
     "Together AI <[email protected]>"
 ]

src/together/types/chat_completions.py

Lines changed: 6 additions & 0 deletions
@@ -44,16 +44,22 @@ class ToolCalls(BaseModel):
 class ChatCompletionMessageContentType(str, Enum):
     TEXT = "text"
     IMAGE_URL = "image_url"
+    VIDEO_URL = "video_url"
 
 
 class ChatCompletionMessageContentImageURL(BaseModel):
     url: str
 
 
+class ChatCompletionMessageContentVideoURL(BaseModel):
+    url: str
+
+
 class ChatCompletionMessageContent(BaseModel):
     type: ChatCompletionMessageContentType
     text: str | None = None
     image_url: ChatCompletionMessageContentImageURL | None = None
+    video_url: ChatCompletionMessageContentVideoURL | None = None
 
 
 class ChatCompletionMessage(BaseModel):
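
The new `video_url` field slots into the existing content-part model alongside `text` and `image_url`. As a rough illustration of how the updated models behave (a sketch, not part of this commit; it assumes standard Pydantic coercion of nested dicts and string enum values, and the URL is a placeholder):

```python
from together.types.chat_completions import (
    ChatCompletionMessageContent,
    ChatCompletionMessageContentType,
)

# Pydantic coerces the nested dict into ChatCompletionMessageContentVideoURL
# and the raw string "video_url" into the str-based enum member.
content = ChatCompletionMessageContent(
    type="video_url",
    video_url={"url": "https://example.com/clip.mp4"},  # placeholder URL
)

assert content.type == ChatCompletionMessageContentType.VIDEO_URL
assert content.video_url.url == "https://example.com/clip.mp4"
assert content.text is None and content.image_url is None
```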

tests/unit/test_video_url.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+from together.types.chat_completions import (
+    ChatCompletionMessage,
+    ChatCompletionMessageContent,
+    ChatCompletionMessageContentType,
+    ChatCompletionMessageContentVideoURL,
+    MessageRole,
+)
+
+
+def test_video_url_message():
+    # Test creating a message with video_url content
+    message = ChatCompletionMessage(
+        role=MessageRole.USER,
+        content=[
+            ChatCompletionMessageContent(
+                type=ChatCompletionMessageContentType.TEXT, text="What's in this video?"
+            ),
+            ChatCompletionMessageContent(
+                type=ChatCompletionMessageContentType.VIDEO_URL,
+                video_url=ChatCompletionMessageContentVideoURL(
+                    url="https://example.com/video.mp4"
+                ),
+            ),
+        ],
+    )
+
+    # Verify the message structure
+    assert message.role == MessageRole.USER
+    assert isinstance(message.content, list)
+    assert len(message.content) == 2
+
+    # Verify text content
+    assert message.content[0].type == ChatCompletionMessageContentType.TEXT
+    assert message.content[0].text == "What's in this video?"
+    assert message.content[0].video_url is None
+
+    # Verify video_url content
+    assert message.content[1].type == ChatCompletionMessageContentType.VIDEO_URL
+    assert message.content[1].text is None
+    assert message.content[1].video_url.url == "https://example.com/video.mp4"
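
For reference, the same model dumps back to the plain dict shape that the README examples pass to `create`. A minimal sketch, assuming the library is on Pydantic v2 (where `model_dump` exists; on Pydantic v1 the equivalent call is `.dict()`):

```python
from together.types.chat_completions import (
    ChatCompletionMessageContent,
    ChatCompletionMessageContentType,
    ChatCompletionMessageContentVideoURL,
)

content = ChatCompletionMessageContent(
    type=ChatCompletionMessageContentType.VIDEO_URL,
    video_url=ChatCompletionMessageContentVideoURL(url="https://example.com/video.mp4"),
)

# Serialize to JSON-compatible data, dropping the unset optional fields
# (`text` and `image_url`). Assumes Pydantic v2; on v1 this would be
# content.dict(exclude_none=True).
print(content.model_dump(mode="json", exclude_none=True))
# -> {'type': 'video_url', 'video_url': {'url': 'https://example.com/video.mp4'}}
```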

0 commit comments
