From d589a1e9d2c63eabbf0888ba45145d256751d47d Mon Sep 17 00:00:00 2001
From: Ben Cherry <bcherry@gmail.com>
Date: Thu, 12 Dec 2024 17:44:28 -0800
Subject: [PATCH] Fix center_aspect_fit, add center_aspect_scale resizing
 option (#1222)

---
 .changeset/tiny-papayas-film.md               |  9 +++++
 .../livekit/agents/utils/images/image.py      | 40 +++++++++++++++++--
 .../livekit/plugins/anthropic/llm.py          |  2 +-
 .../plugins/openai/beta/assistant_llm.py      |  2 +-
 .../livekit/plugins/openai/utils.py           |  2 +-
 5 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 .changeset/tiny-papayas-film.md

diff --git a/.changeset/tiny-papayas-film.md b/.changeset/tiny-papayas-film.md
new file mode 100644
index 0000000000..07ccea04cb
--- /dev/null
+++ b/.changeset/tiny-papayas-film.md
@@ -0,0 +1,9 @@
+---
+"livekit-agents": patch
+"livekit-plugins-anthropic": patch
+"livekit-plugins-openai": patch
+---
+
+Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options.
+
+Make scale_aspect_fit the new default resizing option for video frames.
diff --git a/livekit-agents/livekit/agents/utils/images/image.py b/livekit-agents/livekit/agents/utils/images/image.py
index 15755284d2..bcc0a5b5f5 100644
--- a/livekit-agents/livekit/agents/utils/images/image.py
+++ b/livekit-agents/livekit/agents/utils/images/image.py
@@ -33,7 +33,18 @@ class EncodeOptions:
 class ResizeOptions:
     width: int
     height: int
-    strategy: Literal["center_aspect_fit", "center_aspect_cover", "skew"]
+    strategy: Literal[
+        # Fit the image into the provided dimensions, with letterboxing
+        "center_aspect_fit",
+        # Fill the provided dimensions, with cropping
+        "center_aspect_cover",
+        # Scale the image to fit within the provided dimensions, preserving its aspect ratio (no letterboxing; one side may end up smaller than requested)
+        "scale_aspect_fit",
+        # Scale the image to cover the provided dimensions, preserving its aspect ratio (no cropping; one side may end up larger than requested)
+        "scale_aspect_cover",
+        # Precisely resize the image to the provided dimensions
+        "skew",
+    ]
 
 
 def import_pil():
@@ -83,10 +94,11 @@ def _resize_image(image: Any, options: EncodeOptions):
 
         # If the new image is wider than the original
         if resize_opts.width / resize_opts.height > image.width / image.height:
-            new_width = resize_opts.width
-            new_height = int(image.height * (resize_opts.width / image.width))
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
 
         resized = image.resize((new_width, new_height))
+
         Image.Image.paste(
             result,
             resized,
@@ -118,5 +130,27 @@ def _resize_image(image: Any, options: EncodeOptions):
             ),
         )
         return result
+    elif resize_opts.strategy == "scale_aspect_fill":
+        # Start with assuming width is the limiting dimension
+        new_width = resize_opts.width
+        new_height = int(image.height * (resize_opts.width / image.width))
+
+        # If height is under the limit, scale based on height instead
+        if new_height < resize_opts.height:
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
+
+        return image.resize((new_width, new_height))
+    elif resize_opts.strategy == "scale_aspect_fit":
+        # Start with assuming width is the limiting dimension
+        new_width = resize_opts.width
+        new_height = int(image.height * (resize_opts.width / image.width))
+
+        # If height would exceed the limit, scale based on height instead
+        if new_height > resize_opts.height:
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
+
+        return image.resize((new_width, new_height))
 
     raise ValueError(f"Unknown resize strategy: {resize_opts.strategy}")
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
index d9e1b2a0dd..644d8432e0 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
@@ -441,7 +441,7 @@ def _build_anthropic_image_content(
                 opts.resize_options = utils.images.ResizeOptions(
                     width=image.inference_width,
                     height=image.inference_height,
-                    strategy="center_aspect_fit",
+                    strategy="scale_aspect_fit",
                 )
 
             encoded_data = utils.images.encode(image.image, opts)
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py
index a5e3fe2296..7df336e890 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py
@@ -532,7 +532,7 @@ async def _upload_frame(
             opts.resize_options = utils.images.ResizeOptions(
                 width=inference_width,
                 height=inference_height,
-                strategy="center_aspect_fit",
+                strategy="scale_aspect_fit",
             )
 
         encoded_data = utils.images.encode(frame, opts)
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
index 4e8d40c00e..ed830e59e4 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
@@ -78,7 +78,7 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):
                 opts.resize_options = utils.images.ResizeOptions(
                     width=image.inference_width,
                     height=image.inference_height,
-                    strategy="center_aspect_fit",
+                    strategy="scale_aspect_fit",
                 )
 
             encoded_data = utils.images.encode(image.image, opts)