From 19481c9df5c93a16f6634654bf561ff62a0dc253 Mon Sep 17 00:00:00 2001
From: Paula Ramos <pjramg@gmail.com>
Date: Fri, 14 Nov 2025 13:28:06 -0500
Subject: [PATCH 1/4] inference: add CLI overrides for vLLM memory knobs

---
 scripts/inference.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 scripts/inference.py

diff --git a/scripts/inference.py b/scripts/inference.py
old mode 100755
new mode 100644
index 27d31a3..eca5ccc
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -148,6 +148,20 @@ def main():
         type=str,
         help="Output directory for debugging",
     )
+    #Adding the next two arguments to improve performance on the GPU
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.60,
+        help="Target fraction of GPU memory vLLM can use for model + KV cache",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=1024,
+        help="Maximum sequence length for sizing KV cache",
+    )
+
     args = parser.parse_args()
 
     images: list[str] = args.images or []
@@ -204,8 +218,11 @@ def main():
         revision=args.revision,
         limit_mm_per_prompt={"image": len(images), "video": len(videos)},
         enforce_eager=True,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        max_model_len=args.max_model_len,
     )
 
+
     # Process inputs
     processor: transformers.Qwen2_5_VLProcessor = (
         transformers.AutoProcessor.from_pretrained(args.model)
@@ -239,14 +256,22 @@ def main():
         "mm_processor_kwargs": video_kwargs,
     }
     outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+
     print(SEPARATOR)
+    full_texts = []
     for output in outputs[0].outputs:
         output_text = output.text
+        full_texts.append(output_text)
         print("Assistant:")
         print(textwrap.indent(output_text.rstrip(), "  "))
     print(SEPARATOR)
 
-    result, _ = extract_tagged_text(output_text)
+    # Debug: show raw length so we know if it’s really short
+    print(f"[DEBUG] Total outputs: {len(full_texts)}")
+    print(f"[DEBUG] Last output length: {len(full_texts[-1]) if full_texts else 0}")
+
+    result, _ = extract_tagged_text(full_texts[-1])
+
     if args.verbose and result:
         pprint_dict(result, "Result")
 

From 0919ddf3ad50bd2e9d64af135faae99af9f0bfe3 Mon Sep 17 00:00:00 2001
From: Paula Ramos <pjramg@gmail.com>
Date: Tue, 18 Nov 2025 05:33:35 -0500
Subject: [PATCH 2/4] avoid CUDA OOM: unset vLLM memory knob defaults and
 drop debug prints

---
 scripts/inference.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/inference.py b/scripts/inference.py
index eca5ccc..e83eed5 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -152,13 +152,13 @@ def main():
     parser.add_argument(
         "--gpu-memory-utilization",
         type=float,
-        default=0.60,
+        default=None,
         help="Target fraction of GPU memory vLLM can use for model + KV cache",
     )
     parser.add_argument(
         "--max-model-len",
         type=int,
-        default=1024,
+        default=None,
         help="Maximum sequence length for sizing KV cache",
     )
 
@@ -266,10 +266,6 @@ def main():
         print(textwrap.indent(output_text.rstrip(), "  "))
     print(SEPARATOR)
 
-    # Debug: show raw length so we know if it’s really short
-    print(f"[DEBUG] Total outputs: {len(full_texts)}")
-    print(f"[DEBUG] Last output length: {len(full_texts[-1]) if full_texts else 0}")
-
     result, _ = extract_tagged_text(full_texts[-1])
 
     if args.verbose and result:

From 7f6baa26c00fdce0448fa1be75a2015974cffcde Mon Sep 17 00:00:00 2001
From: Paula Ramos <pjramg@gmail.com>
Date: Tue, 25 Nov 2025 11:53:08 -0500
Subject: [PATCH 3/4] chore: fix comment spacing and remove stray blank line

---
 scripts/inference.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/inference.py b/scripts/inference.py
index e83eed5..d5a81b8 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -148,7 +148,7 @@ def main():
         type=str,
         help="Output directory for debugging",
     )
-    #Adding the next two arguments to improve performance on the GPU
+    # Adding the next two arguments to improve performance on the GPU
     parser.add_argument(
         "--gpu-memory-utilization",
         type=float,
@@ -222,7 +222,6 @@ def main():
         max_model_len=args.max_model_len,
     )
 
-
     # Process inputs
     processor: transformers.Qwen2_5_VLProcessor = (
         transformers.AutoProcessor.from_pretrained(args.model)

From 925275beb2875f2ba2c8549db0f43bfc3bbdab52 Mon Sep 17 00:00:00 2001
From: Paula Ramos <pjramg@gmail.com>
Date: Tue, 25 Nov 2025 11:55:50 -0500
Subject: [PATCH 4/4] chore: mark inference script as executable

---
 scripts/inference.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/inference.py

diff --git a/scripts/inference.py b/scripts/inference.py
old mode 100644
new mode 100755