From 19481c9df5c93a16f6634654bf561ff62a0dc253 Mon Sep 17 00:00:00 2001 From: Paula Ramos Date: Fri, 14 Nov 2025 13:28:06 -0500 Subject: [PATCH 1/4] inference: add CLI overrides for fps/total_pixels and vLLM memory knobs --- scripts/inference.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) mode change 100755 => 100644 scripts/inference.py diff --git a/scripts/inference.py b/scripts/inference.py old mode 100755 new mode 100644 index 27d31a3..eca5ccc --- a/scripts/inference.py +++ b/scripts/inference.py @@ -148,6 +148,20 @@ def main(): type=str, help="Output directory for debugging", ) + #Adding the next two arguments to improve performance on the GPU + parser.add_argument( + "--gpu-memory-utilization", + type=float, + default=0.60, + help="Target fraction of GPU memory vLLM can use for model + KV cache", + ) + parser.add_argument( + "--max-model-len", + type=int, + default=1024, + help="Maximum sequence length for sizing KV cache", + ) + args = parser.parse_args() images: list[str] = args.images or [] @@ -204,8 +218,11 @@ def main(): revision=args.revision, limit_mm_per_prompt={"image": len(images), "video": len(videos)}, enforce_eager=True, + gpu_memory_utilization=args.gpu_memory_utilization, + max_model_len=args.max_model_len, ) + # Process inputs processor: transformers.Qwen2_5_VLProcessor = ( transformers.AutoProcessor.from_pretrained(args.model) @@ -239,14 +256,22 @@ def main(): "mm_processor_kwargs": video_kwargs, } outputs = llm.generate([llm_inputs], sampling_params=sampling_params) + print(SEPARATOR) + full_texts = [] for output in outputs[0].outputs: output_text = output.text + full_texts.append(output_text) print("Assistant:") print(textwrap.indent(output_text.rstrip(), " ")) print(SEPARATOR) - result, _ = extract_tagged_text(output_text) + # Debug: show raw length so we know if it’s really short + print(f"[DEBUG] Total outputs: {len(full_texts)}") + print(f"[DEBUG] Last output length: {len(full_texts[-1]) if full_texts else 0}") + + result, _ = extract_tagged_text(full_texts[-1]) + if args.verbose and result: pprint_dict(result, "Result") From 0919ddf3ad50bd2e9d64af135faae99af9f0bfe3 Mon Sep 17 00:00:00 2001 From: Paula Ramos Date: Tue, 18 Nov 2025 05:33:35 -0500 Subject: [PATCH 2/4] avoid CUDA OOM --- scripts/inference.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/inference.py b/scripts/inference.py index eca5ccc..e83eed5 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -152,13 +152,13 @@ def main(): parser.add_argument( "--gpu-memory-utilization", type=float, - default=0.60, + default=None, help="Target fraction of GPU memory vLLM can use for model + KV cache", ) parser.add_argument( "--max-model-len", type=int, - default=1024, + default=None, help="Maximum sequence length for sizing KV cache", ) @@ -266,10 +266,6 @@ def main(): print(textwrap.indent(output_text.rstrip(), " ")) print(SEPARATOR) - # Debug: show raw length so we know if it’s really short - print(f"[DEBUG] Total outputs: {len(full_texts)}") - print(f"[DEBUG] Last output length: {len(full_texts[-1]) if full_texts else 0}") - result, _ = extract_tagged_text(full_texts[-1]) if args.verbose and result: From 7f6baa26c00fdce0448fa1be75a2015974cffcde Mon Sep 17 00:00:00 2001 From: Paula Ramos Date: Tue, 25 Nov 2025 11:53:08 -0500 Subject: [PATCH 3/4] chore: fix EOF newline and apply CLI override patch --- scripts/inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/inference.py b/scripts/inference.py index e83eed5..d5a81b8 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -148,7 +148,7 @@ def main(): type=str, help="Output directory for debugging", ) - #Adding the next two arguments to improve performance on the GPU + # Adding the next two arguments to improve performance on the GPU parser.add_argument( "--gpu-memory-utilization", type=float, @@ -222,7 +222,6 @@ def main(): max_model_len=args.max_model_len, ) - # Process inputs processor: transformers.Qwen2_5_VLProcessor = ( transformers.AutoProcessor.from_pretrained(args.model) From 925275beb2875f2ba2c8549db0f43bfc3bbdab52 Mon Sep 17 00:00:00 2001 From: Paula Ramos Date: Tue, 25 Nov 2025 11:55:50 -0500 Subject: [PATCH 4/4] chore: mark inference script as executable --- scripts/inference.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/inference.py diff --git a/scripts/inference.py b/scripts/inference.py old mode 100644 new mode 100755