awslabs · mvinci12 · Jun 15, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/README.md b/3.test_cases/pytorch/vllm/cosmos-reason/README.md
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/env_vars.example b/3.test_cases/pytorch/vllm/cosmos-reason/env_vars.example
@@ -0,0 +1,63 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# Copy this file to env_vars, fill in your values, then run: source env_vars
+# All variables here are consumed by the kubernetes/ and hyperpod-eks/ manifests
+# (rendered with `envsubst`) and by the examples/ scripts.
+
+# ---- AWS / cluster context ----
+export AWS_REGION="us-west-2"
+export AWS_ACCOUNT_ID="123456789012"
+export NAMESPACE="default"
+
+# ---- Model selection ----
+# Default: Cosmos-Reason1-7B (Qwen2.5-VL backbone). Fits on A10G 24 GB / L4 24 GB / L40S 48 GB.
+# Alternates:
+#   nvidia/Cosmos-Reason2-2B  (Qwen3-VL, 2B params, also fits 24 GB GPUs)
+#   nvidia/Cosmos-Reason2-8B  (Qwen3-VL, 8B params, requires ≥32 GB GPU)
+export MODEL_ID="nvidia/Cosmos-Reason1-7B"
+
+# Cosmos Reason models are gated on Hugging Face — accept terms on the model card first,
+# then create a read token at https://huggingface.co/settings/tokens. DO NOT commit it:
+# export HF_TOKEN in your shell / CI secret store; a value already set there is kept.
+export HF_TOKEN="${HF_TOKEN:-}"
+
+# ---- vLLM container ----
+# kubernetes/ (vanilla EKS) — upstream image
+export VLLM_IMAGE_VANILLA="vllm/vllm-openai:v0.21.0"
+
+# hyperpod-eks/ (Inference Operator) — AWS-managed vLLM DLC
+# Tags: vllm:0.17-gpu-py312 (vLLM 0.17.0)  |  vllm:server-sagemaker-cuda-v1 (vLLM 0.19.1)
+export VLLM_IMAGE_AWS_DLC="763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/vllm:0.17-gpu-py312"
+
+# HyperPod path only: S3 URI for TLS certificates. Find it in the HyperPod console, `Inference` tab, under `S3 bucket for TLS certificates`.
+export TLS_CERT_S3_URI="s3://hyperpod-tls-<id>/certs"
+
+# ---- Hardware sizing ----
+# Validated combinations (model | GPU | TP | max-model-len):
+#   Cosmos-Reason1-7B  | A10G 24G   | TP=1 | 24576 (reduce to 8192 if OOM during CUDA graph capture)
+#   Cosmos-Reason1-7B  | L4  24G    | TP=1 | 24576 (reduce to 8192 if OOM during CUDA graph capture)
+#   Cosmos-Reason1-7B  | L40S 48G   | TP=1 | 32768
+#   Cosmos-Reason2-2B  | A10G/L4    | TP=1 | 16384
+#   Cosmos-Reason2-8B  | L40S 48G   | TP=1 | 16384
+#   Cosmos-Reason2-8B  | g6.12xl    | TP=4 | 16384  (4× L4, PCIe-only — slower than NVLink)
+#   Cosmos-Reason2-8B  | H100 80G   | TP=1 | 32768
+export INSTANCE_TYPE="g5.8xlarge"
+export HYPERPOD_INSTANCE_TYPE="ml.${INSTANCE_TYPE}"
+export TENSOR_PARALLEL_SIZE="1"
+export MAX_MODEL_LEN="24576"
+export GPU_MEMORY_UTILIZATION="0.92"
+
+# ---- vLLM serving args ----
+# DTYPE: bfloat16 is the only NVIDIA-tested precision for Cosmos Reason
+export DTYPE="bfloat16"
+
+# ---- Reason2 (Qwen3-VL) vLLM args ----
+# If deploying Cosmos-Reason2-*, you must manually edit the deployment manifest:
+#   kubernetes/deployment.yaml: uncomment the --reasoning-parser and --media-io-kwargs lines
+#   hyperpod-eks/endpoint.yaml: uncomment the --reasoning-parser and SM_VLLM_REASONING_PARSER lines
+# See the Troubleshooting section in README.md for details.
+
+# ---- Endpoint defaults ----
+export ENDPOINT_NAME="cosmos-reason"
+export INVOCATION_PORT="8000"
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/examples/.gitignore b/3.test_cases/pytorch/vllm/cosmos-reason/examples/.gitignore
@@ -0,0 +1,8 @@
+# Downloaded sample media (fetched by download_samples.sh)
+sample.jpg
+sample_meteor.webm
+*.webm
+*.mp4
+
+# JSONL output from auto_label.py
+*.jsonl
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/examples/README.md b/3.test_cases/pytorch/vllm/cosmos-reason/examples/README.md
@@ -0,0 +1,58 @@
+# Cosmos Reason — Client Examples
+
+Reference Python clients exercising three Cosmos Reason use cases against an OpenAI-compatible
+vLLM endpoint.
+
+| Script | Use case | Latency target |
+|--------|----------|---------------|
+| `image_vqa.py` | Single-image visual Q&A | < 1 s for short reply |
+| `video_qa.py` | Short video clip Q&A | 5-15 s |
+| `auto_label.py` | SDG critic loop — `<think>` reasoning + structured `<answer>` JSON | 10-30 s |
+
+## Setup
+
+```bash
+pip install requests urllib3
+
+# Download sample media (image + video)
+./download_samples.sh
+
+# If the endpoint is only reachable from inside the cluster, port-forward the Service first:
+kubectl port-forward svc/cosmos-reason 8000:8000 &
+
+# OR set the operator-managed endpoint URL:
+export ENDPOINT="https://cosmos-reason-<id>.elb.<region>.amazonaws.com"
+```
+
+By default all scripts hit `http://localhost:8000`. Override with `--endpoint` or
+`$ENDPOINT`. Use `--insecure` if the endpoint uses a self-signed TLS certificate
+(e.g., operator-managed ALB).
+
+## Examples
+
+```bash
+# Single image
+python3 image_vqa.py --image sample.jpg \
+  --prompt "What is the safety risk in this scene?"
+
+# Short video clip
+python3 video_qa.py --video sample_meteor.webm \
+  --prompt "Describe what is happening in this video."
+
+# Batch SDG auto-labeling (with retry on transient errors)
+python3 auto_label.py --image-dir . --output labels.jsonl --limit 1
+
+# With self-signed cert (operator-managed ALB)
+python3 image_vqa.py --endpoint https://cosmos-reason.elb.us-west-2.amazonaws.com \
+  --image sample.jpg --insecure
+```
+
+## Notes
+
+- Cosmos-Reason1 (Qwen2.5-VL) emits `<think>...</think><answer>...</answer>` inline
+  in the `content` field. The scripts here parse those tags.
+- Cosmos-Reason2 (Qwen3-VL) with `--reasoning-parser qwen3` separates `<think>` into
+  the response's `reasoning_content` field. The scripts handle both formats.
+- The model name defaults to the `$MODEL_ID` env var, falling back to `nvidia/Cosmos-Reason1-7B` if it is unset.
+- `auto_label.py` supports `--max-retries N` (default 3) for transient HTTP errors
+  (429, 502, 503, 504) with exponential backoff.
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/examples/auto_label.py b/3.test_cases/pytorch/vllm/cosmos-reason/examples/auto_label.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+"""
+auto_label.py — Synthetic Data Generation (SDG) auto-labeling using Cosmos Reason.
+
+Pattern: AV training-data captioning, as adopted by Uber
+(https://blogs.nvidia.com/blog/nemotron-cosmos-reasoning-enterprise-physical-ai/).
+Each input image gets a structured JSON label plus the model's chain-of-thought
+reasoning trace. Useful for filtering implausible Cosmos-Predict outputs in an
+SDG critic loop, or for bootstrapping training labels.
+
+Output: one JSON object per line (JSONL).
+
+Example:
+    python3 auto_label.py --image-dir ./scenes/ --output labels.jsonl
+    python3 auto_label.py --image-dir ./scenes/ --schema custom_schema.json
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+import requests
+import urllib3
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from image_vqa import encode_image, parse_reasoning_response  # reuse helpers
+
+
+DEFAULT_SCHEMA = {
+    "scene": "string — short description",
+    "objects": "list[string] — primary visible objects",
+    "hazards": "list[string] — identified safety concerns",
+    "weather": "string — clear / rain / snow / fog / cloudy / unknown",
+    "time_of_day": "string — dawn / day / dusk / night / unknown",
+}
+
+
+def make_session(max_retries: int) -> requests.Session:
+    """Build a requests.Session with retry logic for transient HTTP errors."""
+    session = requests.Session()
+    retry = Retry(
+        total=max_retries,
+        backoff_factor=1.0,
+        status_forcelist=[429, 502, 503, 504],
+        allowed_methods=["POST"],
+    )
+    session.mount("http://", HTTPAdapter(max_retries=retry))
+    session.mount("https://", HTTPAdapter(max_retries=retry))
+    return session
+
+
+def extract_json_from_answer(answer: str) -> Optional[dict]:
+    """Try hard to pull a JSON object out of the model's <answer> block."""
+    if not answer:
+        return None
+    # JSON inside ```json ... ``` fence
+    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", answer, re.DOTALL)
+    if fence:
+        try:
+            return json.loads(fence.group(1))
+        except json.JSONDecodeError:
+            pass
+    # Bare JSON object
+    bare = re.search(r"\{.*\}", answer, re.DOTALL)
+    if bare:
+        try:
+            return json.loads(bare.group(0))
+        except json.JSONDecodeError:
+            pass
+    return None
+
+
+def label_image(image_path: Path, endpoint: str, model: str, schema: dict,
+                max_tokens: int, session: requests.Session, verify_tls: bool) -> dict:
+    image_url = encode_image(str(image_path))
+
+    system = (
+        "You are auto-labeling driving scenes for AV training data. "
+        "Output your reasoning in <think>...</think>, then output a JSON label "
+        f"in <answer>...</answer> matching this schema: {json.dumps(schema)}"
+    )
+
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "Label this scene."},
+            ]},
+        ],
+        "max_tokens": max_tokens,
+        "temperature": 0.4,
+    }
+
+    start = time.monotonic()
+    r = session.post(f"{endpoint}/v1/chat/completions",
+                     headers={"Content-Type": "application/json"},
+                     json=payload,
+                     verify=verify_tls,
+                     timeout=300)
+    elapsed_ms = int((time.monotonic() - start) * 1000)
+    r.raise_for_status()
+    data = r.json()
+
+    msg = data["choices"][0]["message"]
+    reasoning, answer = parse_reasoning_response(msg)
+    label = extract_json_from_answer(answer)
+
+    return {
+        "image": str(image_path),
+        "elapsed_ms": elapsed_ms,
+        "completion_tokens": data["usage"]["completion_tokens"],
+        "label": label,
+        "reasoning": reasoning,
+        "raw_answer": answer if not label else None,
+    }
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--endpoint", default=os.environ.get("ENDPOINT", "http://localhost:8000"))
+    parser.add_argument("--model", default=os.environ.get("MODEL_ID", "nvidia/Cosmos-Reason1-7B"))
+    parser.add_argument("--image-dir", required=True, help="Directory containing images to label")
+    parser.add_argument("--output", default="labels.jsonl", help="JSONL output path")
+    parser.add_argument("--schema", help="Path to a JSON file with the label schema (overrides default)")
+    parser.add_argument("--max-tokens", type=int, default=800)
+    parser.add_argument("--limit", type=int, default=0,
+                        help="Process at most N images (0 = unlimited)")
+    parser.add_argument("--max-retries", type=int, default=3,
+                        help="Max retries per image on transient HTTP errors (429/502/503/504)")
+    parser.add_argument("--insecure", action="store_true",
+                        help="Disable TLS certificate verification (for self-signed certs)")
+    args = parser.parse_args()
+
+    verify_tls = not args.insecure
+    if args.insecure:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    session = make_session(args.max_retries)
+
+    schema = DEFAULT_SCHEMA
+    if args.schema:
+        with open(args.schema) as f:
+            schema = json.load(f)
+
+    image_dir = Path(args.image_dir)
+    images = sorted([p for p in image_dir.iterdir()
+                     if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp"}])
+    if args.limit > 0:
+        images = images[:args.limit]
+
+    if not images:
+        print(f"No images found in {image_dir}", file=sys.stderr)
+        return 1
+
+    print(f"Labeling {len(images)} images against {args.endpoint} ({args.model})...")
+
+    with open(args.output, "w") as out:
+        for i, img in enumerate(images, 1):
+            try:
+                result = label_image(img, args.endpoint, args.model, schema,
+                                     args.max_tokens, session, verify_tls)
+                out.write(json.dumps(result) + "\n")
+                out.flush()
+                ok = "OK" if result["label"] else "PARSE_FAILED"
+                print(f"  [{i}/{len(images)}] {img.name} {ok} ({result['elapsed_ms']} ms)")
+            except Exception as exc:  # noqa: BLE001
+                err = {"image": str(img), "error": str(exc)}
+                out.write(json.dumps(err) + "\n")
+                out.flush()
+                print(f"  [{i}/{len(images)}] {img.name} ERROR: {exc}", file=sys.stderr)
+
+    print(f"\nWrote {args.output}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/3.test_cases/pytorch/vllm/cosmos-reason/examples/download_samples.sh b/3.test_cases/pytorch/vllm/cosmos-reason/examples/download_samples.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# Download sample media for the Cosmos Reason example clients.
+#
+# Image: Unsplash (Unsplash License — free for commercial and non-commercial use)
+# Video: Wikimedia Commons (CC BY 3.0)
+set -euo pipefail
+
+cd "$(dirname "$0")"
+
+echo "Downloading sample.jpg (urban street scene from Unsplash)..."
+curl -L -o sample.jpg \
+  "https://images.unsplash.com/photo-1449824913935-59a10b8d2000?w=640"
+
+echo "Downloading sample video from Wikimedia Commons..."
+curl -L -o sample_meteor.webm \
+  "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/2013_Russian_meteor_event_(Magnitogorsk).webm"
+
+echo ""
+echo "Downloaded:"
+ls -lh sample.jpg sample_meteor.webm
+echo ""
+echo "Run examples:"
+echo "  python3 image_vqa.py --image sample.jpg"
+echo "  python3 video_qa.py --video sample_meteor.webm"
+echo "  python3 auto_label.py --image-dir . --limit 1"