From 1cbd56cf58e6e05801645357a51d122285a1587e Mon Sep 17 00:00:00 2001
From: Sahana
Date: Sun, 14 Sep 2025 14:06:41 +0100
Subject: [PATCH 01/10] first version of initial-pos

---
 src/lerobot/scripts/collect_initpos.py | 151 +++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 src/lerobot/scripts/collect_initpos.py

diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py
new file mode 100644
index 0000000000..884ba52b42
--- /dev/null
+++ b/src/lerobot/scripts/collect_initpos.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+import argparse
+from pathlib import Path
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import os
+
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+from torch.utils.data import DataLoader
+
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.scripts.visualize_dataset import EpisodeSampler
+
+# --- EDIT for your robot layout ---
+MOTOR_IDXS = {
+    "shoulder_pan": 0,
+    "shoulder_lift": 1,
+    "elbow_flex": 2,
+    "wrist_flex": 3,
+    "wrist_roll": 4,
+    "gripper_pos": 5,
+}
+# ----------------------------------
+
+def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int,
+                              seconds=10.0, use_state=True, dl_workers: int = 2):
+    """Return (raw, avg) where:
+       raw[motor] -> np.array(T,), avg[motor] -> float
+    """
+    fps = float(dataset.meta.fps)
+    n_frames = int(seconds * fps)
+
+    sampler = EpisodeSampler(dataset, episode_index)
+    loader = DataLoader(dataset,
+                        sampler=sampler,
+                        batch_size=1,
+                        shuffle=False,
+                        num_workers=dl_workers,
+                        pin_memory=False)
+
+    # incremental buffers per motor to avoid big reallocs
+    vals = {m: [] for m in MOTOR_IDXS}
+
+    for i, batch in enumerate(loader):
+        if i >= n_frames:
+            break
+        vec = batch["observation.state"][0] if use_state else batch["action"][0]
+        if isinstance(vec, torch.Tensor):
+            vec = vec.detach().cpu().numpy()
+        for m, idx in MOTOR_IDXS.items():
+            vals[m].append(float(vec[idx]))
+
+    raw = {m: np.asarray(v, dtype=np.float32) for m, v in vals.items()}
+    avg = {m: (float(v.mean()) if v.size else float("nan")) for m, v in raw.items()}
+    return raw, avg
+
+def invert_episode_major(d_ep_motor):
+    if not d_ep_motor:
+        return {}
+    motors = next(iter(d_ep_motor.values())).keys()
+    out = {m: {} for m in motors}
+    for ep, m_dict in d_ep_motor.items():
+        for m, v in m_dict.items():
+            out[m][ep] = v
+    return out
+
+def save_dicts(raw_motor_major, avg_motor_major, outdir: Path):
+    outdir.mkdir(parents=True, exist_ok=True)
+    raw_json = {m: {str(ep): arr.tolist() for ep, arr in eps.items()} for m, eps in raw_motor_major.items()}
+    avg_json = {m: {str(ep): val for ep, val in eps.items()} for m, eps in avg_motor_major.items()}
+    (outdir / "first10s_raw.json").write_text(json.dumps(raw_json))
+    (outdir / "first10s_avg.json").write_text(json.dumps(avg_json))
+
+def plot_episode_means(avg_motor_major, outpath: Path):
+    motors = list(MOTOR_IDXS.keys())
+    n = len(motors)
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(12, 2.2 * n))
+    for i, m in enumerate(motors, 1):
+        plt.subplot(n, 1, i)
+        items = sorted(((int(ep), v) for ep, v in avg_motor_major.get(m, {}).items()), key=lambda x: x[0])
+        if not items:
+            plt.title(f"{m} (no data)"); continue
+        xs = [ep for ep, _ in items]
+        ys = [v for _, v in items]
+        plt.scatter(xs, ys, s=14)
+        plt.xlabel("episode_id"); plt.ylabel("mean @ first 10s"); plt.title(m)
+        plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.savefig(outpath, dpi=180)
+    plt.close()
+
+def main():
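+    # CLI entry point: for every episode, average each motor over the first
+    # `--seconds` of frames (episodes processed in parallel threads), then
+    # write JSON stats and a per-motor scatter plot of episode means.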
+ ap = argparse.ArgumentParser() + ap.add_argument("--repo-id", required=True, type=str) + ap.add_argument("--root", type=Path, default=None) + ap.add_argument("--seconds", type=float, default=10.0) + ap.add_argument("--use-state", action="store_true", help="Use observation.state (default)") + ap.add_argument("--use-action", action="store_true", help="Use action instead of state") + ap.add_argument("--outdir", type=Path, default=Path("initpos_stats")) + ap.add_argument("--dl-workers", type=int, default=2, help="DataLoader workers PER EPISODE") + ap.add_argument("--max-threads", type=int, default=max(1, os.cpu_count() // 2), + help="Max concurrent episodes") + args = ap.parse_args() + + use_state = True + if args.use_action: use_state = False + if args.use_state: use_state = True + + ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) + n_eps = len(ds.episode_data_index["from"]) + + # Parallel over episodes + all_raw_ep_major, all_avg_ep_major = {}, {} + with ThreadPoolExecutor(max_workers=args.max_threads) as ex: + futures = { + ex.submit(collect_first_10s_episode, ds, ep, + seconds=args.seconds, use_state=use_state, + dl_workers=args.dl_workers): ep + for ep in range(n_eps) + } + for fut in as_completed(futures): + ep = futures[fut] + try: + raw, avg = fut.result() + except Exception as e: + # Don't crash the run; record NaNs for this episode + raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} + avg = {m: float("nan") for m in MOTOR_IDXS} + print(f"[warn] episode {ep} failed: {e}") + all_raw_ep_major[ep] = raw + all_avg_ep_major[ep] = avg + + # Convert to motor-major for saving/plotting + raw_motor_major = invert_episode_major(all_raw_ep_major) # motor -> {ep: np.array} + avg_motor_major = invert_episode_major(all_avg_ep_major) # motor -> {ep: float} + + args.outdir.mkdir(parents=True, exist_ok=True) + save_dicts(raw_motor_major, avg_motor_major, args.outdir) + plot_episode_means(avg_motor_major, args.outdir / "episode_means.png") + + fps = float(ds.meta.fps) + print(f"[done] fps={fps:.3f} | episodes={n_eps} | saved:") + print(f" - {args.outdir/'first10s_raw.json'}") + print(f" - {args.outdir/'first10s_avg.json'}") + print(f" - {args.outdir/'episode_means.png'}") + +if __name__ == "__main__": + main() From a6d8bf84cdaa1c4ad32c02ab37ae3c43689f3f66 Mon Sep 17 00:00:00 2001 From: Sahana Date: Sun, 14 Sep 2025 15:39:21 +0100 Subject: [PATCH 02/10] add rag_health --- src/lerobot/scripts/rag_robot_health.py | 587 ++++++++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 src/lerobot/scripts/rag_robot_health.py diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py new file mode 100644 index 0000000000..829eb5584e --- /dev/null +++ b/src/lerobot/scripts/rag_robot_health.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python3 +""" +RAG for Robot Dataset Health — Episode Motor Averages (conversational) + +What this script does +--------------------- +1) Loads per-episode 10s motor averages (motor-major JSON: motor -> {episode_id: value|list|str}) +2) Computes per-motor μ/σ, z-scores per episode, outlier flags, health_score +3) Builds episode "doc" text blobs for Cohere Embed + Rerank +4) Indexes embeddings in FAISS +5) Optional conversational summaries via Cohere Command-R +6) CLI: + - summary : build index (if needed) + print health summary + - query --q "..." 
: semantic search (Embed->ANN->Rerank)
+  - similar --ep N            : episodes similar to ep_N
+  - explain --ep N [--motor]  : WHY outlier + side-by-side PNG
+
+Inputs
+------
+--avg-json    : path to averages JSON (motor -> {ep: float|list|str})
+--frames-json : (optional) ep_id -> [first-frame paths or URLs]
+--repo-dir    : directory to store artifacts (docs.jsonl, stats.json, faiss.index, embeddings.npy)
+
+Install
+-------
+pip install cohere faiss-cpu numpy pillow tqdm
+
+Env
+---
+export COHERE_API_KEY=YOUR_KEY
+
+# 1) Build + conversational summary
+python rag_robot_health.py \
+  --avg-json initpos_stats/Sahana16/record-medicine/first10s_avg.json \
+  --repo-dir Sahana16/record-medicine \
+  --chat --style direct summary
+
+# 2) Conversational query
+python rag_robot_health.py \
+  --avg-json initpos_stats/Sahana16/record-medicine/first10s_avg.json \
+  --repo-dir record-medicine \
+  --chat --style friendly \
+  query --q "outliers on elbow_flex"
+
+"""
+
+from __future__ import annotations
+import os, json, argparse, math
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+from dataclasses import dataclass, asdict
+
+import numpy as np
+from tqdm import tqdm
+from PIL import Image, ImageDraw
+
+import faiss
+import cohere
+
+import os
+os.environ["COHERE_API_KEY"] = "<REDACTED>"
+
+# ---------------------------- Config ----------------------------
+MOTORS = [
+    "shoulder_pan",
+    "shoulder_lift",
+    "elbow_flex",
+    "wrist_flex",
+    "wrist_roll",
+    "gripper_pos",
+]
+
+OUTLIER_Z = 3.0
+CANDIDATE_Z = 2.0
+RERANK_TOP_K = 12
+ANN_CAND_K = 64
+EMBED_MODEL = "embed-english-v3.0"
+RERANK_MODEL = "rerank-english-v3.0"
+CHAT_MODEL = "command-r"
+
+# ---------------------------- Data types ----------------------------
+@dataclass
+class EpisodeDoc:
+    ep_id: str
+    split: str | None
+    task: str | None
+    motor_avgs: Dict[str, float]
+    motor_mu: Dict[str, float]
+    motor_sigma: Dict[str, float]
+    motor_z: Dict[str, float]
+    outlier_flags: Dict[str, bool]
+    health_score: float
+    first_frames: List[str] | None
+
+    def to_text_blob(self) -> str:
+        def fmt(v):
+            if v is None or (isinstance(v, float) and (math.isnan(v) or math.isinf(v))):
+                return "nan"
+            return f"{v:.6f}"
+        parts = []
+        parts.append(f"ep_id: {self.ep_id}")
+        if self.task: parts.append(f"task: {self.task}")
+        if self.split: parts.append(f"split: {self.split}")
+        parts.append("motor_avgs: {" + ", ".join(f"{m}:{fmt(self.motor_avgs.get(m, float('nan')))}" for m in MOTORS) + "}")
+        parts.append("motor_z: {" + ", ".join(f"{m}:{fmt(self.motor_z.get(m, float('nan')))}" for m in MOTORS) + "}")
+        parts.append("outliers: {" + ", ".join(f"{m}:{'true' if self.outlier_flags.get(m, False) else 'false'}" for m in MOTORS) + "}")
+        parts.append(f"health_score: {fmt(self.health_score)}")
+        return "; ".join(parts)
+
+# ---------------------------- IO helpers ----------------------------
+def _looks_num_str(s: str) -> bool:
+    s = s.strip()
+    if not s: return False
+    if s[0] in "+-": s = s[1:]
+    return s.replace(".", "", 1).isdigit()
+
+def _coerce_scalar(val) -> float:
+    if isinstance(val, (int, float)):
+        return float(val)
+    if isinstance(val, str):
+        return float(val.strip()) if _looks_num_str(val) else float("nan")
+    if isinstance(val, (list, tuple)):
+        nums = []
+        for x in val:
+            if isinstance(x, (int, float)): nums.append(float(x))
+            elif isinstance(x, str) and _looks_num_str(x): nums.append(float(x))
+        return float(np.mean(nums)) if nums else float("nan")
+    return float("nan")
+
+def load_avg_json(path: Path) -> Dict[str, 
Dict[str, float]]: + """ + Accepts motor-major JSON with floats/strings/lists. + Returns: motor -> {episode_id: float_mean} + """ + raw = json.loads(Path(path).read_text()) + filtered: Dict[str, Dict[str, float]] = {} + bad = [] + for m in MOTORS: + motor_map = {} + for ep, val in (raw.get(m, {}) or {}).items(): + v = _coerce_scalar(val) + if math.isnan(v): + bad.append((m, ep, val)) + motor_map[str(ep)] = v + filtered[m] = motor_map + if bad: + print(f"[warn] {len(bad)} invalid entries; set to NaN. First few:") + for b in bad[:5]: + print(" motor:", b[0], "episode:", b[1], "value:", str(b[2])[:120]) + return filtered + +def load_frames_json(path: Path | None) -> Dict[str, List[str]]: + if not path: return {} + return {str(k): v for k, v in json.loads(Path(path).read_text()).items()} + +def ensure_dir(p: Path): + p.mkdir(parents=True, exist_ok=True) + +# ---------------------------- Stats & Docs ---------------------------- +def compute_mu_sigma(avg_motor_major: Dict[str, Dict[str, float]]) -> Tuple[Dict[str, float], Dict[str, float]]: + mu, sigma = {}, {} + for m in MOTORS: + vals = np.array(list(avg_motor_major.get(m, {}).values()), dtype=np.float64) + if vals.size == 0: + mu[m], sigma[m] = float("nan"), float("nan") + else: + mu[m] = float(np.nanmean(vals)) + sigma[m] = float(np.nanstd(vals, ddof=1) if vals.size > 1 else 0.0) + return mu, sigma + +def build_episode_table(avg_motor_major: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]: + episodes = set() + for m in MOTORS: + episodes.update(avg_motor_major.get(m, {}).keys()) + table = {ep: {} for ep in episodes} + for m in MOTORS: + for ep, val in avg_motor_major.get(m, {}).items(): + table[ep][m] = val # already coerced + return table + +def zscore(value: float, mu: float, sigma: float) -> float: + if sigma is None or sigma == 0 or math.isnan(sigma): + return float("nan") + return (value - mu) / sigma + +def health_score_from_z(zs: Dict[str, float]) -> float: + abs_vals = [abs(v) for v in zs.values() if not math.isnan(v)] + if not abs_vals: return float("nan") + return float(1.0 / (1.0 + (sum(abs_vals) / max(1, len(abs_vals))))) + +def build_docs( + avg_motor_major: Dict[str, Dict[str, float]], + frames_map: Dict[str, List[str]] | None = None, + global_task: str | None = None, + global_split: str | None = None +) -> Tuple[List[EpisodeDoc], Dict[str, float], Dict[str, float]]: + mu, sigma = compute_mu_sigma(avg_motor_major) + ep_table = build_episode_table(avg_motor_major) + docs: List[EpisodeDoc] = [] + for ep_id, motor_avgs in ep_table.items(): + motor_z = {m: zscore(motor_avgs.get(m, float("nan")), mu.get(m, float("nan")), sigma.get(m, float("nan"))) for m in MOTORS} + flags = {m: (abs(motor_z.get(m, float("nan"))) > OUTLIER_Z) if not math.isnan(motor_z.get(m, float("nan"))) else False for m in MOTORS} + health = health_score_from_z(motor_z) + frames = (frames_map or {}).get(str(ep_id)) + docs.append(EpisodeDoc( + ep_id=str(ep_id), + split=global_split, + task=global_task, + motor_avgs={m: motor_avgs.get(m, float("nan")) for m in MOTORS}, + motor_mu=mu, + motor_sigma=sigma, + motor_z=motor_z, + outlier_flags=flags, + health_score=health, + first_frames=frames + )) + return docs, mu, sigma + +# ---------------------------- Cohere & FAISS ---------------------------- +def get_cohere() -> cohere.Client: + key = os.environ.get("COHERE_API_KEY") + if not key: + raise RuntimeError("Set COHERE_API_KEY environment variable.") + return cohere.Client(key) + +def embed_texts_cohere(texts: List[str], input_type: str) -> np.ndarray: + 
co = get_cohere()
+    out = co.embed(texts=texts, model=EMBED_MODEL, input_type=input_type)
+    embs = np.array(out.embeddings, dtype=np.float32)
+    norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
+    return embs / norms
+
+def build_faiss(embs: np.ndarray) -> faiss.IndexFlatIP:
+    d = embs.shape[1]
+    index = faiss.IndexFlatIP(d)  # cosine via dot on normalized vectors
+    index.add(embs)
+    return index
+
+def ann_search(index: faiss.IndexFlatIP, q_vec: np.ndarray, k: int = ANN_CAND_K) -> Tuple[np.ndarray, np.ndarray]:
+    D, I = index.search(q_vec[None, :], k)
+    return I[0], D[0]
+
+def rerank(co: cohere.Client, query: str, doc_texts: List[str], k: int = RERANK_TOP_K) -> List[int]:
+    rr = co.rerank(model=RERANK_MODEL, query=query, documents=doc_texts, top_n=min(k, len(doc_texts)))
+    return [r.index for r in rr.results]
+
+# ---------------------------- Conversational layer ----------------------------
+def chatify(prompt: str, user_context: str = "", style: str = "direct") -> str:
+    """
+    Use Cohere Command-R to turn structured findings into a concise, conversational blurb.
+    """
+    co = get_cohere()
+    preamble = (
+        "You are an expert robotics dataset analyst. "
+        "Be direct, clear, and actionable in 4-8 sentences. "
+        "If listing steps, keep them short and specific."
+    )
+    full_prompt = (
+        f"{prompt}\n\n"
+        f"Tone: {style}.\n"
+        f"Context:\n{user_context}\n"
+        f"Respond as one concise paragraph or a few short bullets."
+    )
+    try:
+        resp = co.chat(model=CHAT_MODEL, message=f"{preamble}\n\n{full_prompt}")  # persona + prompt in one message
+        return (resp.text or "").strip()
+    except Exception as e:
+        return f"(chat unavailable: {e})"
+
+# ---------------------------- Reports ----------------------------
+def dataset_health_summary(docs: List[EpisodeDoc]) -> Dict[str, Any]:
+    motor_outliers = {m: 0 for m in MOTORS}
+    for d in docs:
+        for m in MOTORS:
+            if d.outlier_flags.get(m, False):
+                motor_outliers[m] += 1
+    ranked = [d for d in docs if not math.isnan(d.health_score)]
+    ranked.sort(key=lambda x: x.health_score)  # low is worse
+    worst = ranked[:min(5, len(ranked))]
+    best = list(reversed(ranked))[:min(5, len(ranked))]
+    return {
+        "motor_outlier_counts": motor_outliers,
+        "worst_eps": [(d.ep_id, round(d.health_score, 4)) for d in worst],
+        "best_eps": [(d.ep_id, round(d.health_score, 4)) for d in best],
+    }
+
+def print_summary(summary: Dict[str, Any]):
+    print("\n== Dataset Health Summary ==")
+    print(f"Outlier counts per motor (|z| > {OUTLIER_Z:.1f}):")
+    for m in MOTORS:
+        print(f"  - {m:14s}: {summary['motor_outlier_counts'][m]}")
+    print("\nWorst episodes by health_score:")
+    for ep, sc in summary["worst_eps"]:
+        print(f"  - ep {ep:>6s}: {sc:.4f}")
+    print("\nBest episodes by health_score:")
+    for ep, sc in summary["best_eps"]:
+        print(f"  - ep {ep:>6s}: {sc:.4f}")
+    print()
+
+# ---------------------------- Visual Explain ----------------------------
+def top_outlier_motors(doc, top_k=3):
+    items = []
+    for m in MOTORS:
+        z = doc["motor_z"].get(m, float("nan"))
+        if math.isnan(z): continue
+        items.append((m, z,
+                      doc["motor_mu"].get(m, float("nan")),
+                      doc["motor_sigma"].get(m, float("nan")),
+                      doc["motor_avgs"].get(m, float("nan"))))
+    items.sort(key=lambda x: abs(x[1]), reverse=True)
+    return items[:top_k]
+
+def find_healthy_exemplars(docs_json, embs, index, target_idx, focus_motor=None, k=3):
+    I, _ = ann_search(index, embs[target_idx], k=min(256, len(docs_json)))
+    candidates = [i for i in I if i != target_idx]
+    def ok(d):
+        if focus_motor:
+            z = d["motor_z"].get(focus_motor)
+            return (z is not None) and (not math.isnan(z)) and (abs(z) 
< 1.0) + zs = [abs(v) for v in d["motor_z"].values() if not math.isnan(v)] + return zs and (sum(zs)/len(zs) < 1.0) + healthy = [i for i in candidates if ok(docs_json[i])] + return healthy[:k] if healthy else candidates[:k] + +def load_first_images(paths, max_per_ep=2, target_size=(512, 384)): + imgs = [] + for p in (paths or [])[:max_per_ep]: + try: + im = Image.open(p).convert("RGB") + im.thumbnail(target_size) + imgs.append(im) + except Exception: + continue + return imgs + +def make_side_by_side(target_imgs, exemplar_imgs_list, caption, out_png_path): + pad = 16 + col_w = max([img.width for img in (target_imgs + sum(exemplar_imgs_list, []))] + [320]) + row_h = sum(img.height for img in target_imgs) + (len(target_imgs)-1)*pad if target_imgs else 240 + rows = max(1, max((len(x) for x in exemplar_imgs_list), default=1)) + right_cols = len(exemplar_imgs_list) + width = pad + col_w + pad + right_cols*(col_w + pad) + pad + height = pad + max(row_h, rows*((target_imgs[0].height if target_imgs else 240) + pad)) + 120 + canvas = Image.new("RGB", (width, height), (255,255,255)) + draw = ImageDraw.Draw(canvas) + draw.text((pad, pad), caption, fill=(0,0,0)) + # target (left) + y = pad + 32; x = pad + for img in target_imgs: + canvas.paste(img, (x, y)); y += img.height + pad + # exemplars (right) + base_x = pad + col_w + pad + for col, imgs in enumerate(exemplar_imgs_list): + y = pad + 32; x = base_x + col*(col_w + pad) + for img in imgs: + canvas.paste(img, (x, y)); y += img.height + pad + canvas.save(out_png_path) + return out_png_path + +# ---------------------------- CLI actions ---------------------------- +def ensure_repo(docs: List[EpisodeDoc], repo_dir: Path): + ensure_dir(repo_dir) + docs_path = repo_dir / "docs.jsonl" + stats_path = repo_dir / "stats.json" + embs_path = repo_dir / "embeddings.npy" + index_path = repo_dir / "faiss.index" + + with docs_path.open("w") as f: + for d in docs: + rec = asdict(d) + rec["text_blob"] = d.to_text_blob() + f.write(json.dumps(rec) + "\n") + + texts = [d.to_text_blob() for d in docs] + embs = embed_texts_cohere(texts, input_type="search_document") + np.save(embs_path, embs) + + index = build_faiss(embs) + faiss.write_index(index, str(index_path)) + + stats = { + "n_docs": len(docs), + "motors": MOTORS, + "embed_model": EMBED_MODEL, + "rerank_model": RERANK_MODEL, + "outlier_z": OUTLIER_Z, + "candidate_z": CANDIDATE_Z, + } + stats_path.write_text(json.dumps(stats, indent=2)) + + print(f"[repo] wrote {docs_path}") + print(f"[repo] wrote {embs_path}") + print(f"[repo] wrote {index_path}") + print(f"[repo] wrote {stats_path}") + +def load_repo(repo_dir: Path) -> Tuple[List[Dict[str, Any]], np.ndarray, faiss.IndexFlatIP]: + docs_path = repo_dir / "docs.jsonl" + embs_path = repo_dir / "embeddings.npy" + index_path = repo_dir / "faiss.index" + docs_json = [json.loads(l) for l in docs_path.read_text().splitlines() if l.strip()] + embs = np.load(embs_path) + index = faiss.read_index(str(index_path)) + return docs_json, embs, index + +def cmd_summary(args): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + summary = dataset_health_summary(docs) + print_summary(summary) + + if args.chat: + counts = "\n".join([f"{m}: {summary['motor_outlier_counts'][m]}" for m in MOTORS]) + worst = ", ".join([f"ep {e} (score {s})" for e, s in summary["worst_eps"]]) + best = ", 
".join([f"ep {e} (score {s})" for e, s in summary["best_eps"]]) + ctx = ( + f"Outlier counts:\n{counts}\n\n" + f"Worst episodes: {worst}\nBest episodes: {best}\n" + f"Outlier threshold: |z| > {OUTLIER_Z}" + ) + prompt = ("Summarize dataset health for robotics training. " + "Explain implications and give 2–3 concrete next steps.") + print(chatify(prompt, ctx, style=args.style)) + +def cmd_query(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + texts = [d["text_blob"] for d in docs_json] + + co = get_cohere() + q_emb = embed_texts_cohere([args.q], input_type="search_query")[0] + idxs, _ = ann_search(index, q_emb, k=min(ANN_CAND_K, len(texts))) + cands = [texts[i] for i in idxs] + order = rerank(co, args.q, cands, k=min(RERANK_TOP_K, len(cands))) + final_ids = [idxs[i] for i in order] + + print("\n== Query Results ==") + for rank, i in enumerate(final_ids, 1): + d = docs_json[i] + print(f"{rank:2d}. ep {d['ep_id']} | health={d['health_score']:.4f} | outliers={[m for m,f in d['outlier_flags'].items() if f]}") + if d.get("first_frames"): + print(f" frames: {', '.join(d['first_frames'][:3])}") + print() + + if args.chat: + top_blobs = "\n\n".join([docs_json[i]["text_blob"] for i in final_ids[:5]]) + prompt = (f"User question: {args.q}\n" + "Answer conversationally for a robotics dataset engineer. " + "Reference episode IDs and motors likely at fault. Suggest next checks.") + print(chatify(prompt, top_blobs, style=args.style)) + +def cmd_similar(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + texts = [d["text_blob"] for d in docs_json] + ep_idx_map = {d["ep_id"]: i for i, d in enumerate(docs_json)} + if str(args.ep) not in ep_idx_map: + raise SystemExit(f"ep {args.ep} not found in repo docs") + + i = ep_idx_map[str(args.ep)] + co = get_cohere() + + q_vec = embs[i] + idxs, _ = ann_search(index, q_vec, k=min(ANN_CAND_K, len(texts))) + idxs = [j for j in idxs if j != i] + cands_text = [texts[j] for j in idxs] + order = rerank(co, texts[i], cands_text, k=min(RERANK_TOP_K, len(cands_text))) + final_ids = [idxs[j] for j in order] + + print(f"\n== Episodes similar to ep {args.ep} ==") + for rank, j in enumerate(final_ids, 1): + d = docs_json[j] + print(f"{rank:2d}. ep {d['ep_id']} | health={d['health_score']:.4f} | outliers={[m for m,f in d['outlier_flags'].items() if f]}") + if d.get("first_frames"): + print(f" frames: {', '.join(d['first_frames'][:3])}") + print() + + if args.chat: + top_blobs = "\n\n".join([docs_json[j]["text_blob"] for j in final_ids[:5]]) + anchor = docs_json[i]["text_blob"] + prompt = (f"Compare similar episodes to anchor episode {args.ep}. 
" + "Call out motor dimensions that differ and propose a quick validation checklist.") + ctx = f"ANCHOR:\n{anchor}\n\nSIMILARS:\n{top_blobs}" + print(chatify(prompt, ctx, style=args.style)) + +def cmd_explain(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + ep_idx_map = {d["ep_id"]: i for i, d in enumerate(docs_json)} + if str(args.ep) not in ep_idx_map: + raise SystemExit(f"ep {args.ep} not found in repo docs") + + i = ep_idx_map[str(args.ep)] + d = docs_json[i] + + offenders = top_outlier_motors(d, top_k=3) + focus_motor = args.motor if args.motor else (offenders[0][0] if offenders else None) + ex_ids = find_healthy_exemplars(docs_json, embs, index, i, focus_motor=focus_motor, k=args.k) + + target_imgs = load_first_images(d.get("first_frames"), max_per_ep=2) + exemplar_imgs_list = [load_first_images(docs_json[j].get("first_frames"), max_per_ep=2) for j in ex_ids] + + out_png = args.repo_dir / f"explain_ep{args.ep}.png" + cap = f"Episode {args.ep}: outlier vs healthy exemplars (focus motor: {focus_motor})" + make_side_by_side(target_imgs, exemplar_imgs_list, cap, out_png) + + print(f"\n== Explanation for episode {args.ep} ==") + if offenders: + for (m, z, mu, sigma, val) in offenders: + side = "high" if z > 0 else "low" + label = " -> OUTLIER" if abs(z) > OUTLIER_Z else (" -> CANDIDATE" if abs(z) > CANDIDATE_Z else "") + print(f"- {m}: z={z:.2f} ({side}); value={val:.4f}, mean={mu:.4f}, std={sigma:.4f}{label}") + else: + print(f"- No strong offenders; health_score={d['health_score']:.4f}") + + print("\nHealthy exemplars shown:") + for j in ex_ids: + dj = docs_json[j] + max_abs_z = max((abs(v) for v in dj['motor_z'].values() if not math.isnan(v)), default=float("nan")) + print(f"- ep {dj['ep_id']} | health={dj['health_score']:.4f} | max|z|={max_abs_z:.2f}") + + print(f"\nSaved side-by-side: {out_png}\n") + + if args.chat: + lines = [] + for (m, z, mu, sigma, val) in offenders: + side = "high" if z > 0 else "low" + lines.append(f"{m}: z={z:.2f} ({side}), value={val:.4f}, mean={mu:.4f}, std={sigma:.4f}") + ex_list = ", ".join([docs_json[j]["ep_id"] for j in ex_ids]) if ex_ids else "none" + ctx = ( + f"Episode {args.ep} offenders:\n" + "\n".join(lines) + + f"\nExemplars: {ex_list}\nOutlier threshold: |z| > {OUTLIER_Z}\nImage: {out_png}" + ) + prompt = ("Explain in plain language why this episode is an outlier and " + "what the images likely show. 
Give 2 next steps to confirm/fix.") + print(chatify(prompt, ctx, style=args.style)) + +# ---------------------------- Main ---------------------------- +def parse_args(): + p = argparse.ArgumentParser(description="RAG on motor averages: health + retrieval (conversational)") + p.add_argument("--avg-json", type=Path, required=True, help="Path to motor-major averages JSON") + p.add_argument("--frames-json", type=Path, default=None, help="Optional: ep_id -> [first-frame paths]") + p.add_argument("--repo-dir", type=Path, required=True, help="Directory to store docs/index/embeddings") + p.add_argument("--task", type=str, default=None, help="Optional task label") + p.add_argument("--split", type=str, default=None, help="Optional split label (train/eval)") + p.add_argument("--chat", action="store_true", help="Generate a conversational summary/answer via Cohere") + p.add_argument("--style", type=str, default="direct", help="Tone: direct | friendly | technical | executive") + sub = p.add_subparsers(dest="cmd", required=True) + + s1 = sub.add_parser("summary", help="Build index (if needed) and print dataset health summary") + s1.set_defaults(func=cmd_summary) + + s2 = sub.add_parser("query", help="Text search over episode docs (Embed->ANN->Rerank)") + s2.add_argument("--q", type=str, required=True) + s2.set_defaults(func=cmd_query) + + s3 = sub.add_parser("similar", help="Episodes similar to a given episode") + s3.add_argument("--ep", type=str, required=True) + s3.set_defaults(func=cmd_similar) + + s4 = sub.add_parser("explain", help="Explain why an episode is an outlier; render images vs healthy exemplars") + s4.add_argument("--ep", type=str, required=True) + s4.add_argument("--motor", type=str, default=None, choices=MOTORS + [None]) + s4.add_argument("--k", type=int, default=3, help="Number of healthy exemplars to show") + s4.set_defaults(func=cmd_explain) + + return p.parse_args() + +if __name__ == "__main__": + args = parse_args() + args.func(args) From ea91b98f8c75484414771f1bd32d0d4a93f63c7f Mon Sep 17 00:00:00 2001 From: Sahana Date: Mon, 6 Oct 2025 22:17:23 +0100 Subject: [PATCH 03/10] add --- src/lerobot/scripts/collect_initpos.py | 108 ++++++++++++++++++++----- 1 file changed, 87 insertions(+), 21 deletions(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index 884ba52b42..73cc2645d6 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -24,10 +24,51 @@ } # ---------------------------------- +def chw_to_hwc_uint8(img_t: torch.Tensor) -> np.ndarray: + """ + Convert CxHxW (float32 in [0,1] or uint8) -> HxWxC uint8. + """ + if isinstance(img_t, torch.Tensor): + t = img_t.detach().cpu() + else: + raise TypeError("Expected torch.Tensor for image") + if t.dtype == torch.float32: + t = (t.clamp(0, 1) * 255.0).to(torch.uint8) + elif t.dtype != torch.uint8: + t = t.to(torch.uint8) + assert t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2], f"Expected CxHxW, got {tuple(t.shape)}" + return t.permute(1, 2, 0).numpy() + +def save_first_frames_from_batch(batch, dataset: LeRobotDataset, ep: int, frames_dir: Path) -> list[str]: + """ + Save first-frame images for all available cameras in this batch (assumes batch_size=1). + Returns list of saved file paths. 
+ """ + saved = [] + frames_dir.mkdir(parents=True, exist_ok=True) + cam_keys = getattr(dataset.meta, "camera_keys", []) + for cam in cam_keys: + if cam in batch: + img = batch[cam][0] # CxHxW + try: + arr = chw_to_hwc_uint8(img) + out_path = frames_dir / f"episode_{ep}_{cam}.png" + # Use PIL to write (matplotlib is slower; cv2 adds dep). PIL is bundled via matplotlib. + from PIL import Image + Image.fromarray(arr).save(out_path) + saved.append(str(out_path)) + except Exception as e: + print(f"[warn] failed saving first frame for ep {ep} cam {cam}: {e}") + return saved + def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, - seconds=10.0, use_state=True, dl_workers: int = 2): - """Return (raw, avg) where: - raw[motor] -> np.array(T,), avg[motor] -> float + seconds=10.0, use_state=True, dl_workers: int = 2, + frames_dir: Path | None = None) -> tuple[dict, dict, list[str]]: + """ + Return (raw, avg, first_frames) + raw[motor] -> np.array(T,) + avg[motor] -> float + first_frames -> list of saved file paths for t=0 (one per camera if available) """ fps = float(dataset.meta.fps) n_frames = int(seconds * fps) @@ -40,10 +81,16 @@ def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, num_workers=dl_workers, pin_memory=False) - # incremental buffers per motor to avoid big reallocs vals = {m: [] for m in MOTOR_IDXS} + first_frames_paths: list[str] = [] + grabbed_first = False for i, batch in enumerate(loader): + # Save first frames exactly at the first yielded sample + if not grabbed_first and frames_dir is not None: + first_frames_paths = save_first_frames_from_batch(batch, dataset, episode_index, frames_dir) + grabbed_first = True + if i >= n_frames: break vec = batch["observation.state"][0] if use_state else batch["action"][0] @@ -54,7 +101,7 @@ def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, raw = {m: np.asarray(v, dtype=np.float32) for m, v in vals.items()} avg = {m: (float(v.mean()) if v.size else float("nan")) for m, v in raw.items()} - return raw, avg + return raw, avg, first_frames_paths def invert_episode_major(d_ep_motor): if not d_ep_motor: @@ -76,7 +123,6 @@ def save_dicts(raw_motor_major, avg_motor_major, outdir: Path): def plot_episode_means(avg_motor_major, outpath: Path): motors = list(MOTOR_IDXS.keys()) n = len(motors) - import matplotlib.pyplot as plt plt.figure(figsize=(12, 2.2 * n)) for i, m in enumerate(motors, 1): plt.subplot(n, 1, i) @@ -103,35 +149,48 @@ def main(): ap.add_argument("--dl-workers", type=int, default=2, help="DataLoader workers PER EPISODE") ap.add_argument("--max-threads", type=int, default=max(1, os.cpu_count() // 2), help="Max concurrent episodes") + ap.add_argument("--save-first-frames", action="store_true", + help="Save t=0 frames per episode per camera into outdir/first_frames and write first_frames.json") args = ap.parse_args() use_state = True if args.use_action: use_state = False if args.use_state: use_state = True - ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) + ds = LeRobotDataset(args.repo-id if hasattr(args, "repo-id") else args.repo_id, root=args.root, tolerance_s=1e-4) n_eps = len(ds.episode_data_index["from"]) + frames_dir = (args.outdir / "first_frames") if args.save_first_frames else None + if frames_dir is not None: + frames_dir.mkdir(parents=True, exist_ok=True) + # Parallel over episodes all_raw_ep_major, all_avg_ep_major = {}, {} + first_frames_map: dict[str, list[str]] = {} + + def work(ep: int): + try: + raw, avg, frames = 
collect_first_10s_episode( + ds, ep, + seconds=args.seconds, use_state=use_state, + dl_workers=args.dl_workers, + frames_dir=frames_dir + ) + return ep, raw, avg, frames + except Exception as e: + print(f"[warn] episode {ep} failed: {e}") + raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} + avg = {m: float("nan") for m in MOTOR_IDXS} + return ep, raw, avg, [] + with ThreadPoolExecutor(max_workers=args.max_threads) as ex: - futures = { - ex.submit(collect_first_10s_episode, ds, ep, - seconds=args.seconds, use_state=use_state, - dl_workers=args.dl_workers): ep - for ep in range(n_eps) - } + futures = [ex.submit(work, ep) for ep in range(n_eps)] for fut in as_completed(futures): - ep = futures[fut] - try: - raw, avg = fut.result() - except Exception as e: - # Don't crash the run; record NaNs for this episode - raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} - avg = {m: float("nan") for m in MOTOR_IDXS} - print(f"[warn] episode {ep} failed: {e}") + ep, raw, avg, frames = fut.result() all_raw_ep_major[ep] = raw all_avg_ep_major[ep] = avg + if args.save_first_frames: + first_frames_map[str(ep)] = frames # Convert to motor-major for saving/plotting raw_motor_major = invert_episode_major(all_raw_ep_major) # motor -> {ep: np.array} @@ -141,11 +200,18 @@ def main(): save_dicts(raw_motor_major, avg_motor_major, args.outdir) plot_episode_means(avg_motor_major, args.outdir / "episode_means.png") + # Save first_frames.json if requested + if args.save_first_frames: + (args.outdir / "first_frames.json").write_text(json.dumps(first_frames_map, indent=2)) + fps = float(ds.meta.fps) print(f"[done] fps={fps:.3f} | episodes={n_eps} | saved:") print(f" - {args.outdir/'first10s_raw.json'}") print(f" - {args.outdir/'first10s_avg.json'}") print(f" - {args.outdir/'episode_means.png'}") + if args.save_first_frames: + print(f" - {args.outdir/'first_frames.json'}") + print(f" - {frames_dir}/*") if __name__ == "__main__": main() From f1835bb90b92ad6c4ba513e96360dc0c87cab6d0 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:25 +0100 Subject: [PATCH 04/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 829eb5584e..409e4fd4cc 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -45,7 +45,10 @@ """ from __future__ import annotations -import os, json, argparse, math +import os +import json +import argparse +import math from pathlib import Path from typing import Dict, List, Tuple, Any from dataclasses import dataclass, asdict From 0affb9cb2ae0ada5c972ab0abc8aa830a2484a34 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:37 +0100 Subject: [PATCH 05/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 409e4fd4cc..44cc3eca89 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -60,7 +60,6 @@ import faiss import cohere -import os os.environ["COHERE_API_KEY"] = 
"6V7WaFPKcEnSC8iF6HTca8HHkhvIOJVuVg7Xzer7" # ---------------------------- Config ---------------------------- From de11127a1ea8bf2199eff44664c6f9e86634bb42 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:56 +0100 Subject: [PATCH 06/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 44cc3eca89..a0586b56e2 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -60,7 +60,6 @@ import faiss import cohere -os.environ["COHERE_API_KEY"] = "6V7WaFPKcEnSC8iF6HTca8HHkhvIOJVuVg7Xzer7" # ---------------------------- Config ---------------------------- MOTORS = [ From ba2e37b20da1b6c5492eb6e6af805ec81a03b55c Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:20:41 +0100 Subject: [PATCH 07/10] Update src/lerobot/scripts/collect_initpos.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/collect_initpos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index 73cc2645d6..f477de6715 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -157,7 +157,7 @@ def main(): if args.use_action: use_state = False if args.use_state: use_state = True - ds = LeRobotDataset(args.repo-id if hasattr(args, "repo-id") else args.repo_id, root=args.root, tolerance_s=1e-4) + ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) n_eps = len(ds.episode_data_index["from"]) frames_dir = (args.outdir / "first_frames") if args.save_first_frames else None From 39a2f2687865d5f258dbd515a7b20fd74d5df481 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:26:55 +0100 Subject: [PATCH 08/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index a0586b56e2..b5e4896ebd 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -112,7 +112,7 @@ def fmt(v): def _looks_num_str(s: str) -> bool: s = s.strip() if not s: return False - if s[0] in "+-": s = s[1:] + if s[0] in ('+', '-'): s = s[1:] return s.replace(".", "", 1).isdigit() def _coerce_scalar(val) -> float: From ca4147765e49a2e73472685d99c92774b3f273cd Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:27:09 +0100 Subject: [PATCH 09/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index b5e4896ebd..ac83525153 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -200,7 +200,9 @@ def build_docs( docs: List[EpisodeDoc] = [] for ep_id, motor_avgs in ep_table.items(): motor_z = {m: 
zscore(motor_avgs.get(m, float("nan")), mu.get(m, float("nan")), sigma.get(m, float("nan"))) for m in MOTORS} - flags = {m: (abs(motor_z.get(m, float("nan"))) > OUTLIER_Z) if not math.isnan(motor_z.get(m, float("nan"))) else False for m in MOTORS} + flags = {m: (abs(z) > OUTLIER_Z) if not math.isnan(z) else False + for m in MOTORS + for z in [motor_z.get(m, float("nan"))]} health = health_score_from_z(motor_z) frames = (frames_map or {}).get(str(ep_id)) docs.append(EpisodeDoc( From b210ed97fbc56a9a69734971d4eac66e3d8c5899 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:27:16 +0100 Subject: [PATCH 10/10] Update src/lerobot/scripts/collect_initpos.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/collect_initpos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index f477de6715..9300f9642f 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -36,7 +36,8 @@ def chw_to_hwc_uint8(img_t: torch.Tensor) -> np.ndarray: t = (t.clamp(0, 1) * 255.0).to(torch.uint8) elif t.dtype != torch.uint8: t = t.to(torch.uint8) - assert t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2], f"Expected CxHxW, got {tuple(t.shape)}" + if not (t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2]): + raise ValueError(f"Expected input tensor of shape CxHxW (C <= H and C <= W), got {tuple(t.shape)}") return t.permute(1, 2, 0).numpy() def save_first_frames_from_batch(batch, dataset: LeRobotDataset, ep: int, frames_dir: Path) -> list[str]:
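For a quick sanity check of the outlier math introduced above (zscore and health_score_from_z in rag_robot_health.py), here is a minimal standalone sketch with synthetic numbers — separate from the patches themselves, not project code:

    # One motor whose 10s episode mean sits well above the population mean.
    mu, sigma = 0.50, 0.10      # per-motor population stats (synthetic)
    value = 0.87                # this episode's mean for that motor
    z = (value - mu) / sigma    # 3.7 -> flagged, since |z| > OUTLIER_Z (3.0)

    # health_score = 1 / (1 + mean(|z|)) over motors with valid z-scores.
    zs = {"elbow_flex": z, "wrist_roll": 0.4}
    mean_abs = sum(abs(v) for v in zs.values()) / len(zs)  # 2.05
    health = 1.0 / (1.0 + mean_abs)                        # ~0.328; lower = less healthy
    print(f"z={z:.2f} health={health:.3f}")

An episode with all motors near the population mean approaches health_score 1.0, which is why cmd_summary ranks episodes ascending by health_score to surface the worst ones first.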