From 1cbd56cf58e6e05801645357a51d122285a1587e Mon Sep 17 00:00:00 2001
From: Sahana
Date: Sun, 14 Sep 2025 14:06:41 +0100
Subject: [PATCH 01/10] first version of initial-pos

---
 src/lerobot/scripts/collect_initpos.py | 151 +++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 src/lerobot/scripts/collect_initpos.py

diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py
new file mode 100644
index 0000000000..884ba52b42
--- /dev/null
+++ b/src/lerobot/scripts/collect_initpos.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+import argparse
+from pathlib import Path
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import os
+
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+from torch.utils.data import DataLoader
+
+from lerobot.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.scripts.visualize_dataset import EpisodeSampler
+
+# --- EDIT for your robot layout ---
+MOTOR_IDXS = {
+    "shoulder_pan": 0,
+    "shoulder_lift": 1,
+    "elbow_flex": 2,
+    "wrist_flex": 3,
+    "wrist_roll": 4,
+    "gripper_pos": 5,
+}
+# ----------------------------------
+
+def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int,
+                              seconds=10.0, use_state=True, dl_workers: int = 2):
+    """Return (raw, avg) where:
+       raw[motor] -> np.array(T,), avg[motor] -> float
+    """
+    fps = float(dataset.meta.fps)
+    n_frames = int(seconds * fps)
+
+    sampler = EpisodeSampler(dataset, episode_index)
+    loader = DataLoader(dataset,
+                        sampler=sampler,
+                        batch_size=1,
+                        shuffle=False,
+                        num_workers=dl_workers,
+                        pin_memory=False)
+
+    # incremental buffers per motor to avoid big reallocs
+    vals = {m: [] for m in MOTOR_IDXS}
+
+    for i, batch in enumerate(loader):
+        if i >= n_frames:
+            break
+        vec = batch["observation.state"][0] if use_state else batch["action"][0]
+        if isinstance(vec, torch.Tensor):
+            vec = vec.detach().cpu().numpy()
+        for m, idx in MOTOR_IDXS.items():
+            vals[m].append(float(vec[idx]))
+
+    raw = {m: np.asarray(v, dtype=np.float32) for m, v in vals.items()}
+    avg = {m: (float(v.mean()) if v.size else float("nan")) for m, v in raw.items()}
+    return raw, avg
+
+def invert_episode_major(d_ep_motor):
+    if not d_ep_motor:
+        return {}
+    motors = next(iter(d_ep_motor.values())).keys()
+    out = {m: {} for m in motors}
+    for ep, m_dict in d_ep_motor.items():
+        for m, v in m_dict.items():
+            out[m][ep] = v
+    return out
+
+def save_dicts(raw_motor_major, avg_motor_major, outdir: Path):
+    outdir.mkdir(parents=True, exist_ok=True)
+    raw_json = {m: {str(ep): arr.tolist() for ep, arr in eps.items()} for m, eps in raw_motor_major.items()}
+    avg_json = {m: {str(ep): val for ep, val in eps.items()} for m, eps in avg_motor_major.items()}
+    (outdir / "first10s_raw.json").write_text(json.dumps(raw_json))
+    (outdir / "first10s_avg.json").write_text(json.dumps(avg_json))
+
+def plot_episode_means(avg_motor_major, outpath: Path):
+    motors = list(MOTOR_IDXS.keys())
+    n = len(motors)
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(12, 2.2 * n))
+    for i, m in enumerate(motors, 1):
+        plt.subplot(n, 1, i)
+        items = sorted(((int(ep), v) for ep, v in avg_motor_major.get(m, {}).items()), key=lambda x: x[0])
+        if not items:
+            plt.title(f"{m} (no data)"); continue
+        xs = [ep for ep, _ in items]
+        ys = [v for _, v in items]
+        plt.scatter(xs, ys, s=14)
+        plt.xlabel("episode_id"); plt.ylabel("mean @ first 10s"); plt.title(m)
+        plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.savefig(outpath, dpi=180)
+    plt.close()
+
+def main():
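+    # CLI entry point: for every episode, average each motor over the first
+    # `--seconds` of frames (episodes processed in parallel threads), then
+    # write JSON stats and a per-motor scatter plot of episode means.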
+ ap = argparse.ArgumentParser() + ap.add_argument("--repo-id", required=True, type=str) + ap.add_argument("--root", type=Path, default=None) + ap.add_argument("--seconds", type=float, default=10.0) + ap.add_argument("--use-state", action="store_true", help="Use observation.state (default)") + ap.add_argument("--use-action", action="store_true", help="Use action instead of state") + ap.add_argument("--outdir", type=Path, default=Path("initpos_stats")) + ap.add_argument("--dl-workers", type=int, default=2, help="DataLoader workers PER EPISODE") + ap.add_argument("--max-threads", type=int, default=max(1, os.cpu_count() // 2), + help="Max concurrent episodes") + args = ap.parse_args() + + use_state = True + if args.use_action: use_state = False + if args.use_state: use_state = True + + ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) + n_eps = len(ds.episode_data_index["from"]) + + # Parallel over episodes + all_raw_ep_major, all_avg_ep_major = {}, {} + with ThreadPoolExecutor(max_workers=args.max_threads) as ex: + futures = { + ex.submit(collect_first_10s_episode, ds, ep, + seconds=args.seconds, use_state=use_state, + dl_workers=args.dl_workers): ep + for ep in range(n_eps) + } + for fut in as_completed(futures): + ep = futures[fut] + try: + raw, avg = fut.result() + except Exception as e: + # Don't crash the run; record NaNs for this episode + raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} + avg = {m: float("nan") for m in MOTOR_IDXS} + print(f"[warn] episode {ep} failed: {e}") + all_raw_ep_major[ep] = raw + all_avg_ep_major[ep] = avg + + # Convert to motor-major for saving/plotting + raw_motor_major = invert_episode_major(all_raw_ep_major) # motor -> {ep: np.array} + avg_motor_major = invert_episode_major(all_avg_ep_major) # motor -> {ep: float} + + args.outdir.mkdir(parents=True, exist_ok=True) + save_dicts(raw_motor_major, avg_motor_major, args.outdir) + plot_episode_means(avg_motor_major, args.outdir / "episode_means.png") + + fps = float(ds.meta.fps) + print(f"[done] fps={fps:.3f} | episodes={n_eps} | saved:") + print(f" - {args.outdir/'first10s_raw.json'}") + print(f" - {args.outdir/'first10s_avg.json'}") + print(f" - {args.outdir/'episode_means.png'}") + +if __name__ == "__main__": + main() From a6d8bf84cdaa1c4ad32c02ab37ae3c43689f3f66 Mon Sep 17 00:00:00 2001 From: Sahana Date: Sun, 14 Sep 2025 15:39:21 +0100 Subject: [PATCH 02/10] add rag_health --- src/lerobot/scripts/rag_robot_health.py | 587 ++++++++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 src/lerobot/scripts/rag_robot_health.py diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py new file mode 100644 index 0000000000..829eb5584e --- /dev/null +++ b/src/lerobot/scripts/rag_robot_health.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python3 +""" +RAG for Robot Dataset Health — Episode Motor Averages (conversational) + +What this script does +--------------------- +1) Loads per-episode 10s motor averages (motor-major JSON: motor -> {episode_id: value|list|str}) +2) Computes per-motor μ/σ, z-scores per episode, outlier flags, health_score +3) Builds episode "doc" text blobs for Cohere Embed + Rerank +4) Indexes embeddings in FAISS +5) Optional conversational summaries via Cohere Command-R +6) CLI: + - summary : build index (if needed) + print health summary + - query --q "..." 
: semantic search (Embed->ANN->Rerank)
+  - similar --ep N            : episodes similar to ep_N
+  - explain --ep N [--motor]  : WHY outlier + side-by-side PNG
+
+Inputs
+------
+--avg-json    : path to averages JSON (motor -> {ep: float|list|str})
+--frames-json : (optional) ep_id -> [first-frame paths or URLs]
+--repo-dir    : directory to store artifacts (docs.jsonl, stats.json, faiss.index, embeddings.npy)
+
+Install
+-------
+pip install cohere faiss-cpu numpy pillow tqdm
+
+Env
+---
+export COHERE_API_KEY=YOUR_KEY
+
+# 1) Build + conversational summary
+python rag_robot_health.py \
+  --avg-json initpos_stats/Sahana16/record-medicine/first10s_avg.json \
+  --repo-dir Sahana16/record-medicine \
+  --chat --style direct summary
+
+# 2) Conversational query
+python rag_robot_health.py \
+  --avg-json initpos_stats/Sahana16/record-medicine/first10s_avg.json \
+  --repo-dir record-medicine \
+  --chat --style friendly \
+  query --q "outliers on elbow_flex"
+
+"""
+
+from __future__ import annotations
+import os, json, argparse, math
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+from dataclasses import dataclass, asdict
+
+import numpy as np
+from tqdm import tqdm
+from PIL import Image, ImageDraw
+
+import faiss
+import cohere
+
+import os
+os.environ["COHERE_API_KEY"] = "<REDACTED>"
+
+# ---------------------------- Config ----------------------------
+MOTORS = [
+    "shoulder_pan",
+    "shoulder_lift",
+    "elbow_flex",
+    "wrist_flex",
+    "wrist_roll",
+    "gripper_pos",
+]
+
+OUTLIER_Z = 3.0
+CANDIDATE_Z = 2.0
+RERANK_TOP_K = 12
+ANN_CAND_K = 64
+EMBED_MODEL = "embed-english-v3.0"
+RERANK_MODEL = "rerank-english-v3.0"
+CHAT_MODEL = "command-r"
+
+# ---------------------------- Data types ----------------------------
+@dataclass
+class EpisodeDoc:
+    ep_id: str
+    split: str | None
+    task: str | None
+    motor_avgs: Dict[str, float]
+    motor_mu: Dict[str, float]
+    motor_sigma: Dict[str, float]
+    motor_z: Dict[str, float]
+    outlier_flags: Dict[str, bool]
+    health_score: float
+    first_frames: List[str] | None
+
+    def to_text_blob(self) -> str:
+        def fmt(v):
+            if v is None or (isinstance(v, float) and (math.isnan(v) or math.isinf(v))):
+                return "nan"
+            return f"{v:.6f}"
+        parts = []
+        parts.append(f"ep_id: {self.ep_id}")
+        if self.task: parts.append(f"task: {self.task}")
+        if self.split: parts.append(f"split: {self.split}")
+        parts.append("motor_avgs: {" + ", ".join(f"{m}:{fmt(self.motor_avgs.get(m, float('nan')))}" for m in MOTORS) + "}")
+        parts.append("motor_z: {" + ", ".join(f"{m}:{fmt(self.motor_z.get(m, float('nan')))}" for m in MOTORS) + "}")
+        parts.append("outliers: {" + ", ".join(f"{m}:{'true' if self.outlier_flags.get(m, False) else 'false'}" for m in MOTORS) + "}")
+        parts.append(f"health_score: {fmt(self.health_score)}")
+        return "; ".join(parts)
+
+# ---------------------------- IO helpers ----------------------------
+def _looks_num_str(s: str) -> bool:
+    s = s.strip()
+    if not s: return False
+    if s[0] in "+-": s = s[1:]
+    return s.replace(".", "", 1).isdigit()
+
+def _coerce_scalar(val) -> float:
+    if isinstance(val, (int, float)):
+        return float(val)
+    if isinstance(val, str):
+        return float(val.strip()) if _looks_num_str(val) else float("nan")
+    if isinstance(val, (list, tuple)):
+        nums = []
+        for x in val:
+            if isinstance(x, (int, float)): nums.append(float(x))
+            elif isinstance(x, str) and _looks_num_str(x): nums.append(float(x))
+        return float(np.mean(nums)) if nums else float("nan")
+    return float("nan")
+
+def load_avg_json(path: Path) -> Dict[str, 
Dict[str, float]]: + """ + Accepts motor-major JSON with floats/strings/lists. + Returns: motor -> {episode_id: float_mean} + """ + raw = json.loads(Path(path).read_text()) + filtered: Dict[str, Dict[str, float]] = {} + bad = [] + for m in MOTORS: + motor_map = {} + for ep, val in (raw.get(m, {}) or {}).items(): + v = _coerce_scalar(val) + if math.isnan(v): + bad.append((m, ep, val)) + motor_map[str(ep)] = v + filtered[m] = motor_map + if bad: + print(f"[warn] {len(bad)} invalid entries; set to NaN. First few:") + for b in bad[:5]: + print(" motor:", b[0], "episode:", b[1], "value:", str(b[2])[:120]) + return filtered + +def load_frames_json(path: Path | None) -> Dict[str, List[str]]: + if not path: return {} + return {str(k): v for k, v in json.loads(Path(path).read_text()).items()} + +def ensure_dir(p: Path): + p.mkdir(parents=True, exist_ok=True) + +# ---------------------------- Stats & Docs ---------------------------- +def compute_mu_sigma(avg_motor_major: Dict[str, Dict[str, float]]) -> Tuple[Dict[str, float], Dict[str, float]]: + mu, sigma = {}, {} + for m in MOTORS: + vals = np.array(list(avg_motor_major.get(m, {}).values()), dtype=np.float64) + if vals.size == 0: + mu[m], sigma[m] = float("nan"), float("nan") + else: + mu[m] = float(np.nanmean(vals)) + sigma[m] = float(np.nanstd(vals, ddof=1) if vals.size > 1 else 0.0) + return mu, sigma + +def build_episode_table(avg_motor_major: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]: + episodes = set() + for m in MOTORS: + episodes.update(avg_motor_major.get(m, {}).keys()) + table = {ep: {} for ep in episodes} + for m in MOTORS: + for ep, val in avg_motor_major.get(m, {}).items(): + table[ep][m] = val # already coerced + return table + +def zscore(value: float, mu: float, sigma: float) -> float: + if sigma is None or sigma == 0 or math.isnan(sigma): + return float("nan") + return (value - mu) / sigma + +def health_score_from_z(zs: Dict[str, float]) -> float: + abs_vals = [abs(v) for v in zs.values() if not math.isnan(v)] + if not abs_vals: return float("nan") + return float(1.0 / (1.0 + (sum(abs_vals) / max(1, len(abs_vals))))) + +def build_docs( + avg_motor_major: Dict[str, Dict[str, float]], + frames_map: Dict[str, List[str]] | None = None, + global_task: str | None = None, + global_split: str | None = None +) -> Tuple[List[EpisodeDoc], Dict[str, float], Dict[str, float]]: + mu, sigma = compute_mu_sigma(avg_motor_major) + ep_table = build_episode_table(avg_motor_major) + docs: List[EpisodeDoc] = [] + for ep_id, motor_avgs in ep_table.items(): + motor_z = {m: zscore(motor_avgs.get(m, float("nan")), mu.get(m, float("nan")), sigma.get(m, float("nan"))) for m in MOTORS} + flags = {m: (abs(motor_z.get(m, float("nan"))) > OUTLIER_Z) if not math.isnan(motor_z.get(m, float("nan"))) else False for m in MOTORS} + health = health_score_from_z(motor_z) + frames = (frames_map or {}).get(str(ep_id)) + docs.append(EpisodeDoc( + ep_id=str(ep_id), + split=global_split, + task=global_task, + motor_avgs={m: motor_avgs.get(m, float("nan")) for m in MOTORS}, + motor_mu=mu, + motor_sigma=sigma, + motor_z=motor_z, + outlier_flags=flags, + health_score=health, + first_frames=frames + )) + return docs, mu, sigma + +# ---------------------------- Cohere & FAISS ---------------------------- +def get_cohere() -> cohere.Client: + key = os.environ.get("COHERE_API_KEY") + if not key: + raise RuntimeError("Set COHERE_API_KEY environment variable.") + return cohere.Client(key) + +def embed_texts_cohere(texts: List[str], input_type: str) -> np.ndarray: + 
co = get_cohere()
+    out = co.embed(texts=texts, model=EMBED_MODEL, input_type=input_type)
+    embs = np.array(out.embeddings, dtype=np.float32)
+    norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
+    return embs / norms
+
+def build_faiss(embs: np.ndarray) -> faiss.IndexFlatIP:
+    d = embs.shape[1]
+    index = faiss.IndexFlatIP(d)  # cosine via dot on normalized vectors
+    index.add(embs)
+    return index
+
+def ann_search(index: faiss.IndexFlatIP, q_vec: np.ndarray, k: int = ANN_CAND_K) -> Tuple[np.ndarray, np.ndarray]:
+    D, I = index.search(q_vec[None, :], k)
+    return I[0], D[0]
+
+def rerank(co: cohere.Client, query: str, doc_texts: List[str], k: int = RERANK_TOP_K) -> List[int]:
+    rr = co.rerank(model=RERANK_MODEL, query=query, documents=doc_texts, top_n=min(k, len(doc_texts)))
+    return [r.index for r in rr.results]
+
+# ---------------------------- Conversational layer ----------------------------
+def chatify(prompt: str, user_context: str = "", style: str = "direct") -> str:
+    """
+    Use Cohere Command-R to turn structured findings into a concise, conversational blurb.
+    """
+    co = get_cohere()
+    preamble = (
+        "You are an expert robotics dataset analyst. "
+        "Be direct, clear, and actionable in 4-8 sentences. "
+        "If listing steps, keep them short and specific."
+    )
+    full_prompt = (
+        f"{prompt}\n\n"
+        f"Tone: {style}.\n"
+        f"Context:\n{user_context}\n"
+        f"Respond as one concise paragraph or a few short bullets."
+    )
+    try:
+        resp = co.chat(model=CHAT_MODEL, message=f"{preamble}\n\n{full_prompt}")  # persona + prompt in one message
+        return (resp.text or "").strip()
+    except Exception as e:
+        return f"(chat unavailable: {e})"
+
+# ---------------------------- Reports ----------------------------
+def dataset_health_summary(docs: List[EpisodeDoc]) -> Dict[str, Any]:
+    motor_outliers = {m: 0 for m in MOTORS}
+    for d in docs:
+        for m in MOTORS:
+            if d.outlier_flags.get(m, False):
+                motor_outliers[m] += 1
+    ranked = [d for d in docs if not math.isnan(d.health_score)]
+    ranked.sort(key=lambda x: x.health_score)  # low is worse
+    worst = ranked[:min(5, len(ranked))]
+    best = list(reversed(ranked))[:min(5, len(ranked))]
+    return {
+        "motor_outlier_counts": motor_outliers,
+        "worst_eps": [(d.ep_id, round(d.health_score, 4)) for d in worst],
+        "best_eps": [(d.ep_id, round(d.health_score, 4)) for d in best],
+    }
+
+def print_summary(summary: Dict[str, Any]):
+    print("\n== Dataset Health Summary ==")
+    print(f"Outlier counts per motor (|z| > {OUTLIER_Z:.1f}):")
+    for m in MOTORS:
+        print(f"  - {m:14s}: {summary['motor_outlier_counts'][m]}")
+    print("\nWorst episodes by health_score:")
+    for ep, sc in summary["worst_eps"]:
+        print(f"  - ep {ep:>6s}: {sc:.4f}")
+    print("\nBest episodes by health_score:")
+    for ep, sc in summary["best_eps"]:
+        print(f"  - ep {ep:>6s}: {sc:.4f}")
+    print()
+
+# ---------------------------- Visual Explain ----------------------------
+def top_outlier_motors(doc, top_k=3):
+    items = []
+    for m in MOTORS:
+        z = doc["motor_z"].get(m, float("nan"))
+        if math.isnan(z): continue
+        items.append((m, z,
+                      doc["motor_mu"].get(m, float("nan")),
+                      doc["motor_sigma"].get(m, float("nan")),
+                      doc["motor_avgs"].get(m, float("nan"))))
+    items.sort(key=lambda x: abs(x[1]), reverse=True)
+    return items[:top_k]
+
+def find_healthy_exemplars(docs_json, embs, index, target_idx, focus_motor=None, k=3):
+    I, _ = ann_search(index, embs[target_idx], k=min(256, len(docs_json)))
+    candidates = [i for i in I if i != target_idx]
+    def ok(d):
+        if focus_motor:
+            z = d["motor_z"].get(focus_motor)
+            return (z is not None) and (not math.isnan(z)) and (abs(z) 
< 1.0) + zs = [abs(v) for v in d["motor_z"].values() if not math.isnan(v)] + return zs and (sum(zs)/len(zs) < 1.0) + healthy = [i for i in candidates if ok(docs_json[i])] + return healthy[:k] if healthy else candidates[:k] + +def load_first_images(paths, max_per_ep=2, target_size=(512, 384)): + imgs = [] + for p in (paths or [])[:max_per_ep]: + try: + im = Image.open(p).convert("RGB") + im.thumbnail(target_size) + imgs.append(im) + except Exception: + continue + return imgs + +def make_side_by_side(target_imgs, exemplar_imgs_list, caption, out_png_path): + pad = 16 + col_w = max([img.width for img in (target_imgs + sum(exemplar_imgs_list, []))] + [320]) + row_h = sum(img.height for img in target_imgs) + (len(target_imgs)-1)*pad if target_imgs else 240 + rows = max(1, max((len(x) for x in exemplar_imgs_list), default=1)) + right_cols = len(exemplar_imgs_list) + width = pad + col_w + pad + right_cols*(col_w + pad) + pad + height = pad + max(row_h, rows*((target_imgs[0].height if target_imgs else 240) + pad)) + 120 + canvas = Image.new("RGB", (width, height), (255,255,255)) + draw = ImageDraw.Draw(canvas) + draw.text((pad, pad), caption, fill=(0,0,0)) + # target (left) + y = pad + 32; x = pad + for img in target_imgs: + canvas.paste(img, (x, y)); y += img.height + pad + # exemplars (right) + base_x = pad + col_w + pad + for col, imgs in enumerate(exemplar_imgs_list): + y = pad + 32; x = base_x + col*(col_w + pad) + for img in imgs: + canvas.paste(img, (x, y)); y += img.height + pad + canvas.save(out_png_path) + return out_png_path + +# ---------------------------- CLI actions ---------------------------- +def ensure_repo(docs: List[EpisodeDoc], repo_dir: Path): + ensure_dir(repo_dir) + docs_path = repo_dir / "docs.jsonl" + stats_path = repo_dir / "stats.json" + embs_path = repo_dir / "embeddings.npy" + index_path = repo_dir / "faiss.index" + + with docs_path.open("w") as f: + for d in docs: + rec = asdict(d) + rec["text_blob"] = d.to_text_blob() + f.write(json.dumps(rec) + "\n") + + texts = [d.to_text_blob() for d in docs] + embs = embed_texts_cohere(texts, input_type="search_document") + np.save(embs_path, embs) + + index = build_faiss(embs) + faiss.write_index(index, str(index_path)) + + stats = { + "n_docs": len(docs), + "motors": MOTORS, + "embed_model": EMBED_MODEL, + "rerank_model": RERANK_MODEL, + "outlier_z": OUTLIER_Z, + "candidate_z": CANDIDATE_Z, + } + stats_path.write_text(json.dumps(stats, indent=2)) + + print(f"[repo] wrote {docs_path}") + print(f"[repo] wrote {embs_path}") + print(f"[repo] wrote {index_path}") + print(f"[repo] wrote {stats_path}") + +def load_repo(repo_dir: Path) -> Tuple[List[Dict[str, Any]], np.ndarray, faiss.IndexFlatIP]: + docs_path = repo_dir / "docs.jsonl" + embs_path = repo_dir / "embeddings.npy" + index_path = repo_dir / "faiss.index" + docs_json = [json.loads(l) for l in docs_path.read_text().splitlines() if l.strip()] + embs = np.load(embs_path) + index = faiss.read_index(str(index_path)) + return docs_json, embs, index + +def cmd_summary(args): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + summary = dataset_health_summary(docs) + print_summary(summary) + + if args.chat: + counts = "\n".join([f"{m}: {summary['motor_outlier_counts'][m]}" for m in MOTORS]) + worst = ", ".join([f"ep {e} (score {s})" for e, s in summary["worst_eps"]]) + best = ", 
".join([f"ep {e} (score {s})" for e, s in summary["best_eps"]]) + ctx = ( + f"Outlier counts:\n{counts}\n\n" + f"Worst episodes: {worst}\nBest episodes: {best}\n" + f"Outlier threshold: |z| > {OUTLIER_Z}" + ) + prompt = ("Summarize dataset health for robotics training. " + "Explain implications and give 2–3 concrete next steps.") + print(chatify(prompt, ctx, style=args.style)) + +def cmd_query(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + texts = [d["text_blob"] for d in docs_json] + + co = get_cohere() + q_emb = embed_texts_cohere([args.q], input_type="search_query")[0] + idxs, _ = ann_search(index, q_emb, k=min(ANN_CAND_K, len(texts))) + cands = [texts[i] for i in idxs] + order = rerank(co, args.q, cands, k=min(RERANK_TOP_K, len(cands))) + final_ids = [idxs[i] for i in order] + + print("\n== Query Results ==") + for rank, i in enumerate(final_ids, 1): + d = docs_json[i] + print(f"{rank:2d}. ep {d['ep_id']} | health={d['health_score']:.4f} | outliers={[m for m,f in d['outlier_flags'].items() if f]}") + if d.get("first_frames"): + print(f" frames: {', '.join(d['first_frames'][:3])}") + print() + + if args.chat: + top_blobs = "\n\n".join([docs_json[i]["text_blob"] for i in final_ids[:5]]) + prompt = (f"User question: {args.q}\n" + "Answer conversationally for a robotics dataset engineer. " + "Reference episode IDs and motors likely at fault. Suggest next checks.") + print(chatify(prompt, top_blobs, style=args.style)) + +def cmd_similar(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + texts = [d["text_blob"] for d in docs_json] + ep_idx_map = {d["ep_id"]: i for i, d in enumerate(docs_json)} + if str(args.ep) not in ep_idx_map: + raise SystemExit(f"ep {args.ep} not found in repo docs") + + i = ep_idx_map[str(args.ep)] + co = get_cohere() + + q_vec = embs[i] + idxs, _ = ann_search(index, q_vec, k=min(ANN_CAND_K, len(texts))) + idxs = [j for j in idxs if j != i] + cands_text = [texts[j] for j in idxs] + order = rerank(co, texts[i], cands_text, k=min(RERANK_TOP_K, len(cands_text))) + final_ids = [idxs[j] for j in order] + + print(f"\n== Episodes similar to ep {args.ep} ==") + for rank, j in enumerate(final_ids, 1): + d = docs_json[j] + print(f"{rank:2d}. ep {d['ep_id']} | health={d['health_score']:.4f} | outliers={[m for m,f in d['outlier_flags'].items() if f]}") + if d.get("first_frames"): + print(f" frames: {', '.join(d['first_frames'][:3])}") + print() + + if args.chat: + top_blobs = "\n\n".join([docs_json[j]["text_blob"] for j in final_ids[:5]]) + anchor = docs_json[i]["text_blob"] + prompt = (f"Compare similar episodes to anchor episode {args.ep}. 
" + "Call out motor dimensions that differ and propose a quick validation checklist.") + ctx = f"ANCHOR:\n{anchor}\n\nSIMILARS:\n{top_blobs}" + print(chatify(prompt, ctx, style=args.style)) + +def cmd_explain(args): + if not (args.repo_dir / "docs.jsonl").exists(): + avg_mm = load_avg_json(args.avg_json) + frames_map = load_frames_json(args.frames_json) if args.frames_json else {} + docs, mu, sigma = build_docs(avg_mm, frames_map, global_task=args.task, global_split=args.split) + ensure_repo(docs, args.repo_dir) + + docs_json, embs, index = load_repo(args.repo_dir) + ep_idx_map = {d["ep_id"]: i for i, d in enumerate(docs_json)} + if str(args.ep) not in ep_idx_map: + raise SystemExit(f"ep {args.ep} not found in repo docs") + + i = ep_idx_map[str(args.ep)] + d = docs_json[i] + + offenders = top_outlier_motors(d, top_k=3) + focus_motor = args.motor if args.motor else (offenders[0][0] if offenders else None) + ex_ids = find_healthy_exemplars(docs_json, embs, index, i, focus_motor=focus_motor, k=args.k) + + target_imgs = load_first_images(d.get("first_frames"), max_per_ep=2) + exemplar_imgs_list = [load_first_images(docs_json[j].get("first_frames"), max_per_ep=2) for j in ex_ids] + + out_png = args.repo_dir / f"explain_ep{args.ep}.png" + cap = f"Episode {args.ep}: outlier vs healthy exemplars (focus motor: {focus_motor})" + make_side_by_side(target_imgs, exemplar_imgs_list, cap, out_png) + + print(f"\n== Explanation for episode {args.ep} ==") + if offenders: + for (m, z, mu, sigma, val) in offenders: + side = "high" if z > 0 else "low" + label = " -> OUTLIER" if abs(z) > OUTLIER_Z else (" -> CANDIDATE" if abs(z) > CANDIDATE_Z else "") + print(f"- {m}: z={z:.2f} ({side}); value={val:.4f}, mean={mu:.4f}, std={sigma:.4f}{label}") + else: + print(f"- No strong offenders; health_score={d['health_score']:.4f}") + + print("\nHealthy exemplars shown:") + for j in ex_ids: + dj = docs_json[j] + max_abs_z = max((abs(v) for v in dj['motor_z'].values() if not math.isnan(v)), default=float("nan")) + print(f"- ep {dj['ep_id']} | health={dj['health_score']:.4f} | max|z|={max_abs_z:.2f}") + + print(f"\nSaved side-by-side: {out_png}\n") + + if args.chat: + lines = [] + for (m, z, mu, sigma, val) in offenders: + side = "high" if z > 0 else "low" + lines.append(f"{m}: z={z:.2f} ({side}), value={val:.4f}, mean={mu:.4f}, std={sigma:.4f}") + ex_list = ", ".join([docs_json[j]["ep_id"] for j in ex_ids]) if ex_ids else "none" + ctx = ( + f"Episode {args.ep} offenders:\n" + "\n".join(lines) + + f"\nExemplars: {ex_list}\nOutlier threshold: |z| > {OUTLIER_Z}\nImage: {out_png}" + ) + prompt = ("Explain in plain language why this episode is an outlier and " + "what the images likely show. 
Give 2 next steps to confirm/fix.") + print(chatify(prompt, ctx, style=args.style)) + +# ---------------------------- Main ---------------------------- +def parse_args(): + p = argparse.ArgumentParser(description="RAG on motor averages: health + retrieval (conversational)") + p.add_argument("--avg-json", type=Path, required=True, help="Path to motor-major averages JSON") + p.add_argument("--frames-json", type=Path, default=None, help="Optional: ep_id -> [first-frame paths]") + p.add_argument("--repo-dir", type=Path, required=True, help="Directory to store docs/index/embeddings") + p.add_argument("--task", type=str, default=None, help="Optional task label") + p.add_argument("--split", type=str, default=None, help="Optional split label (train/eval)") + p.add_argument("--chat", action="store_true", help="Generate a conversational summary/answer via Cohere") + p.add_argument("--style", type=str, default="direct", help="Tone: direct | friendly | technical | executive") + sub = p.add_subparsers(dest="cmd", required=True) + + s1 = sub.add_parser("summary", help="Build index (if needed) and print dataset health summary") + s1.set_defaults(func=cmd_summary) + + s2 = sub.add_parser("query", help="Text search over episode docs (Embed->ANN->Rerank)") + s2.add_argument("--q", type=str, required=True) + s2.set_defaults(func=cmd_query) + + s3 = sub.add_parser("similar", help="Episodes similar to a given episode") + s3.add_argument("--ep", type=str, required=True) + s3.set_defaults(func=cmd_similar) + + s4 = sub.add_parser("explain", help="Explain why an episode is an outlier; render images vs healthy exemplars") + s4.add_argument("--ep", type=str, required=True) + s4.add_argument("--motor", type=str, default=None, choices=MOTORS + [None]) + s4.add_argument("--k", type=int, default=3, help="Number of healthy exemplars to show") + s4.set_defaults(func=cmd_explain) + + return p.parse_args() + +if __name__ == "__main__": + args = parse_args() + args.func(args) From ea91b98f8c75484414771f1bd32d0d4a93f63c7f Mon Sep 17 00:00:00 2001 From: Sahana Date: Mon, 6 Oct 2025 22:17:23 +0100 Subject: [PATCH 03/10] add --- src/lerobot/scripts/collect_initpos.py | 108 ++++++++++++++++++++----- 1 file changed, 87 insertions(+), 21 deletions(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index 884ba52b42..73cc2645d6 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -24,10 +24,51 @@ } # ---------------------------------- +def chw_to_hwc_uint8(img_t: torch.Tensor) -> np.ndarray: + """ + Convert CxHxW (float32 in [0,1] or uint8) -> HxWxC uint8. + """ + if isinstance(img_t, torch.Tensor): + t = img_t.detach().cpu() + else: + raise TypeError("Expected torch.Tensor for image") + if t.dtype == torch.float32: + t = (t.clamp(0, 1) * 255.0).to(torch.uint8) + elif t.dtype != torch.uint8: + t = t.to(torch.uint8) + assert t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2], f"Expected CxHxW, got {tuple(t.shape)}" + return t.permute(1, 2, 0).numpy() + +def save_first_frames_from_batch(batch, dataset: LeRobotDataset, ep: int, frames_dir: Path) -> list[str]: + """ + Save first-frame images for all available cameras in this batch (assumes batch_size=1). + Returns list of saved file paths. 
+ """ + saved = [] + frames_dir.mkdir(parents=True, exist_ok=True) + cam_keys = getattr(dataset.meta, "camera_keys", []) + for cam in cam_keys: + if cam in batch: + img = batch[cam][0] # CxHxW + try: + arr = chw_to_hwc_uint8(img) + out_path = frames_dir / f"episode_{ep}_{cam}.png" + # Use PIL to write (matplotlib is slower; cv2 adds dep). PIL is bundled via matplotlib. + from PIL import Image + Image.fromarray(arr).save(out_path) + saved.append(str(out_path)) + except Exception as e: + print(f"[warn] failed saving first frame for ep {ep} cam {cam}: {e}") + return saved + def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, - seconds=10.0, use_state=True, dl_workers: int = 2): - """Return (raw, avg) where: - raw[motor] -> np.array(T,), avg[motor] -> float + seconds=10.0, use_state=True, dl_workers: int = 2, + frames_dir: Path | None = None) -> tuple[dict, dict, list[str]]: + """ + Return (raw, avg, first_frames) + raw[motor] -> np.array(T,) + avg[motor] -> float + first_frames -> list of saved file paths for t=0 (one per camera if available) """ fps = float(dataset.meta.fps) n_frames = int(seconds * fps) @@ -40,10 +81,16 @@ def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, num_workers=dl_workers, pin_memory=False) - # incremental buffers per motor to avoid big reallocs vals = {m: [] for m in MOTOR_IDXS} + first_frames_paths: list[str] = [] + grabbed_first = False for i, batch in enumerate(loader): + # Save first frames exactly at the first yielded sample + if not grabbed_first and frames_dir is not None: + first_frames_paths = save_first_frames_from_batch(batch, dataset, episode_index, frames_dir) + grabbed_first = True + if i >= n_frames: break vec = batch["observation.state"][0] if use_state else batch["action"][0] @@ -54,7 +101,7 @@ def collect_first_10s_episode(dataset: LeRobotDataset, episode_index: int, raw = {m: np.asarray(v, dtype=np.float32) for m, v in vals.items()} avg = {m: (float(v.mean()) if v.size else float("nan")) for m, v in raw.items()} - return raw, avg + return raw, avg, first_frames_paths def invert_episode_major(d_ep_motor): if not d_ep_motor: @@ -76,7 +123,6 @@ def save_dicts(raw_motor_major, avg_motor_major, outdir: Path): def plot_episode_means(avg_motor_major, outpath: Path): motors = list(MOTOR_IDXS.keys()) n = len(motors) - import matplotlib.pyplot as plt plt.figure(figsize=(12, 2.2 * n)) for i, m in enumerate(motors, 1): plt.subplot(n, 1, i) @@ -103,35 +149,48 @@ def main(): ap.add_argument("--dl-workers", type=int, default=2, help="DataLoader workers PER EPISODE") ap.add_argument("--max-threads", type=int, default=max(1, os.cpu_count() // 2), help="Max concurrent episodes") + ap.add_argument("--save-first-frames", action="store_true", + help="Save t=0 frames per episode per camera into outdir/first_frames and write first_frames.json") args = ap.parse_args() use_state = True if args.use_action: use_state = False if args.use_state: use_state = True - ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) + ds = LeRobotDataset(args.repo-id if hasattr(args, "repo-id") else args.repo_id, root=args.root, tolerance_s=1e-4) n_eps = len(ds.episode_data_index["from"]) + frames_dir = (args.outdir / "first_frames") if args.save_first_frames else None + if frames_dir is not None: + frames_dir.mkdir(parents=True, exist_ok=True) + # Parallel over episodes all_raw_ep_major, all_avg_ep_major = {}, {} + first_frames_map: dict[str, list[str]] = {} + + def work(ep: int): + try: + raw, avg, frames = 
collect_first_10s_episode( + ds, ep, + seconds=args.seconds, use_state=use_state, + dl_workers=args.dl_workers, + frames_dir=frames_dir + ) + return ep, raw, avg, frames + except Exception as e: + print(f"[warn] episode {ep} failed: {e}") + raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} + avg = {m: float("nan") for m in MOTOR_IDXS} + return ep, raw, avg, [] + with ThreadPoolExecutor(max_workers=args.max_threads) as ex: - futures = { - ex.submit(collect_first_10s_episode, ds, ep, - seconds=args.seconds, use_state=use_state, - dl_workers=args.dl_workers): ep - for ep in range(n_eps) - } + futures = [ex.submit(work, ep) for ep in range(n_eps)] for fut in as_completed(futures): - ep = futures[fut] - try: - raw, avg = fut.result() - except Exception as e: - # Don't crash the run; record NaNs for this episode - raw = {m: np.array([], dtype=np.float32) for m in MOTOR_IDXS} - avg = {m: float("nan") for m in MOTOR_IDXS} - print(f"[warn] episode {ep} failed: {e}") + ep, raw, avg, frames = fut.result() all_raw_ep_major[ep] = raw all_avg_ep_major[ep] = avg + if args.save_first_frames: + first_frames_map[str(ep)] = frames # Convert to motor-major for saving/plotting raw_motor_major = invert_episode_major(all_raw_ep_major) # motor -> {ep: np.array} @@ -141,11 +200,18 @@ def main(): save_dicts(raw_motor_major, avg_motor_major, args.outdir) plot_episode_means(avg_motor_major, args.outdir / "episode_means.png") + # Save first_frames.json if requested + if args.save_first_frames: + (args.outdir / "first_frames.json").write_text(json.dumps(first_frames_map, indent=2)) + fps = float(ds.meta.fps) print(f"[done] fps={fps:.3f} | episodes={n_eps} | saved:") print(f" - {args.outdir/'first10s_raw.json'}") print(f" - {args.outdir/'first10s_avg.json'}") print(f" - {args.outdir/'episode_means.png'}") + if args.save_first_frames: + print(f" - {args.outdir/'first_frames.json'}") + print(f" - {frames_dir}/*") if __name__ == "__main__": main() From f1835bb90b92ad6c4ba513e96360dc0c87cab6d0 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:25 +0100 Subject: [PATCH 04/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 829eb5584e..409e4fd4cc 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -45,7 +45,10 @@ """ from __future__ import annotations -import os, json, argparse, math +import os +import json +import argparse +import math from pathlib import Path from typing import Dict, List, Tuple, Any from dataclasses import dataclass, asdict From 0affb9cb2ae0ada5c972ab0abc8aa830a2484a34 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:37 +0100 Subject: [PATCH 05/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 409e4fd4cc..44cc3eca89 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -60,7 +60,6 @@ import faiss import cohere -import os os.environ["COHERE_API_KEY"] = 
"6V7WaFPKcEnSC8iF6HTca8HHkhvIOJVuVg7Xzer7" # ---------------------------- Config ---------------------------- From de11127a1ea8bf2199eff44664c6f9e86634bb42 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:19:56 +0100 Subject: [PATCH 06/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index 44cc3eca89..a0586b56e2 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -60,7 +60,6 @@ import faiss import cohere -os.environ["COHERE_API_KEY"] = "6V7WaFPKcEnSC8iF6HTca8HHkhvIOJVuVg7Xzer7" # ---------------------------- Config ---------------------------- MOTORS = [ From ba2e37b20da1b6c5492eb6e6af805ec81a03b55c Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:20:41 +0100 Subject: [PATCH 07/10] Update src/lerobot/scripts/collect_initpos.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/collect_initpos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index 73cc2645d6..f477de6715 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -157,7 +157,7 @@ def main(): if args.use_action: use_state = False if args.use_state: use_state = True - ds = LeRobotDataset(args.repo-id if hasattr(args, "repo-id") else args.repo_id, root=args.root, tolerance_s=1e-4) + ds = LeRobotDataset(args.repo_id, root=args.root, tolerance_s=1e-4) n_eps = len(ds.episode_data_index["from"]) frames_dir = (args.outdir / "first_frames") if args.save_first_frames else None From 39a2f2687865d5f258dbd515a7b20fd74d5df481 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:26:55 +0100 Subject: [PATCH 08/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index a0586b56e2..b5e4896ebd 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -112,7 +112,7 @@ def fmt(v): def _looks_num_str(s: str) -> bool: s = s.strip() if not s: return False - if s[0] in "+-": s = s[1:] + if s[0] in ('+', '-'): s = s[1:] return s.replace(".", "", 1).isdigit() def _coerce_scalar(val) -> float: From ca4147765e49a2e73472685d99c92774b3f273cd Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:27:09 +0100 Subject: [PATCH 09/10] Update src/lerobot/scripts/rag_robot_health.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/rag_robot_health.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/rag_robot_health.py b/src/lerobot/scripts/rag_robot_health.py index b5e4896ebd..ac83525153 100644 --- a/src/lerobot/scripts/rag_robot_health.py +++ b/src/lerobot/scripts/rag_robot_health.py @@ -200,7 +200,9 @@ def build_docs( docs: List[EpisodeDoc] = [] for ep_id, motor_avgs in ep_table.items(): motor_z = {m: 
zscore(motor_avgs.get(m, float("nan")), mu.get(m, float("nan")), sigma.get(m, float("nan"))) for m in MOTORS} - flags = {m: (abs(motor_z.get(m, float("nan"))) > OUTLIER_Z) if not math.isnan(motor_z.get(m, float("nan"))) else False for m in MOTORS} + flags = {m: (abs(z) > OUTLIER_Z) if not math.isnan(z) else False + for m in MOTORS + for z in [motor_z.get(m, float("nan"))]} health = health_score_from_z(motor_z) frames = (frames_map or {}).get(str(ep_id)) docs.append(EpisodeDoc( From b210ed97fbc56a9a69734971d4eac66e3d8c5899 Mon Sep 17 00:00:00 2001 From: Sahana Venkatesh Date: Mon, 6 Oct 2025 22:27:16 +0100 Subject: [PATCH 10/10] Update src/lerobot/scripts/collect_initpos.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Sahana Venkatesh --- src/lerobot/scripts/collect_initpos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lerobot/scripts/collect_initpos.py b/src/lerobot/scripts/collect_initpos.py index f477de6715..9300f9642f 100644 --- a/src/lerobot/scripts/collect_initpos.py +++ b/src/lerobot/scripts/collect_initpos.py @@ -36,7 +36,8 @@ def chw_to_hwc_uint8(img_t: torch.Tensor) -> np.ndarray: t = (t.clamp(0, 1) * 255.0).to(torch.uint8) elif t.dtype != torch.uint8: t = t.to(torch.uint8) - assert t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2], f"Expected CxHxW, got {tuple(t.shape)}" + if not (t.ndim == 3 and t.shape[0] <= t.shape[1] and t.shape[0] <= t.shape[2]): + raise ValueError(f"Expected input tensor of shape CxHxW (C <= H and C <= W), got {tuple(t.shape)}") return t.permute(1, 2, 0).numpy() def save_first_frames_from_batch(batch, dataset: LeRobotDataset, ep: int, frames_dir: Path) -> list[str]:
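For a quick sanity check of the outlier math introduced above (zscore and health_score_from_z in rag_robot_health.py), here is a minimal standalone sketch with synthetic numbers — separate from the patches themselves, not project code:

    # One motor whose 10s episode mean sits well above the population mean.
    mu, sigma = 0.50, 0.10      # per-motor population stats (synthetic)
    value = 0.87                # this episode's mean for that motor
    z = (value - mu) / sigma    # 3.7 -> flagged, since |z| > OUTLIER_Z (3.0)

    # health_score = 1 / (1 + mean(|z|)) over motors with valid z-scores.
    zs = {"elbow_flex": z, "wrist_roll": 0.4}
    mean_abs = sum(abs(v) for v in zs.values()) / len(zs)  # 2.05
    health = 1.0 / (1.0 + mean_abs)                        # ~0.328; lower = less healthy
    print(f"z={z:.2f} health={health:.3f}")

An episode with all motors near the population mean approaches health_score 1.0, which is why cmd_summary ranks episodes ascending by health_score to surface the worst ones first.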