Skip to content

Commit 62fd0eb

Browse files
committed
Recover W2 novelty lane under the proven adaptive-clip stack
Port the W18 training and quantization defaults onto the older W2 pass-conditioned modulation lane so the next probe tests our own round-22 mechanism under a compliant, already-validated artifact path instead of re-running another near-1586 reproduction. The harness sync keeps local monitoring reliable while preserving the worker-facing launcher contract.

Constraint: The next lane must preserve W2's pass-conditioned modulation story while staying under the 16 MB cap and using the fixed local evaluator path.
Rejected: Re-run W19 with more seeds | single-seed result already underperformed W18 and still leaned on thin novelty
Rejected: More W18-family quantization tuning | stronger score story but too close to open PR openai#1586 to solve the submission problem
Confidence: medium
Scope-risk: moderate
Reversibility: clean
Directive: Treat this branch as a W2-on-W18 hybrid; if the score improves, review novelty framing against both openai#1518 and openai#1586 before escalating to 3 seeds
Tested: python3 -m py_compile train_gpt.py evaluate.py auto_resume_watch.py; python3 evaluate.py --list
Not-tested: Live GPU eval on this hybrid lane
Related: c0c2d68
Related: 7d435d2
1 parent a767e3d commit 62fd0eb

File tree

3 files changed

+33
-158
lines changed

3 files changed

+33
-158
lines changed

evaluate.py

Lines changed: 16 additions & 151 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,6 @@
2525

2626
WORKSPACE = os.path.expanduser("~/autoresearch/pgolf")
2727
os.makedirs(WORKSPACE, exist_ok=True)
28-
HEARTBEAT_DIR = os.path.join(WORKSPACE, "heartbeats")
29-
os.makedirs(HEARTBEAT_DIR, exist_ok=True)
3028

3129
DEFAULT_THRESHOLD = float(os.environ.get("AUTORESEARCH_THRESHOLD", "1.1164"))
3230
DEFAULT_TIMEOUT = 2700 # 45 min
@@ -62,7 +60,13 @@ def _load_env():
6260
# ---------------------------------------------------------------------------
6361

6462
def _run(cmd, check=False, timeout=30):
65-
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
63+
try:
64+
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
65+
except subprocess.TimeoutExpired as e:
66+
stdout = e.stdout if isinstance(e.stdout, str) else (e.stdout or b"").decode("utf-8", "replace")
67+
stderr = e.stderr if isinstance(e.stderr, str) else (e.stderr or b"").decode("utf-8", "replace")
68+
stderr = (stderr + f"\nTIMEOUT after {timeout}s").strip()
69+
r = subprocess.CompletedProcess(cmd, 124, stdout=stdout, stderr=stderr)
6670
if check and r.returncode != 0:
6771
raise RuntimeError(f"Command failed: {cmd}\n{r.stderr}")
6872
return r
@@ -123,16 +127,16 @@ def _make_job_command(commit_sha, branch=None):
123127
VOCAB=$(python3 << 'PYEOF'
124128
import re, sys
125129
f = open('train_gpt.py').read()
126-
m = re.search(r'VOCAB_SIZE.*?,\s*(\d+)', f)
130+
m = re.search(r'VOCAB_SIZE.*?,\\s*(\\d+)', f)
127131
if m: print(m.group(1)); sys.exit()
128132
try:
129133
import lzma, base64
130-
m2 = re.search(r"b85decode\([b]?['\"](.+?)['\"]\)", f, re.DOTALL)
134+
m2 = re.search(r"b85decode\\([b]?['\\\"](.+?)['\\\"]\\)", f, re.DOTALL)
131135
if m2:
132136
blob = m2.group(1)
133137
try: code = lzma.decompress(base64.b85decode(blob)).decode()
134138
except: code = lzma.decompress(base64.b85decode(blob), format=lzma.FORMAT_RAW, filters=[{"id": lzma.FILTER_LZMA2}]).decode()
135-
m3 = re.search(r'VOCAB_SIZE.*?,\s*(\d+)', code)
139+
m3 = re.search(r'VOCAB_SIZE.*?,\\s*(\\d+)', code)
136140
if m3: print(m3.group(1)); sys.exit()
137141
except Exception: pass
138142
print('1024')
@@ -150,7 +154,7 @@ def _make_job_command(commit_sha, branch=None):
150154
SCYLLA_DIR="./data/datasets/fineweb10B_scylla"
151155
if [ ! -f "$SCYLLA_DIR/.download_complete" ]; then
152156
echo "data_setup: downloading Scylla data from HuggingFace..."
153-
pip install -q huggingface_hub 2>/dev/null || true
157+
PIP_NO_CACHE_DIR=1 pip install -q --no-cache-dir huggingface_hub 2>/dev/null || true
154158
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('anthonym21/fineweb10B-scylla', local_dir='$SCYLLA_DIR', repo_type='dataset')"
155159
touch "$SCYLLA_DIR/.download_complete"
156160
echo "data_setup: Scylla download complete"
@@ -173,21 +177,24 @@ def _make_job_command(commit_sha, branch=None):
173177
"""
174178

175179
clone_setup = f"""
180+
rm -rf /workspace/pgolf /root/.cache/pip ~/.cache/pip /tmp/pip-cache
181+
mkdir -p /workspace
176182
if [ -n "$PGOLF_GIT_TOKEN" ]; then
177183
CLONE_URL="https://x-access-token:${{PGOLF_GIT_TOKEN}}@github.com/{owner}/{repo}.git"
178184
else
179185
CLONE_URL="{REPO_URL}"
180186
fi
181-
GIT_TERMINAL_PROMPT=0 git clone --quiet "$CLONE_URL" /workspace/pgolf
187+
GIT_TERMINAL_PROMPT=0 git clone --quiet --filter=blob:none --no-tags "$CLONE_URL" /workspace/pgolf
182188
"""
183189

184190
return f"""set -e
185-
pip install -q sentencepiece huggingface-hub tiktoken zstandard brotli 2>/dev/null || true
191+
PIP_NO_CACHE_DIR=1 pip install -q --no-cache-dir sentencepiece huggingface-hub tiktoken zstandard brotli 2>/dev/null || true
186192
187193
{clone_setup}
188194
cd /workspace/pgolf
189195
git fetch origin {f'{branch}' if branch else '--all'}
190196
git checkout {commit_sha}
197+
rm -rf .git /root/.cache/pip ~/.cache/pip /tmp/pip-cache
191198
192199
export PYTHONUNBUFFERED=1
193200
@@ -333,77 +340,6 @@ def _log_path(job_id):
333340
return os.path.join(WORKSPACE, f"run_{job_id}.log")
334341

335342

336-
def _heartbeat_path(job_id=None):
337-
if job_id:
338-
return os.path.join(HEARTBEAT_DIR, f"{job_id}.json")
339-
return os.path.join(WORKSPACE, "heartbeat-latest.json")
340-
341-
342-
def _log_snapshot(log_file, max_tail_lines=8):
343-
if not log_file or not os.path.exists(log_file):
344-
return {
345-
"exists": False,
346-
"line_count": 0,
347-
"size_bytes": 0,
348-
"mtime": None,
349-
"tail": [],
350-
}
351-
st = os.stat(log_file)
352-
with open(log_file, "r", encoding="utf-8", errors="replace") as f:
353-
lines = f.read().splitlines()
354-
return {
355-
"exists": True,
356-
"line_count": len(lines),
357-
"size_bytes": st.st_size,
358-
"mtime": int(st.st_mtime),
359-
"tail": lines[-max_tail_lines:],
360-
}
361-
362-
363-
def _heartbeat_state_label(job_status, log_snapshot, now_ts=None):
364-
now_ts = int(now_ts or time.time())
365-
if job_status == "queueing":
366-
return "queued"
367-
if job_status in ("completed", "failed", "stopped", "timeout"):
368-
return job_status
369-
if not log_snapshot["exists"] or log_snapshot["line_count"] == 0:
370-
return "starting"
371-
mtime = log_snapshot.get("mtime")
372-
if mtime is None:
373-
return "running"
374-
if now_ts - mtime <= 90:
375-
return "streaming"
376-
return "quiet-running"
377-
378-
379-
def _write_heartbeat(kind, job_id=None, job_name=None, status=None, branch=None,
380-
commit=None, log_file=None, started_at=None, extra=None):
381-
now = int(time.time())
382-
snapshot = _log_snapshot(log_file)
383-
payload = {
384-
"kind": kind,
385-
"job_id": job_id,
386-
"job_name": job_name,
387-
"status": status,
388-
"state_label": _heartbeat_state_label(status, snapshot, now_ts=now),
389-
"branch": branch,
390-
"commit": commit,
391-
"log_file": log_file,
392-
"started_at": started_at,
393-
"updated_at": now,
394-
"elapsed_s": None if started_at is None else max(0, now - int(started_at)),
395-
"log": snapshot,
396-
}
397-
if extra:
398-
payload.update(extra)
399-
for path in {_heartbeat_path(), _heartbeat_path(job_id)}:
400-
try:
401-
with open(path, "w", encoding="utf-8") as f:
402-
json.dump(payload, f, indent=2, sort_keys=True)
403-
except Exception:
404-
pass
405-
406-
407343
def _has_final_results_content(content):
408344
"""Return True only when the final metric for the active eval mode is present."""
409345
if "results_json" in content:
@@ -674,8 +610,6 @@ def _signal_handler(signum, frame):
674610
commit = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True).stdout.strip()
675611
_log(f"branch={branch} commit={commit[:7]}")
676612

677-
started_at = int(time.time())
678-
679613
# 2. Create job
680614
try:
681615
job_name, job_id = _create_job(commit, node_group=args.node_group, branch=branch)
@@ -688,17 +622,6 @@ def _signal_handler(signum, frame):
688622
_main_state["log_file"] = log_file
689623
log_thread = threading.Thread(target=_stream_job_logs, args=(job_id, log_file), daemon=True)
690624
log_thread.start()
691-
_write_heartbeat(
692-
kind="eval",
693-
job_id=job_id,
694-
job_name=job_name,
695-
status="created",
696-
branch=branch,
697-
commit=commit[:7],
698-
log_file=log_file,
699-
started_at=started_at,
700-
extra={"threshold": args.threshold},
701-
)
702625

703626
# 4. Poll job status independently
704627
start = time.time()
@@ -707,17 +630,6 @@ def _signal_handler(signum, frame):
707630
status = _get_job_status(job_name, job_id)
708631
elapsed = int(time.time() - start)
709632
_log(f"[{elapsed}s] {job_name}: {status}")
710-
_write_heartbeat(
711-
kind="eval",
712-
job_id=job_id,
713-
job_name=job_name,
714-
status=status,
715-
branch=branch,
716-
commit=commit[:7],
717-
log_file=log_file,
718-
started_at=started_at,
719-
extra={"threshold": args.threshold},
720-
)
721633

722634
if status in ("completed", "failed", "stopped"):
723635
log_thread.join(timeout=15)
@@ -736,17 +648,6 @@ def _signal_handler(signum, frame):
736648
_log(f"Timeout after {args.timeout}s, stopping job")
737649
_stop_job_safe(job_id)
738650
log_thread.join(timeout=5)
739-
_write_heartbeat(
740-
kind="eval",
741-
job_id=job_id,
742-
job_name=job_name,
743-
status="timeout",
744-
branch=branch,
745-
commit=commit[:7],
746-
log_file=log_file,
747-
started_at=started_at,
748-
extra={"threshold": args.threshold},
749-
)
750651
_output(False, error=f"job timeout after {args.timeout}s")
751652

752653
# 5. Parse results from log
@@ -1074,17 +975,6 @@ def preflight(node_group=None, commit=None):
1074975
job_name, job_id = _create_job(commit, ng, branch=branch)
1075976
log_file = _log_path(job_id)
1076977
_log(f"Job: {job_name} ({job_id})")
1077-
started_at = int(time.time())
1078-
_write_heartbeat(
1079-
kind="preflight",
1080-
job_id=job_id,
1081-
job_name=job_name,
1082-
status="created",
1083-
branch=branch,
1084-
commit=commit[:7],
1085-
log_file=log_file,
1086-
started_at=started_at,
1087-
)
1088978

1089979
# Stream + poll
1090980
stream_thread = threading.Thread(target=_stream_job_logs, args=(job_id, log_file), daemon=True)
@@ -1097,16 +987,6 @@ def preflight(node_group=None, commit=None):
1097987
status = _get_job_status(job_name, job_id)
1098988
elapsed = int(time.time() - start)
1099989
_log(f"[preflight] [{elapsed}s] {status}")
1100-
_write_heartbeat(
1101-
kind="preflight",
1102-
job_id=job_id,
1103-
job_name=job_name,
1104-
status=status,
1105-
branch=branch,
1106-
commit=commit[:7],
1107-
log_file=log_file,
1108-
started_at=started_at,
1109-
)
1110990
if status in ("completed", "failed", "stopped"):
1111991
break
1112992

@@ -1152,21 +1032,6 @@ def preflight(node_group=None, commit=None):
11521032
_log(f" OVERALL: {'PASS' if all_pass else 'FAIL'}")
11531033
if not all_pass:
11541034
_log(f" Failed checks: {[k for k,v in checks.items() if not v]}")
1155-
_write_heartbeat(
1156-
kind="preflight",
1157-
job_id=job_id,
1158-
job_name=job_name,
1159-
status="completed" if all_pass else status,
1160-
branch=branch,
1161-
commit=commit[:7],
1162-
log_file=log_file,
1163-
started_at=started_at,
1164-
extra={
1165-
"preflight_pass": all_pass,
1166-
"preflight_checks": checks,
1167-
"details": results,
1168-
},
1169-
)
11701035
return all_pass
11711036

11721037

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,3 +9,4 @@ datasets
99
tiktoken
1010
sentencepiece
1111
brotli
12+
flash-attn-3

train_gpt.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ class Hyperparameters:
1818
seed = int(os.environ.get("SEED", 1337))
1919
run_id = os.environ.get("RUN_ID", str(uuid.uuid4()))
2020
iterations = int(os.environ.get("ITERATIONS", 20000))
21-
warmdown_frac = float(os.environ.get("WARMDOWN_FRAC", 0.667))
21+
warmdown_frac = float(os.environ.get("WARMDOWN_FRAC", 0.75))
2222
warmup_steps = int(os.environ.get("WARMUP_STEPS", 20))
2323
train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786432))
2424
train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048))
@@ -55,7 +55,7 @@ class Hyperparameters:
5555
head_lr = float(os.environ.get("HEAD_LR", 0.008))
5656
tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.03))
5757
tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005))
58-
matrix_lr = float(os.environ.get("MATRIX_LR", 0.022))
58+
matrix_lr = float(os.environ.get("MATRIX_LR", 0.026))
5959
scalar_lr = float(os.environ.get("SCALAR_LR", 0.02))
6060
muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.97))
6161
muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5))
@@ -72,8 +72,8 @@ class Hyperparameters:
7272
muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95))
7373
adam_wd = float(os.environ.get("ADAM_WD", 0.02))
7474
muon_wd = float(os.environ.get("MUON_WD", 0.095))
75-
embed_wd = float(os.environ.get("EMBED_WD", 0.095))
76-
ema_decay = float(os.environ.get("EMA_DECAY", 0.997))
75+
embed_wd = float(os.environ.get("EMBED_WD", 0.085))
76+
ema_decay = float(os.environ.get("EMA_DECAY", 0.9965))
7777
ttt_enabled = bool(int(os.environ.get("TTT_ENABLED", "1")))
7878
ttt_lora_rank = int(os.environ.get("TTT_LORA_RANK", 96))
7979
ttt_lora_lr = float(os.environ.get("TTT_LORA_LR", 0.0001))
@@ -98,9 +98,11 @@ class Hyperparameters:
9898
gptq_calibration_batches = int(os.environ.get("GPTQ_CALIBRATION_BATCHES", 64))
9999
gptq_reserve_seconds = float(os.environ.get("GPTQ_RESERVE_SECONDS", 12.0))
100100
matrix_bits = int(os.environ.get("MATRIX_BITS", 6))
101-
embed_bits = int(os.environ.get("EMBED_BITS", 8))
101+
embed_bits = int(os.environ.get("EMBED_BITS", 7))
102102
matrix_clip_sigmas = float(os.environ.get("MATRIX_CLIP_SIGMAS", 12.85))
103-
embed_clip_sigmas = float(os.environ.get("EMBED_CLIP_SIGMAS", 2e1))
103+
embed_clip_sigmas = float(os.environ.get("EMBED_CLIP_SIGMAS", 15.0))
104+
mlp_clip_sigmas = float(os.environ.get("MLP_CLIP_SIGMAS", 12.0))
105+
attn_clip_sigmas = float(os.environ.get("ATTN_CLIP_SIGMAS", 13.0))
104106
distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ
105107
rank = int(os.environ.get("RANK", "0"))
106108
world_size = int(os.environ.get("WORLD_SIZE", "1"))
@@ -1674,7 +1676,14 @@ def gptq_mixed_quantize(state_dict, hessians, h):
16741676
result[name] = t.to(torch.float16) if t.is_floating_point() else t
16751677
meta[name] = "passthrough (float16)"
16761678
continue
1677-
cs = h.embed_clip_sigmas if "tok_emb" in name else h.matrix_clip_sigmas
1679+
if "tok_emb" in name:
1680+
cs = h.embed_clip_sigmas
1681+
elif ".mlp." in name:
1682+
cs = h.mlp_clip_sigmas
1683+
elif ".attn." in name:
1684+
cs = h.attn_clip_sigmas
1685+
else:
1686+
cs = h.matrix_clip_sigmas
16781687
bits = h.embed_bits if "tok_emb" in name else h.matrix_bits
16791688
q, s = gptq_quantize_weight(
16801689
t, hessians[name], clip_sigmas=cs, clip_range=2 ** (bits - 1) - 1

0 commit comments

Comments
 (0)