SemiAnalysisAI · Ankur-singh · May 20, 2026 · May 20, 2026 · claude · May 20, 2026
@@ -4504,15 +4504,7 @@ gptoss-fp4-h100-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
+      - { tp: 2, conc-start: 4, conc-end: 4 }
 
 minimaxm2.5-fp8-h100-vllm:
   image: vllm/vllm-openai:v0.21.0
@@ -4527,13 +4519,7 @@ minimaxm2.5-fp8-h100-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 }
 
 # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
@@ -9210,11 +9196,7 @@ qwen3.5-fp8-h100-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
 
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
@@ -9229,8 +9211,4 @@ qwen3.5-fp8-h100-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 MAX_MODEL_LEN=10240
 

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
@@ -17,7 +17,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 

diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh
@@ -25,7 +25,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}

diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
@@ -22,7 +22,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_ENABLE_SPEC_V2=1
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3050,3 +3050,14 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
+
+- config-keys:
+    - gptoss-fp4-h100-vllm
+    - minimaxm2.5-fp8-h100-vllm
+    - qwen3.5-fp8-h100-sglang
+    - qwen3.5-fp8-h100-sglang-mtp
+  description:
+    - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
+    - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
+    - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh
@@ -279,7 +279,19 @@ EOF
 
 else
 
-    HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/"
+    HF_HUB_CACHE_MOUNT="/mnt/numa1/shared/models/"
+
+    # HF_HUB_CACHE is exported so that eval jobs can download datasets inside
+    # the container. It can be pointed at another path on the cluster and
+    # mounted the same way as HF_HUB_CACHE_MOUNT.
+    export HF_HUB_CACHE="$HOME/.cache/huggingface"
+
+    # Rewrite MODEL from an HF id (org/name) to the pre-staged local path
+    # <HF_HUB_CACHE_MOUNT><name>; skip if MODEL is empty or already absolute.
+    if [[ -n "$MODEL" && "$MODEL" != /* ]]; then
+        export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}"
+    fi
+
     SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     LOCK_FILE="${SQUASH_FILE}.lock"
 
@@ -306,7 +318,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then @@
       echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
     fi
-    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
     MAX_MODEL_LEN=10240
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
Expand Up		@@ -22,7 +22,6 @@ fi

		nvidia-smi

		if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

		export SGLANG_ENABLE_SPEC_V2=1

Expand Down