From f5931476b85975703e6d2dc168cb635c0d588557 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sat, 30 May 2026 03:49:00 +0800 Subject: [PATCH 1/5] dsv4-fp4-b300-sglang: update image to nightly-dev-cu13-20260529-a8cfae0b, refactor script, switch to megamoe --- .github/configs/nvidia-master.yaml | 26 +-- .../single_node/dsv4_fp4_b300_sglang.sh | 201 ++++++++---------- 2 files changed, 100 insertions(+), 127 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f8cc486b2..fcdda2e6a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1999,26 +1999,22 @@ dsr1-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 1, conc-end: 32 } -# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# lists B200 (not B300) as the Blackwell target. This config reuses the -# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300 -# until a B300-specific recipe ships. Prefix caching is disabled. -# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. +# DeepSeek-V4-Pro on B300 with sglang (non-MTP). +# Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 + image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC: - # low-latency (CONC <= 32): TP-only - # balanced (32 < CONC <= 128): + DP-attn - # max-throughput (CONC > 128): + DP-attn - # Split so result filenames (ep=, dpa=) accurately reflect the recipe. 
- # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh + # by CONC: + # CONC 1|32: TP-only, flashinfer_mxfp4 + # CONC 512: DP-attn, flashinfer_mxfp4 + # CONC 2048-8192: DP-attn, megamoe + # ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. scenarios: fixed-seq-len: @@ -2027,14 +2023,14 @@ dsv4-fp4-b300-sglang: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index b6ced157c..e353a1cbe 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -30,20 +30,9 @@ fi nvidia-smi -# Common SGLANG env vars (apply to every config). 
+# ─── Common env vars (all profiles) ─────────────────────────────────────────── export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - -# TODO(Cam): the deepseek-v4 sglang images install sglang editable at -# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. -# The runner mounts our repo at a non-/workspace path for these images so the -# editable install stays visible. Paths in this script are $PWD-relative for -# that reason. Drop the runner conditional once lmsys moves sglang back out of -# /workspace. SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} @@ -58,114 +47,101 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was +# ─── Per-concurrency launch profile ────────────────────────────────────────── +# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, +# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. +# +# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. -if [[ "$ISL" == "1024" ]]; then - SWA_FULL_TOKENS_RATIO=0.5 -else - SWA_FULL_TOKENS_RATIO=0.1 -fi -# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm -# script's pattern). DP-attention runs the empirically-tuned high-concurrency -# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer); -# single-instance uses flashinfer_mxfp4 with the cookbook defaults. -DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - -# Default; the DP-attn branch below overrides to 0.94. 
-MEM_FRACTION_STATIC=0.90 +if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then + # TP-only, no DP attention + MEM_FRACTION_STATIC=0.90 + SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune + ) -if [ "${DP_ATTENTION}" = "true" ]; then +elif [ "$CONC" = "512" ]; then + # DP attention, flashinfer_mxfp4 export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc - # recipes first (they also have ep=8) so they aren't shadowed by the - # medium-conc EP_SIZE=8 branch below. - if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - if [ "$CONC" = "2048" ]; then - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 - CUDA_GRAPH_MAX_BS=288 - MAX_RUNNING_REQUESTS=2560 - MEM_FRACTION_STATIC=0.87 - SWA_FULL_TOKENS_RATIO=0.06 - TOKENIZER_WORKER_NUM=4 - elif [ "$CONC" = "4096" ]; then - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 - CUDA_GRAPH_MAX_BS=544 - MAX_RUNNING_REQUESTS=4352 - MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.075 - TOKENIZER_WORKER_NUM=8 - else - export SGLANG_OPT_USE_ONLINE_COMPRESS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 - CUDA_GRAPH_MAX_BS=1088 - MAX_RUNNING_REQUESTS=8192 - MEM_FRACTION_STATIC=0.80 - SWA_FULL_TOKENS_RATIO=0.3 - TOKENIZER_WORKER_NUM=16 - fi - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - 
--deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 65536 - --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" - --enable-prefill-delayer - ) - if [ "$CONC" = "4096" ]; then - PARALLEL_ARGS+=(--decode-log-interval 5) - fi - if [ "$CONC" = "8192" ]; then - PARALLEL_ARGS+=(--stream-interval 30) - fi - elif [ "${EP_SIZE}" = "8" ]; then - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs 550 - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MAX_RUNNING_REQUESTS=768 - MEM_FRACTION_STATIC=0.94 - else - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 - fi -else + MEM_FRACTION_STATIC=0.94 + SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 --disable-flashinfer-autotune + --chunked-prefill-size 16384 + --enable-prefill-delayer ) + +elif [ "$CONC" = "2048" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 + MEM_FRACTION_STATIC=0.87 + SWA_FULL_TOKENS_RATIO=0.06 + MAX_RUNNING_REQUESTS=2560 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + 
--cuda-graph-max-bs 288 + --chunked-prefill-size 65536 + --tokenizer-worker-num 4 + --enable-prefill-delayer + ) + +elif [ "$CONC" = "4096" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 + MEM_FRACTION_STATIC=0.835 + SWA_FULL_TOKENS_RATIO=0.075 + MAX_RUNNING_REQUESTS=4352 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + --cuda-graph-max-bs 544 + --chunked-prefill-size 65536 + --tokenizer-worker-num 8 + --enable-prefill-delayer + --decode-log-interval 5 + ) + +elif [ "$CONC" = "8192" ]; then + # DP attention, megamoe + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 + MEM_FRACTION_STATIC=0.80 + SWA_FULL_TOKENS_RATIO=0.3 + MAX_RUNNING_REQUESTS=8192 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend megamoe + --cuda-graph-max-bs 1088 + --chunked-prefill-size 65536 + --tokenizer-worker-num 16 + --enable-prefill-delayer + --stream-interval 30 + ) + +else + echo "ERROR: unsupported CONC=$CONC" >&2 + exit 1 fi # Print all SGLANG_* env vars to both the CI step log and server.log so the @@ -193,6 +169,7 @@ SERVER_PID=$! 
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" pip install -q datasets pandas +pip install -q --upgrade transformers run_benchmark_serving \ --model "$MODEL" \ From 0ba92fdb39aa14439613456e7e836b23348bfa91 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sat, 30 May 2026 03:49:00 +0800 Subject: [PATCH 2/5] Append perf-changelog entry for PR #1506 --- perf-changelog.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 935cded22..93b4b587c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3200,3 +3200,16 @@ - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]" - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Update sglang image to nightly-dev-cu13-20260529-a8cfae0b" + - "Refactor benchmark script to dispatch by CONC instead of nested DP_ATTENTION/CONC/EP_SIZE" + - "Switch CONC 2048/4096/8192 from --moe-a2a-backend deepep to megamoe" + - "Remove env vars deleted from sglang main (SGLANG_OPT_USE_JIT_NORM, SGLANG_OPT_USE_FAST_MASK_EP, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE)" + - "Remove env vars redundant with sglang defaults (SGLANG_OPT_USE_JIT_INDEXER_METADATA, SGLANG_OPT_USE_TOPK_V2, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)" + - "Remove env vars auto-set by megamoe backend (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY)" + - "Remove --deepep-config and SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK (unused by megamoe/StandardDispatcher)" + - "Fix CONC=512 yaml ep from 4 to 1 (flashinfer_mxfp4 does not set ep=tp)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1506 From f584272e91c70b525f13ddb267a31330c5836e45 Mon Sep 17 00:00:00 2001 From: Oseltamivir 
<58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 30 May 2026 10:03:51 -0700 Subject: [PATCH 3/5] launch_b300-nv: redirect enroot off stale /scratch during import The head node's /scratch is an NFS mount that can return a stale file handle. enroot's runtime/cache/data/temp dirs are pinned under /scratch by /etc/enroot/enroot.conf{,.d}, so on a stale mount `enroot import` cannot create its working dirs and produces no .sqsh. That surfaces downstream as a cryptic pyxis "No such file or directory: ...sqsh" on the compute node and fails the single-node canary (e.g. actions run 26658745339). When /scratch is unusable, probe it and redirect enroot's paths to the healthy /data share for the import only. The exports stay inside the import subshell, so the salloc/srun below (and the compute node's own /scratch) are unaffected; on a healthy head node the probe passes and behavior is identical. Also fail fast if the import still can't produce a squash instead of proceeding to a doomed srun. Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_b300-nv.sh | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 7b4a712f9..6a329cb7b 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -358,9 +358,32 @@ else echo "Squash file already exists and is valid, skipping import" else rm -f "$SQUASH_FILE" - enroot import -o "$SQUASH_FILE" "docker://$IMAGE" + # /etc/enroot/enroot.conf{,.d} pin enroot's runtime/cache/data/temp + # dirs under /scratch. On this head node /scratch is an NFS mount that + # can go stale ("Stale file handle"), which makes `enroot import` fail + # to create its working dirs and produce no .sqsh -- surfacing later as + # a cryptic pyxis "No such file or directory" on the compute node. When + # /scratch is unusable, redirect enroot to the healthy /data share for + # this import only. 
These exports stay inside this subshell, so the + # salloc/srun below (and the compute node's own /scratch) are untouched. + scratch_probe="/scratch/tmp/.estale_probe.$$" + if { mkdir -p /scratch/tmp && touch "$scratch_probe"; } 2>/dev/null; then + rm -f "$scratch_probe" + else + enroot_fallback="$(dirname "$(dirname "$SQUASH_FILE")")/enroot" + enroot_uid="$(id -u)" + echo "WARN: /scratch unusable (stale handle?); redirecting enroot to $enroot_fallback for this import" >&2 + export ENROOT_RUNTIME_PATH="$enroot_fallback/runtime/user-$enroot_uid" + export ENROOT_CACHE_PATH="$enroot_fallback/cache/user-$enroot_uid" + export ENROOT_DATA_PATH="$enroot_fallback/data/user-$enroot_uid" + export ENROOT_TEMP_PATH="$enroot_fallback/tmp" + mkdir -p "$ENROOT_RUNTIME_PATH" "$ENROOT_CACHE_PATH" \ + "$ENROOT_DATA_PATH" "$ENROOT_TEMP_PATH" + fi + enroot import -o "$SQUASH_FILE" "docker://$IMAGE" \ + || { echo "enroot import failed for $IMAGE" >&2; exit 1; } fi - ) + ) || exit 1 salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) From 88e4b9f9a72e0a3326f97af7f48a96301df9bd82 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 30 May 2026 11:17:02 -0700 Subject: [PATCH 4/5] Revert "launch_b300-nv: redirect enroot off stale /scratch during import" This reverts commit f584272e. /scratch has been remounted on the head node and the stale NFS handle is cleared, so the enroot temp/cache/data redirect workaround is no longer needed. Restores the original import. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_b300-nv.sh | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 6a329cb7b..7b4a712f9 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -358,32 +358,9 @@ else echo "Squash file already exists and is valid, skipping import" else rm -f "$SQUASH_FILE" - # /etc/enroot/enroot.conf{,.d} pin enroot's runtime/cache/data/temp - # dirs under /scratch. On this head node /scratch is an NFS mount that - # can go stale ("Stale file handle"), which makes `enroot import` fail - # to create its working dirs and produce no .sqsh -- surfacing later as - # a cryptic pyxis "No such file or directory" on the compute node. When - # /scratch is unusable, redirect enroot to the healthy /data share for - # this import only. These exports stay inside this subshell, so the - # salloc/srun below (and the compute node's own /scratch) are untouched. 
- scratch_probe="/scratch/tmp/.estale_probe.$$" - if { mkdir -p /scratch/tmp && touch "$scratch_probe"; } 2>/dev/null; then - rm -f "$scratch_probe" - else - enroot_fallback="$(dirname "$(dirname "$SQUASH_FILE")")/enroot" - enroot_uid="$(id -u)" - echo "WARN: /scratch unusable (stale handle?); redirecting enroot to $enroot_fallback for this import" >&2 - export ENROOT_RUNTIME_PATH="$enroot_fallback/runtime/user-$enroot_uid" - export ENROOT_CACHE_PATH="$enroot_fallback/cache/user-$enroot_uid" - export ENROOT_DATA_PATH="$enroot_fallback/data/user-$enroot_uid" - export ENROOT_TEMP_PATH="$enroot_fallback/tmp" - mkdir -p "$ENROOT_RUNTIME_PATH" "$ENROOT_CACHE_PATH" \ - "$ENROOT_DATA_PATH" "$ENROOT_TEMP_PATH" - fi - enroot import -o "$SQUASH_FILE" "docker://$IMAGE" \ - || { echo "enroot import failed for $IMAGE" >&2; exit 1; } + enroot import -o "$SQUASH_FILE" "docker://$IMAGE" fi - ) || exit 1 + ) salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) From 33719a6f2bd260e5b198019f591dd5791db7ca60 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 30 May 2026 19:50:40 -0700 Subject: [PATCH 5/5] launch_b300-nv: run enroot import on local disk to fix NFS whiteout EPERM The single-node import extracts under ENROOT_TEMP_PATH, which /etc/enroot/enroot.conf pins to NFS /scratch. enroot-aufs2ovlfs unpacks the image's root-owned AUFS whiteout markers into a sticky /tmp and then can't unlink them over NFS (root-squash strips the CAP_FOWNER it needs), failing with 'failed to remove aufs whiteout: Operation not permitted' and producing no .sqsh -- which then surfaces as a pyxis 'No such file or directory' on the compute node. Run the import on local disk, where the extracted files are owned by the runner user and removable. 
Scoped to the import subshell and cleaned up on exit, so salloc/srun and the compute node's own /scratch are unaffected. Proper fix is to point ENROOT_TEMP_PATH at local disk in enroot.conf cluster-wide; this is the no-root workaround. Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_b300-nv.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 7b4a712f9..67e8b48cc 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -358,6 +358,23 @@ else echo "Squash file already exists and is valid, skipping import" else rm -f "$SQUASH_FILE" + # enroot's working dirs are pinned to NFS /scratch by + # /etc/enroot/enroot.conf, but enroot-aufs2ovlfs unpacks the image's + # root-owned whiteout markers into a sticky /tmp and then can't unlink + # them over NFS -- root-squash strips the CAP_FOWNER it would need, so + # it fails with "failed to remove aufs whiteout: Operation not + # permitted" and writes no .sqsh. Run the import on local disk, where + # the extracted files are owned by us and removable. Scoped to this + # subshell (and cleaned up on exit), so the salloc/srun below and the + # compute node's own /scratch are unaffected. + enroot_local="$(mktemp -d /tmp/enroot-import.XXXXXX)" || { echo "ERROR: mktemp failed; cannot stage enroot import on local disk" >&2; exit 1; } + trap 'rm -rf "$enroot_local"' EXIT + export ENROOT_TEMP_PATH="$enroot_local/tmp" + export ENROOT_CACHE_PATH="$enroot_local/cache" + export ENROOT_DATA_PATH="$enroot_local/data" + export ENROOT_RUNTIME_PATH="$enroot_local/run" + mkdir -p "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" \ + "$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH" enroot import -o "$SQUASH_FILE" "docker://$IMAGE" fi )