From 23786676947a34cace94a9543a7fc3eec7a2beab Mon Sep 17 00:00:00 2001 From: junyyang Date: Thu, 25 Jun 2026 08:08:32 +0000 Subject: [PATCH 1/4] Modify model cache mount --- .../atom-sglang-accuracy-validation-gpu-shard.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml index f01f48b99..577e96b0e 100644 --- a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml @@ -100,25 +100,25 @@ jobs: - name: Prepare model cache mount run: | MODEL_CACHE_MOUNT="" - MODEL_CACHE_ROOT="/models" + MODEL_CACHE_ROOT="" MODEL_CACHE_DESC="" if [ -d "/shared/data/WRH/models" ]; then MODEL_CACHE_ROOT="/shared/data/WRH/models" - MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:${MODEL_CACHE_ROOT}" + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:/models" MODEL_CACHE_DESC="${MODEL_CACHE_ROOT} (host mount)" elif [ -d "/mnt/raid0/pretrained_model" ]; then MODEL_CACHE_ROOT="/mnt/raid0/pretrained_model" - MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:${MODEL_CACHE_ROOT}" + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:/models" MODEL_CACHE_DESC="${MODEL_CACHE_ROOT} (host mount)" elif [ -d "/data/pretrained_model" ]; then MODEL_CACHE_ROOT="/data/pretrained_model" - MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:${MODEL_CACHE_ROOT}" + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_ROOT}:/models" MODEL_CACHE_DESC="${MODEL_CACHE_ROOT} (host mount)" else MODEL_CACHE_ROOT="/mnt/raid0/pretrained_model" MODEL_CACHE_DESC="container-local ${MODEL_CACHE_ROOT} (no host cache mount)" - echo "Warning: Neither /mnt/raid0/pretrained_model nor /data/pretrained_model exists on runner; using container-local ${MODEL_CACHE_ROOT}." + echo "Warning: Neither /mnt/raid0/pretrained_model nor /data/pretrained_model nor /shared/data/WRH/models exists on runner; using container-local ${MODEL_CACHE_ROOT}." fi echo "Using model cache backend: ${MODEL_CACHE_DESC}" From af167dd47ee37b8a8ec19e07c6849afbcd0023ef Mon Sep 17 00:00:00 2001 From: junyyang Date: Thu, 25 Jun 2026 08:49:03 +0000 Subject: [PATCH 2/4] Remove environment parameters about cache from container --- .../workflows/atom-sglang-accuracy-validation-gpu-shard.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml index 577e96b0e..b3bffbe39 100644 --- a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml @@ -166,9 +166,6 @@ jobs: --ulimit stack=67108864 \ --env-file /tmp/sglang_env_file.txt \ -e HF_TOKEN="${HF_TOKEN:-}" \ - -e HF_HOME="${MODEL_CACHE_ROOT}/.cache/huggingface" \ - -e HUGGINGFACE_HUB_CACHE="${MODEL_CACHE_ROOT}/.cache/huggingface/hub" \ - -e TRANSFORMERS_CACHE="${MODEL_CACHE_ROOT}/.cache/huggingface/transformers" \ --name "$CONTAINER_NAME" \ "${SGLANG_IMAGE_REF:-${SGLANG_IMAGE_TAG}}" env: From 74eb5e73be5d655f6d394b7be510755b3c83a02b Mon Sep 17 00:00:00 2001 From: junyyang Date: Thu, 25 Jun 2026 09:23:40 +0000 Subject: [PATCH 3/4] remove 'clean up containers' --- .../atom-sglang-accuracy-validation-gpu-shard.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml index b3bffbe39..b03ac7589 100644 --- a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml @@ -126,13 +126,13 @@ jobs: echo "MODEL_CACHE_MOUNT=${MODEL_CACHE_MOUNT}" >> "$GITHUB_ENV" echo "MODEL_CACHE_DESC=${MODEL_CACHE_DESC}" >> "$GITHUB_ENV" - - name: Clean up old containers - run: | - containers=$($CONTAINER_ENGINE ps -q) - if [ -n "$containers" ]; then - $CONTAINER_ENGINE kill $containers || true - fi - $CONTAINER_ENGINE rm -f "$CONTAINER_NAME" 2>/dev/null || true + #- name: Clean up old containers + # run: | + # containers=$($CONTAINER_ENGINE ps -q) + # if [ -n "$containers" ]; then + # $CONTAINER_ENGINE kill $containers || true + # fi + # $CONTAINER_ENGINE rm -f "$CONTAINER_NAME" 2>/dev/null || true - name: Start validation container run: | From 56a1bef54a2fda2d02d7277d2e1e2a4f25331c1c Mon Sep 17 00:00:00 2001 From: xytpai Date: Thu, 25 Jun 2026 10:14:21 -0500 Subject: [PATCH 4/4] Add 'clean up containers' --- .../atom-sglang-accuracy-validation-gpu-shard.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml index b03ac7589..b3bffbe39 100644 --- a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml @@ -126,13 +126,13 @@ jobs: echo "MODEL_CACHE_MOUNT=${MODEL_CACHE_MOUNT}" >> "$GITHUB_ENV" echo "MODEL_CACHE_DESC=${MODEL_CACHE_DESC}" >> "$GITHUB_ENV" - #- name: Clean up old containers - # run: | - # containers=$($CONTAINER_ENGINE ps -q) - # if [ -n "$containers" ]; then - # $CONTAINER_ENGINE kill $containers || true - # fi - # $CONTAINER_ENGINE rm -f "$CONTAINER_NAME" 2>/dev/null || true + - name: Clean up old containers + run: | + containers=$($CONTAINER_ENGINE ps -q) + if [ -n "$containers" ]; then + $CONTAINER_ENGINE kill $containers || true + fi + $CONTAINER_ENGINE rm -f "$CONTAINER_NAME" 2>/dev/null || true - name: Start validation container run: |