Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
71 changes: 71 additions & 0 deletions openseek/competition/pz/jouw/final-round/train-grpo-v4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# GRPO training (v4) for OpenSeek-Small-v1-SFT on the refined math-reasoning
# dataset, launched through verl's main_ppo entry point with the sglang
# rollout backend.
#
# Prerequisite: export WANDB_API_KEY in the environment before running
# (secrets must never be hardcoded in this script).

set -euo pipefail

# Use a dedicated variable instead of clobbering $HOME: overriding HOME also
# changes where wandb / pip / HF caches look for their config files.
WORKSPACE="/root/workspace"
MODEL_PATH="${WORKSPACE}/OpenSeek-Small-v1-SFT"

PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training"
RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning_v4"

export CUDA_VISIBLE_DEVICES=0,1,2,3
export HYDRA_FULL_ERROR=1

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before running this script}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__DISABLE_IPV6=true
export WANDB_START_METHOD=thread
export WANDB_SILENT=false
export WANDB__SERVICE_WAIT=300

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="${WORKSPACE}/data/math_reasoning_refine/train.parquet" \
    data.val_files="${WORKSPACE}/data/math_reasoning_refine/test.parquet" \
    data.train_batch_size=256 \
    data.max_prompt_length=1024 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    data.shuffle=True \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    actor_rollout_ref.actor.strategy=fsdp2 \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=0 \
    actor_rollout_ref.actor.optim.warmup_style="cosine" \
    actor_rollout_ref.actor.optim.min_lr_ratio=0.0 \
    actor_rollout_ref.actor.optim.num_cycles=0.5 \
    actor_rollout_ref.actor.optim.weight_decay=0.01 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \
    actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path="$MODEL_PATH" \
    actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.model.trust_remote_code=True \
    critic.strategy=fsdp2 \
    critic.optim.lr=1e-6 \
    critic.model.path="$MODEL_PATH" \
    critic.model.trust_remote_code=True \
    critic.ppo_micro_batch_size_per_gpu=16 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="$PROJECT_NAME" \
    trainer.experiment_name="$RUN_NAME" \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=250 \
    trainer.test_freq=25 \
    trainer.total_epochs=10
69 changes: 69 additions & 0 deletions openseek/competition/pz/jouw/final-round/train-grpo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# GRPO training (baseline run) for OpenSeek-Small-v1-SFT on the math-reasoning
# dataset, launched through verl's main_ppo entry point with the sglang
# rollout backend.
#
# Prerequisite: export WANDB_API_KEY in the environment before running
# (secrets must never be hardcoded in this script).

set -euo pipefail

# Use a dedicated variable instead of clobbering $HOME: overriding HOME also
# changes where wandb / pip / HF caches look for their config files.
WORKSPACE="/root/workspace"
MODEL_PATH="${WORKSPACE}/OpenSeek-Small-v1-SFT"

PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training"
RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning"

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HYDRA_FULL_ERROR=1

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before running this script}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__DISABLE_IPV6=true
export WANDB_START_METHOD=thread
export WANDB_SILENT=false
export WANDB__SERVICE_WAIT=300

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="${WORKSPACE}/data/math_reasoning/train.parquet" \
    data.val_files="${WORKSPACE}/data/math_reasoning/test.parquet" \
    data.train_batch_size=256 \
    data.max_prompt_length=2048 \
    data.max_response_length=2048 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    actor_rollout_ref.actor.strategy=fsdp2 \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=20 \
    actor_rollout_ref.actor.optim.warmup_style="cosine" \
    actor_rollout_ref.actor.optim.min_lr_ratio=0.1 \
    actor_rollout_ref.actor.optim.num_cycles=0.5 \
    actor_rollout_ref.actor.optim.weight_decay=0.01 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \
    actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path="$MODEL_PATH" \
    actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.model.trust_remote_code=True \
    critic.strategy=fsdp2 \
    critic.optim.lr=1e-6 \
    critic.model.path="$MODEL_PATH" \
    critic.model.trust_remote_code=True \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="$PROJECT_NAME" \
    trainer.experiment_name="$RUN_NAME" \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=500 \
    trainer.test_freq=100 \
    trainer.total_epochs=10
Binary file not shown.
101 changes: 101 additions & 0 deletions openseek/competition/pz/jouw/preliminary-round/submit-training-job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/bin/bash
#SBATCH --gpus=4
#SBATCH -x paraai-n32-h-01-agent-[1,4,8,16,17,25,27,28,29,30,31]
# SLURM pretraining job for OpenSeek-Small-v1 (15B-token math experiment 2)
# using FlagScale / Megatron-LM via torchrun.
#
# Prerequisites (set in the environment, never hardcoded here):
#   WANDB_API_KEY            - Weights & Biases API key (required)
#   HTTP_PROXY / HTTPS_PROXY - proxy URLs incl. credentials, if needed

# Strict mode; -u is enabled later because conda's activate scripts reference
# unset variables.
set -eo pipefail

module load compilers/cuda/12.4
module load cudnn/8.9.5.29_cuda12.x
module load compilers/gcc/12.2.0
module load cmake/3.26.3
module load miniforge3/24.1

source activate flagscale-train
set -u

# Common roots so every path is defined in exactly one place.
BASE_PATH="/home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline"
FLAGSCALE_PATH="/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale"
DATA_ROOT="/home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B"

WANDB_RUN="OpenSeek-Small-v1_tokens-15B-math-exp2"
# Timestamp is generated per run so repeated submissions get distinct log dirs
# (a hardcoded timestamp would make every run write to the same directory).
LOG_DIR="${BASE_PATH}/logs/details/host_0_localhost/$(date +'%Y%m%d_%H%M%S')"
SAVE_STEPS=720
EVAL_STEPS=100

export CUDA_VISIBLE_DEVICES=0,1,2,3

export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export PYTHONUNBUFFERED=1
export OMP_NUM_THREADS=1

ulimit -n 1048576

mkdir -p "${BASE_PATH}/checkpoints"
mkdir -p "${BASE_PATH}/logs"
mkdir -p "${BASE_PATH}/logs/pids"
mkdir -p "${BASE_PATH}/logs/details"
mkdir -p "${BASE_PATH}/tensorboard"
mkdir -p "${BASE_PATH}/wandb"

cd "$FLAGSCALE_PATH"

export PYTHONPATH="${FLAGSCALE_PATH}/third_party/Megatron-LM:${FLAGSCALE_PATH}:${PYTHONPATH:-}"

# Proxy credentials come from the environment; committing them to version
# control is a security risk.
export http_proxy="${HTTP_PROXY:-}"
export https_proxy="${HTTPS_PROXY:-}"

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before submitting this job}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__SERVICE_WAIT=300
export WANDB_SILENT=false
export WANDB_START_METHOD=thread
export WANDB_GROUP=$WANDB_RUN

# CUDA_VISIBLE_DEVICES is already exported above; no per-command override is
# needed (the former VISIBLE_DEVICES assignment was an unused typo).
DEVICE_MAX_CONNECTIONS=4 torchrun \
    --rdzv_backend static \
    --nnodes 1 \
    --nproc_per_node 4 \
    --rdzv_id default \
    --node_rank 0 \
    --rdzv_endpoint localhost:59249 \
    --log_dir "$LOG_DIR" \
    --redirects 3 \
    --tee 3 \
    flagscale/train/train_gpt.py \
    --no-load-optim --no-load-rng \
    --recompute-method uniform --recompute-granularity full --recompute-num-layers 6 \
    --moe-router-dtype fp32 --num-workers 1 \
    --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --expert-model-parallel-size 1 --context-parallel-size 1 \
    --disable-bias-linear --reset-position-ids --reset-attention-mask \
    --qk-layernorm --sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather \
    --bf16 --attention-softmax-in-fp32 --accumulate-allreduce-grads-in-fp32 \
    --log-interval 1 --tensorboard-log-interval 1 \
    --wandb-mode online \
    --wandb-api-key "$WANDB_API_KEY" \
    --wandb-project OpenSeek-Small-v1 \
    --wandb-exp-name "$WANDB_RUN" \
    --log-timers-to-tensorboard --log-validation-ppl-to-tensorboard \
    --log-throughput --log-params-norm --log-num-zeros-in-grad --log-memory-to-tensorboard \
    --tensorboard-dir "${BASE_PATH}/tensorboard" \
    --wandb-save-dir "${BASE_PATH}/wandb" \
    --save-interval "$SAVE_STEPS" \
    --load "${BASE_PATH}/checkpoints" \
    --ckpt-format torch \
    --save "${BASE_PATH}/checkpoints" \
    --transformer-impl transformer_engine --num-layers 6 --hidden-size 1280 \
    --num-attention-heads 10 --num-query-groups 10 --seq-length 4096 --max-position-embeddings 4096 --norm-epsilon 1e-06 \
    --use-rotary-position-embeddings --rotary-base 1000000 \
    --swiglu --normalization RMSNorm --init-method-std 0.006 --attention-dropout 0.0 --hidden-dropout 0.0 --clip-grad 1.0 \
    --position-embedding-type rope --no-position-embedding --no-rope-fusion --multi-latent-attention \
    --kv-lora-rank 512 --qk-head-dim 128 --qk-pos-emb-head-dim 64 --v-head-dim 128 --ffn-hidden-size 7168 \
    --moe-ffn-hidden-size 896 --moe-grouped-gemm --moe-shared-expert-intermediate-size 1792 \
    --num-experts 64 --moe-router-load-balancing-type seq_aux_loss --moe-router-score-function sigmoid \
    --moe-router-enable-expert-bias --moe-router-bias-update-rate 0.001 --moe-aux-loss-coeff 0.0001 \
    --moe-layer-freq '[0]+[1]*5' --moe-router-num-groups 1 --moe-router-group-topk 1 --moe-router-topk 6 \
    --moe-router-topk-scaling-factor 2.446 --moe-token-dispatcher-type alltoall --seed 42 \
    --micro-batch-size 2 --global-batch-size 1024 \
    --eval-iters 4 --eval-interval "$EVAL_STEPS" \
    --train-iters 3620 \
    --weight-decay 0.01 \
    --adam-beta1 0.9 --adam-beta2 0.95 \
    --lr 0.00001 --min-lr 0.0000001 \
    --lr-warmup-iters 50 --lr-warmup-samples 0 --lr-decay-style cosine \
    --data-path \
        1.1068 "${DATA_ROOT}/Nemotron-CC-high-actual-actual-high/part_142_text_document" \
        0.5397 "${DATA_ROOT}/Nemotron-CC-high-synthetic-diverse_qa_pairs-high/part_244_text_document" \
        0.4616 "${DATA_ROOT}/Nemotron-CC-high-synthetic-extract_knowledge-high/part_498_text_document" \
        0.261 "${DATA_ROOT}/Nemotron-CC-high-synthetic-knowledge_list-high/part_86_text_document" \
        0.6414 "${DATA_ROOT}/arxiv/007_00000_text_document" \
        0.4696 "${DATA_ROOT}/books/016_00007_text_document" \
        1.0102 "${DATA_ROOT}/code-high/part_13_text_document" \
        0.3755 "${DATA_ROOT}/cot_synthesis2_CC-high/23_text_document" \
        0.4598 "${DATA_ROOT}/cot_synthesis2_code-high/4_text_document" \
        1.3135 "${DATA_ROOT}/cot_synthesis2_math-high/12_text_document" \
        0.3536 "${DATA_ROOT}/cot_synthesis2_math-mid/5_text_document" \
        0.6314 "${DATA_ROOT}/cot_synthesis2_wiki-high/5_text_document" \
        0.5074 "${DATA_ROOT}/cot_synthesis_math-high/11_text_document" \
        0.6406 "${DATA_ROOT}/cot_synthesis_math-mid/29_text_document" \
        1.8165 "${DATA_ROOT}/math-high/part_04_text_document" \
        1.6311 "${DATA_ROOT}/math-mid/part_07_text_document" \
        0.4202 "${DATA_ROOT}/wiki/012_00000_text_document" \
        1.8171 "${DATA_ROOT}/zh_cc-high-loss0/part_28_text_document" \
    --split 998,1,1 \
    --no-mmap-bin-files \
    --tokenizer-type QwenTokenizerFS --tokenizer-path ../hf_openseek/tokenizer \
    --vocab-size 151851 --make-vocab-size-divisible-by 64