Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
71 changes: 71 additions & 0 deletions openseek/competition/pz/jouw/final-round/train-grpo-v4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# GRPO training (v4) for OpenSeek-Small-v1-SFT on the refined math-reasoning
# dataset, launched through verl's main_ppo entry point with the sglang
# rollout backend.
#
# Prerequisite: export WANDB_API_KEY in the environment before running
# (secrets must never be hardcoded in this script).

set -euo pipefail

# Use a dedicated variable instead of clobbering $HOME: overriding HOME also
# changes where wandb / pip / HF caches look for their config files.
WORKSPACE="/root/workspace"
MODEL_PATH="${WORKSPACE}/OpenSeek-Small-v1-SFT"

PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training"
RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning_v4"

export CUDA_VISIBLE_DEVICES=0,1,2,3
export HYDRA_FULL_ERROR=1

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before running this script}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__DISABLE_IPV6=true
export WANDB_START_METHOD=thread
export WANDB_SILENT=false
export WANDB__SERVICE_WAIT=300

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="${WORKSPACE}/data/math_reasoning_refine/train.parquet" \
    data.val_files="${WORKSPACE}/data/math_reasoning_refine/test.parquet" \
    data.train_batch_size=256 \
    data.max_prompt_length=1024 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    data.shuffle=True \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    actor_rollout_ref.actor.strategy=fsdp2 \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=0 \
    actor_rollout_ref.actor.optim.warmup_style="cosine" \
    actor_rollout_ref.actor.optim.min_lr_ratio=0.0 \
    actor_rollout_ref.actor.optim.num_cycles=0.5 \
    actor_rollout_ref.actor.optim.weight_decay=0.01 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \
    actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path="$MODEL_PATH" \
    actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.model.trust_remote_code=True \
    critic.strategy=fsdp2 \
    critic.optim.lr=1e-6 \
    critic.model.path="$MODEL_PATH" \
    critic.model.trust_remote_code=True \
    critic.ppo_micro_batch_size_per_gpu=16 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="$PROJECT_NAME" \
    trainer.experiment_name="$RUN_NAME" \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=250 \
    trainer.test_freq=25 \
    trainer.total_epochs=10
69 changes: 69 additions & 0 deletions openseek/competition/pz/jouw/final-round/train-grpo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# GRPO training (baseline run) for OpenSeek-Small-v1-SFT on the math-reasoning
# dataset, launched through verl's main_ppo entry point with the sglang
# rollout backend.
#
# Prerequisite: export WANDB_API_KEY in the environment before running
# (secrets must never be hardcoded in this script).

set -euo pipefail

# Use a dedicated variable instead of clobbering $HOME: overriding HOME also
# changes where wandb / pip / HF caches look for their config files.
WORKSPACE="/root/workspace"
MODEL_PATH="${WORKSPACE}/OpenSeek-Small-v1-SFT"

PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training"
RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning"

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HYDRA_FULL_ERROR=1

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before running this script}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__DISABLE_IPV6=true
export WANDB_START_METHOD=thread
export WANDB_SILENT=false
export WANDB__SERVICE_WAIT=300

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="${WORKSPACE}/data/math_reasoning/train.parquet" \
    data.val_files="${WORKSPACE}/data/math_reasoning/test.parquet" \
    data.train_batch_size=256 \
    data.max_prompt_length=2048 \
    data.max_response_length=2048 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    actor_rollout_ref.actor.strategy=fsdp2 \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=20 \
    actor_rollout_ref.actor.optim.warmup_style="cosine" \
    actor_rollout_ref.actor.optim.min_lr_ratio=0.1 \
    actor_rollout_ref.actor.optim.num_cycles=0.5 \
    actor_rollout_ref.actor.optim.weight_decay=0.01 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \
    actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path="$MODEL_PATH" \
    actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \
    actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.model.trust_remote_code=True \
    critic.strategy=fsdp2 \
    critic.optim.lr=1e-6 \
    critic.model.path="$MODEL_PATH" \
    critic.model.trust_remote_code=True \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="$PROJECT_NAME" \
    trainer.experiment_name="$RUN_NAME" \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=500 \
    trainer.test_freq=100 \
    trainer.total_epochs=10
Binary file not shown.
101 changes: 101 additions & 0 deletions openseek/competition/pz/jouw/preliminary-round/submit-training-job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/bin/bash
#SBATCH --gpus=4
#SBATCH -x paraai-n32-h-01-agent-[1,4,8,16,17,25,27,28,29,30,31]
# SLURM pretraining job for OpenSeek-Small-v1 (15B-token math experiment 2)
# using FlagScale / Megatron-LM via torchrun.
#
# Prerequisites (set in the environment, never hardcoded here):
#   WANDB_API_KEY            - Weights & Biases API key (required)
#   HTTP_PROXY / HTTPS_PROXY - proxy URLs incl. credentials, if needed

# Strict mode; -u is enabled later because conda's activate scripts reference
# unset variables.
set -eo pipefail

module load compilers/cuda/12.4
module load cudnn/8.9.5.29_cuda12.x
module load compilers/gcc/12.2.0
module load cmake/3.26.3
module load miniforge3/24.1

source activate flagscale-train
set -u

# Common roots so every path is defined in exactly one place.
BASE_PATH="/home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline"
FLAGSCALE_PATH="/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale"
DATA_ROOT="/home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B"

WANDB_RUN="OpenSeek-Small-v1_tokens-15B-math-exp2"
# Timestamp is generated per run so repeated submissions get distinct log dirs
# (a hardcoded timestamp would make every run write to the same directory).
LOG_DIR="${BASE_PATH}/logs/details/host_0_localhost/$(date +'%Y%m%d_%H%M%S')"
SAVE_STEPS=720
EVAL_STEPS=100

export CUDA_VISIBLE_DEVICES=0,1,2,3

export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export PYTHONUNBUFFERED=1
export OMP_NUM_THREADS=1

ulimit -n 1048576

mkdir -p "${BASE_PATH}/checkpoints"
mkdir -p "${BASE_PATH}/logs"
mkdir -p "${BASE_PATH}/logs/pids"
mkdir -p "${BASE_PATH}/logs/details"
mkdir -p "${BASE_PATH}/tensorboard"
mkdir -p "${BASE_PATH}/wandb"

cd "$FLAGSCALE_PATH"

export PYTHONPATH="${FLAGSCALE_PATH}/third_party/Megatron-LM:${FLAGSCALE_PATH}:${PYTHONPATH:-}"

# Proxy credentials come from the environment; committing them to version
# control is a security risk.
export http_proxy="${HTTP_PROXY:-}"
export https_proxy="${HTTPS_PROXY:-}"

# Fail fast if the W&B key was not supplied by the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be exported before submitting this job}"
export WANDB_API_KEY
export WANDB_MODE=online
export WANDB__SERVICE_WAIT=300
export WANDB_SILENT=false
export WANDB_START_METHOD=thread
export WANDB_GROUP=$WANDB_RUN

# CUDA_VISIBLE_DEVICES is already exported above; no per-command override is
# needed (the former VISIBLE_DEVICES assignment was an unused typo).
DEVICE_MAX_CONNECTIONS=4 torchrun \
    --rdzv_backend static \
    --nnodes 1 \
    --nproc_per_node 4 \
    --rdzv_id default \
    --node_rank 0 \
    --rdzv_endpoint localhost:59249 \
    --log_dir "$LOG_DIR" \
    --redirects 3 \
    --tee 3 \
    flagscale/train/train_gpt.py \
    --no-load-optim --no-load-rng \
    --recompute-method uniform --recompute-granularity full --recompute-num-layers 6 \
    --moe-router-dtype fp32 --num-workers 1 \
    --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --expert-model-parallel-size 1 --context-parallel-size 1 \
    --disable-bias-linear --reset-position-ids --reset-attention-mask \
    --qk-layernorm --sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather \
    --bf16 --attention-softmax-in-fp32 --accumulate-allreduce-grads-in-fp32 \
    --log-interval 1 --tensorboard-log-interval 1 \
    --wandb-mode online \
    --wandb-api-key "$WANDB_API_KEY" \
    --wandb-project OpenSeek-Small-v1 \
    --wandb-exp-name "$WANDB_RUN" \
    --log-timers-to-tensorboard --log-validation-ppl-to-tensorboard \
    --log-throughput --log-params-norm --log-num-zeros-in-grad --log-memory-to-tensorboard \
    --tensorboard-dir "${BASE_PATH}/tensorboard" \
    --wandb-save-dir "${BASE_PATH}/wandb" \
    --save-interval "$SAVE_STEPS" \
    --load "${BASE_PATH}/checkpoints" \
    --ckpt-format torch \
    --save "${BASE_PATH}/checkpoints" \
    --transformer-impl transformer_engine --num-layers 6 --hidden-size 1280 \
    --num-attention-heads 10 --num-query-groups 10 --seq-length 4096 --max-position-embeddings 4096 --norm-epsilon 1e-06 \
    --use-rotary-position-embeddings --rotary-base 1000000 \
    --swiglu --normalization RMSNorm --init-method-std 0.006 --attention-dropout 0.0 --hidden-dropout 0.0 --clip-grad 1.0 \
    --position-embedding-type rope --no-position-embedding --no-rope-fusion --multi-latent-attention \
    --kv-lora-rank 512 --qk-head-dim 128 --qk-pos-emb-head-dim 64 --v-head-dim 128 --ffn-hidden-size 7168 \
    --moe-ffn-hidden-size 896 --moe-grouped-gemm --moe-shared-expert-intermediate-size 1792 \
    --num-experts 64 --moe-router-load-balancing-type seq_aux_loss --moe-router-score-function sigmoid \
    --moe-router-enable-expert-bias --moe-router-bias-update-rate 0.001 --moe-aux-loss-coeff 0.0001 \
    --moe-layer-freq '[0]+[1]*5' --moe-router-num-groups 1 --moe-router-group-topk 1 --moe-router-topk 6 \
    --moe-router-topk-scaling-factor 2.446 --moe-token-dispatcher-type alltoall --seed 42 \
    --micro-batch-size 2 --global-batch-size 1024 \
    --eval-iters 4 --eval-interval "$EVAL_STEPS" \
    --train-iters 3620 \
    --weight-decay 0.01 \
    --adam-beta1 0.9 --adam-beta2 0.95 \
    --lr 0.00001 --min-lr 0.0000001 \
    --lr-warmup-iters 50 --lr-warmup-samples 0 --lr-decay-style cosine \
    --data-path \
        1.1068 "${DATA_ROOT}/Nemotron-CC-high-actual-actual-high/part_142_text_document" \
        0.5397 "${DATA_ROOT}/Nemotron-CC-high-synthetic-diverse_qa_pairs-high/part_244_text_document" \
        0.4616 "${DATA_ROOT}/Nemotron-CC-high-synthetic-extract_knowledge-high/part_498_text_document" \
        0.261 "${DATA_ROOT}/Nemotron-CC-high-synthetic-knowledge_list-high/part_86_text_document" \
        0.6414 "${DATA_ROOT}/arxiv/007_00000_text_document" \
        0.4696 "${DATA_ROOT}/books/016_00007_text_document" \
        1.0102 "${DATA_ROOT}/code-high/part_13_text_document" \
        0.3755 "${DATA_ROOT}/cot_synthesis2_CC-high/23_text_document" \
        0.4598 "${DATA_ROOT}/cot_synthesis2_code-high/4_text_document" \
        1.3135 "${DATA_ROOT}/cot_synthesis2_math-high/12_text_document" \
        0.3536 "${DATA_ROOT}/cot_synthesis2_math-mid/5_text_document" \
        0.6314 "${DATA_ROOT}/cot_synthesis2_wiki-high/5_text_document" \
        0.5074 "${DATA_ROOT}/cot_synthesis_math-high/11_text_document" \
        0.6406 "${DATA_ROOT}/cot_synthesis_math-mid/29_text_document" \
        1.8165 "${DATA_ROOT}/math-high/part_04_text_document" \
        1.6311 "${DATA_ROOT}/math-mid/part_07_text_document" \
        0.4202 "${DATA_ROOT}/wiki/012_00000_text_document" \
        1.8171 "${DATA_ROOT}/zh_cc-high-loss0/part_28_text_document" \
    --split 998,1,1 \
    --no-mmap-bin-files \
    --tokenizer-type QwenTokenizerFS --tokenizer-path ../hf_openseek/tokenizer \
    --vocab-size 151851 --make-vocab-size-divisible-by 64