diff --git a/openseek/competition/pz/jouw/final-round/openseek-final-round-tech-report-jouw.pdf b/openseek/competition/pz/jouw/final-round/openseek-final-round-tech-report-jouw.pdf new file mode 100644 index 0000000..a3839ba Binary files /dev/null and b/openseek/competition/pz/jouw/final-round/openseek-final-round-tech-report-jouw.pdf differ diff --git a/openseek/competition/pz/jouw/final-round/train-grpo-v4.sh b/openseek/competition/pz/jouw/final-round/train-grpo-v4.sh new file mode 100644 index 0000000..5d12c10 --- /dev/null +++ b/openseek/competition/pz/jouw/final-round/train-grpo-v4.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +HOME="/root/workspace" +MODEL_PATH="/root/workspace/OpenSeek-Small-v1-SFT" + +PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training" +RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning_v4" + +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export HYDRA_FULL_ERROR=1 + +export WANDB_API_KEY={WANDB_API_KEY} # need to config +export WANDB_MODE=online +export WANDB__DISABLE_IPV6=true +export WANDB_START_METHOD=thread +export WANDB_SILENT=false +export WANDB__SERVICE_WAIT=300 + +PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="$HOME/data/math_reasoning_refine/train.parquet" \ + data.val_files="$HOME/data/math_reasoning_refine/test.parquet" \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.trust_remote_code=True \ + data.shuffle=True \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.actor.optim.lr=5e-7 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=0 \ + actor_rollout_ref.actor.optim.warmup_style="cosine" \ + actor_rollout_ref.actor.optim.min_lr_ratio=0.0 \ + actor_rollout_ref.actor.optim.num_cycles=0.5 \ + actor_rollout_ref.actor.optim.weight_decay=0.01 \ + actor_rollout_ref.actor.use_kl_loss=True \ + 
actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path=/root/workspace/OpenSeek-Small-v1-SFT \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.model.trust_remote_code=True \ + critic.strategy=fsdp2 \ + critic.optim.lr=1e-6 \ + critic.model.path=$MODEL_PATH \ + critic.model.trust_remote_code=True \ + critic.ppo_micro_batch_size_per_gpu=16 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$RUN_NAME \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=4 \ + trainer.nnodes=1 \ + trainer.save_freq=250 \ + trainer.test_freq=25 \ + trainer.total_epochs=10 diff --git a/openseek/competition/pz/jouw/final-round/train-grpo.sh b/openseek/competition/pz/jouw/final-round/train-grpo.sh new file mode 100644 index 0000000..e0edca0 --- /dev/null +++ b/openseek/competition/pz/jouw/final-round/train-grpo.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +HOME="/root/workspace" +MODEL_PATH="/root/workspace/OpenSeek-Small-v1-SFT" + +PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training" +RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning" + 
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HYDRA_FULL_ERROR=1 + +export WANDB_API_KEY={WANDB_API_KEY} # need to config +export WANDB_MODE=online +export WANDB__DISABLE_IPV6=true +export WANDB_START_METHOD=thread +export WANDB_SILENT=false +export WANDB__SERVICE_WAIT=300 + +PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="$HOME/data/math_reasoning/train.parquet" \ + data.val_files="$HOME/data/math_reasoning/test.parquet" \ + data.train_batch_size=256 \ + data.max_prompt_length=2048 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.trust_remote_code=True \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=20 \ + actor_rollout_ref.actor.optim.warmup_style="cosine" \ + actor_rollout_ref.actor.optim.min_lr_ratio=0.1 \ + actor_rollout_ref.actor.optim.num_cycles=0.5 \ + actor_rollout_ref.actor.optim.weight_decay=0.01 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path=/root/workspace/OpenSeek-Small-v1-SFT \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.trust_remote_code=True \ + critic.strategy=fsdp2 \ + critic.optim.lr=1e-6 \ + critic.model.path=$MODEL_PATH \ + critic.model.trust_remote_code=True \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$RUN_NAME \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=500 \ + trainer.test_freq=100 \ + trainer.total_epochs=10 diff --git a/openseek/competition/pz/jouw/model-and-data.txt b/openseek/competition/pz/jouw/model-and-data.txt new file mode 100644 index 0000000..5c6037b --- /dev/null +++ b/openseek/competition/pz/jouw/model-and-data.txt @@ -0,0 +1,3 @@ +The model and training data: + +https://www.modelscope.cn/ocean2023/openseek-competition.git diff --git a/openseek/competition/pz/jouw/preliminary-round/openseek-preround-tech-report-jouw.pdf b/openseek/competition/pz/jouw/preliminary-round/openseek-preround-tech-report-jouw.pdf new file mode 100644 index 0000000..62903d4 Binary files /dev/null and b/openseek/competition/pz/jouw/preliminary-round/openseek-preround-tech-report-jouw.pdf differ diff --git a/openseek/competition/pz/jouw/preliminary-round/submit-training-job.sh b/openseek/competition/pz/jouw/preliminary-round/submit-training-job.sh new file mode 100644 index 0000000..da59d24 --- /dev/null +++ b/openseek/competition/pz/jouw/preliminary-round/submit-training-job.sh @@ -0,0 +1,101 @@ +#!/bin/bash +#SBATCH --gpus=4 +#SBATCH -x paraai-n32-h-01-agent-[1,4,8,16,17,25,27,28,29,30,31] +module load compilers/cuda/12.4 +module load cudnn/8.9.5.29_cuda12.x +module load compilers/gcc/12.2.0 +module load cmake/3.26.3 +module load miniforge3/24.1 + +WANDB_RUN="OpenSeek-Small-v1_tokens-15B-math-exp2" 
+LOG_DIR="/home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/details/host_0_localhost/20250814_2024" +SAVE_STEPS=720 +EVAL_STEPS=100 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +export NCCL_DEBUG=INFO +export NCCL_SOCKET_IFNAME=^docker0,lo +export PYTHONUNBUFFERED=1 +export OMP_NUM_THREADS=1 + +ulimit -n 1048576 + +source activate flagscale-train + +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/pids +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/details +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/tensorboard +mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/wandb + +cd /home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale + +export PYTHONPATH=/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale/third_party/Megatron-LM:/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale:${PYTHONPATH} + +export http_proxy=http://{PROXY_USER}:{PROXY_PASSWORD}@172.16.4.9:3128 # need to config +export https_proxy=http://{PROXY_USER}:{PROXY_PASSWORD}@172.16.4.9:3128 # need to config + +export WANDB_API_KEY={WANDB_API_KEY} # need to config +export WANDB_MODE=online +export WANDB__SERVICE_WAIT=300 +export WANDB_SILENT=false +export WANDB_START_METHOD=thread +export WANDB_GROUP=$WANDB_RUN + +VISIBLE_DEVICES=0,1,2,3 DEVICE_MAX_CONNECTIONS=4 torchrun \ + --rdzv_backend static \ + --nnodes 1 \ + --nproc_per_node 4 \ + --rdzv_id default \ + --node_rank 0 \ + --rdzv_endpoint localhost:59249 \ + --log_dir $LOG_DIR \ + --redirects 3 \ + --tee 3 \ + flagscale/train/train_gpt.py \ + --no-load-optim --no-load-rng \ + --recompute-method uniform --recompute-granularity full 
--recompute-num-layers 6 \ + --moe-router-dtype fp32 --num-workers 1 \ + --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --expert-model-parallel-size 1 --context-parallel-size 1 \ + --disable-bias-linear --reset-position-ids --reset-attention-mask \ + --qk-layernorm --sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather \ + --bf16 --attention-softmax-in-fp32 --accumulate-allreduce-grads-in-fp32 \ + --log-interval 1 --tensorboard-log-interval 1 \ + --wandb-mode online \ + --wandb-api-key {WANDB_API_KEY} \ + --wandb-project OpenSeek-Small-v1 \ + --wandb-exp-name $WANDB_RUN \ + --log-timers-to-tensorboard --log-validation-ppl-to-tensorboard \ + --log-throughput --log-params-norm --log-num-zeros-in-grad --log-memory-to-tensorboard \ + --tensorboard-dir /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/tensorboard \ + --wandb-save-dir /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/wandb \ + --save-interval $SAVE_STEPS \ + --load /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints \ + --ckpt-format torch \ + --save /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints \ + --transformer-impl transformer_engine --num-layers 6 --hidden-size 1280 \ + --num-attention-heads 10 --num-query-groups 10 --seq-length 4096 --max-position-embeddings 4096 --norm-epsilon 1e-06 \ + --use-rotary-position-embeddings --rotary-base 1000000 \ + --swiglu --normalization RMSNorm --init-method-std 0.006 --attention-dropout 0.0 --hidden-dropout 0.0 --clip-grad 1.0 \ + --position-embedding-type rope --no-position-embedding --no-rope-fusion --multi-latent-attention \ + --kv-lora-rank 512 --qk-head-dim 128 --qk-pos-emb-head-dim 64 --v-head-dim 128 --ffn-hidden-size 7168 \ + --moe-ffn-hidden-size 896 --moe-grouped-gemm --moe-shared-expert-intermediate-size 1792 \ + --num-experts 64 
--moe-router-load-balancing-type seq_aux_loss --moe-router-score-function sigmoid \ + --moe-router-enable-expert-bias --moe-router-bias-update-rate 0.001 --moe-aux-loss-coeff 0.0001 \ + --moe-layer-freq '[0]+[1]*5' --moe-router-num-groups 1 --moe-router-group-topk 1 --moe-router-topk 6 \ + --moe-router-topk-scaling-factor 2.446 --moe-token-dispatcher-type alltoall --seed 42 \ + --micro-batch-size 2 --global-batch-size 1024 \ + --eval-iters 4 --eval-interval $EVAL_STEPS \ + --train-iters 3620 \ + --weight-decay 0.01 \ + --adam-beta1 0.9 --adam-beta2 0.95 \ + --lr 0.00001 --min-lr 0.0000001 \ + --lr-warmup-iters 50 --lr-warmup-samples 0 --lr-decay-style cosine \ + --data-path 1.1068 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-actual-actual-high/part_142_text_document 0.5397 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-diverse_qa_pairs-high/part_244_text_document 0.4616 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-extract_knowledge-high/part_498_text_document 0.261 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-knowledge_list-high/part_86_text_document 0.6414 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/arxiv/007_00000_text_document 0.4696 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/books/016_00007_text_document 1.0102 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/code-high/part_13_text_document 0.3755 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_CC-high/23_text_document 0.4598 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_code-high/4_text_document 1.3135 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_math-high/12_text_document 0.3536 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_math-mid/5_text_document 0.6314 
/home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_wiki-high/5_text_document 0.5074 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis_math-high/11_text_document 0.6406 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis_math-mid/29_text_document 1.8165 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/math-high/part_04_text_document 1.6311 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/math-mid/part_07_text_document 0.4202 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/wiki/012_00000_text_document 1.8171 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/zh_cc-high-loss0/part_28_text_document \ + --split 998,1,1 \ + --no-mmap-bin-files \ + --tokenizer-type QwenTokenizerFS --tokenizer-path ../hf_openseek/tokenizer \ + --vocab-size 151851 --make-vocab-size-divisible-by 64