#!/usr/bin/env bash
#
# On-policy distillation example: distill a Qwen2.5 student from a Qwen2.5
# teacher on GSM8K, targeting Ascend NPU hosts.
#
# Expects under $DATA_PATH (default: $PWD/../verlData):
#   gsm8k/train.parquet, gsm8k/test.parquet
#   weights/<FAMILY>/<STUDENT_MODEL> and weights/<FAMILY>/<TEACHER_MODEL>

# Guard the environment setup explicitly: these lines run before strict mode
# is enabled below, so without the guards a missing Ascend install would be
# silently ignored and the job would fail much later in a confusing way.
source /usr/local/Ascend/ascend-toolkit/set_env.sh \
  || { echo "ERROR: cannot source Ascend toolkit set_env.sh" >&2; exit 1; }
source /usr/local/Ascend/nnal/atb/set_env.sh \
  || { echo "ERROR: cannot source Ascend ATB set_env.sh" >&2; exit 1; }
export NCCL_P2P_DISABLE=1
export CUDA_DEVICE_ORDER=PCI_BUS_ID

# Device visibility: honor values already exported by the caller.
export ASCEND_RT_VISIBLE_DEVICES="${ASCEND_RT_VISIBLE_DEVICES:-0,1,2,3,4,5}"
export NPU_VISIBLE_DEVICES="${NPU_VISIBLE_DEVICES:-$ASCEND_RT_VISIBLE_DEVICES}"

# vLLM on some NPU images requires V1 engine flag.
export VLLM_USE_V1="${VLLM_USE_V1:-1}"

# Shared data/cache root; quoted so paths containing spaces survive.
export DATA_PATH="$PWD/../verlData"
export HF_HOME="$DATA_PATH"
export VLLM_CACHE_DIR="$DATA_PATH/vllm_cache"
mkdir -p "$DATA_PATH/gsm8k" "$VLLM_CACHE_DIR" \
  || { echo "ERROR: cannot create data directories under $DATA_PATH" >&2; exit 1; }

# Strict mode + command tracing for everything that follows.
set -xeuo pipefail

############################ Quick Config ############################

ROLLOUT_NAME="vllm" # sglang or vllm

FAMILY="Qwen"
STUDENT_MODEL=Qwen2.5-0.5B
TEACHER_MODEL=Qwen2.5-3B-Instruct

# Alternative preset: pure distillation without the policy-gradient term.
# USE_POLICY_GRADIENT=False
# DISTILLATION_LOSS_MODE="k3"
# DISTILLATION_LOSS_MODE="forward_kl_topk"
# USE_FUSED_KERNELS=False

USE_POLICY_GRADIENT=True
DISTILLATION_LOSS_MODE="k1"
USE_FUSED_KERNELS=True

# Clamps guarding the distillation loss against numerical blow-ups.
DISTILLATION_LOSS_MAX_CLAMP=10.0
DISTILLATION_LOG_PROB_MIN_CLAMP=-10.0

PROJECT_NAME='verl_on_policy_distillation_example_gsm8k'
# Experiment name encodes every knob above so runs are self-describing.
EXP_NAME="${FAMILY}/student-${STUDENT_MODEL}/teacher-${TEACHER_MODEL}/loss-${DISTILLATION_LOSS_MODE}-pg-${USE_POLICY_GRADIENT}-maxclamp-${DISTILLATION_LOSS_MAX_CLAMP}-logprobminclamp-${DISTILLATION_LOG_PROB_MIN_CLAMP}-fused_kernels-${USE_FUSED_KERNELS}"

MAX_PROMPT=256
MAX_RESPONSE_LENGTH=512
# Prompt + response + 1 token of headroom for the inference engine.
MAX_NUM_TOKENS=$(( MAX_PROMPT + MAX_RESPONSE_LENGTH + 1 ))
TRAIN_PROMPT_BSZ=128
STUDENT_MICRO_BATCH_SIZE_PER_GPU=2
STUDENT_MAX_TOKEN_LEN_PER_GPU=$(( STUDENT_MICRO_BATCH_SIZE_PER_GPU * (MAX_PROMPT + MAX_RESPONSE_LENGTH) ))
USE_DYNAMIC_BSZ=False
STUDENT_WORLD_SIZE=4

# Teacher placement: enable_resource_pool=False colocates the teacher with
# the student workers; the dedicated pool size below applies only when a
# separate pool is enabled.
TEACHER_RESOURCE_POOL=False
TEACHER_WORLD_SIZE=2

ENFORCE_EAGER=True

############################ Paths ############################

gsm8k_train_path="$DATA_PATH/gsm8k/train.parquet"
gsm8k_test_path="$DATA_PATH/gsm8k/test.parquet"

# Hydra expects Python-style list literals inside a single shell word.
TRAIN_FILES="['$gsm8k_train_path']"
TEST_FILES="['$gsm8k_test_path']"

############################ Parameter Groups ############################

# Every override value below is quoted so paths/names containing spaces or
# shell metacharacters reach Hydra as a single argument (SC2086).

DATA=(
  data.train_files="$TRAIN_FILES"
  data.val_files="$TEST_FILES"
  data.max_prompt_length="$MAX_PROMPT"
  data.max_response_length="$MAX_RESPONSE_LENGTH"
  data.train_batch_size="$TRAIN_PROMPT_BSZ"
  data.filter_overlong_prompts=True
  data.truncation='error'
  data.shuffle=False
)

MODEL=(
  actor_rollout_ref.model.path="${DATA_PATH}/weights/${FAMILY}/${STUDENT_MODEL}"
  actor_rollout_ref.model.use_fused_kernels="$USE_FUSED_KERNELS"
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.model.use_remove_padding=True
  actor_rollout_ref.actor.use_torch_compile=False
  actor_rollout_ref.rollout.enforce_eager="$ENFORCE_EAGER"
)

DISTILLATION=(
  distillation.enabled=True
  distillation.num_workers=8
  distillation.teacher_model.enable_resource_pool="$TEACHER_RESOURCE_POOL"
  distillation.teacher_model.n_gpus_per_node="$TEACHER_WORLD_SIZE"
  distillation.teacher_model.nnodes=1
  distillation.teacher_model.model_path="${DATA_PATH}/weights/${FAMILY}/${TEACHER_MODEL}"
  distillation.teacher_model.inference.tensor_model_parallel_size=1
  distillation.teacher_model.inference.name="$ROLLOUT_NAME"
  distillation.teacher_model.inference.gpu_memory_utilization=0.3
  distillation.teacher_model.inference.enforce_eager="$ENFORCE_EAGER"
  distillation.teacher_model.inference.max_model_len="$MAX_NUM_TOKENS"
  distillation.teacher_model.inference.max_num_batched_tokens="$MAX_NUM_TOKENS"
  # NOTE(review): max_num_seqs is set to the token budget rather than a
  # sequence count — mirrors max_num_batched_tokens; confirm this is intended.
  distillation.teacher_model.inference.max_num_seqs="$MAX_NUM_TOKENS"
  distillation.distillation_loss.loss_mode="$DISTILLATION_LOSS_MODE"
  # topk is only consumed by the *_topk loss modes.
  distillation.distillation_loss.topk=64
  distillation.distillation_loss.use_task_rewards=False
  distillation.distillation_loss.use_policy_gradient="$USE_POLICY_GRADIENT"
  distillation.distillation_loss.loss_max_clamp="$DISTILLATION_LOSS_MAX_CLAMP"
  distillation.distillation_loss.log_prob_min_clamp="$DISTILLATION_LOG_PROB_MIN_CLAMP"
)

STUDENT=(
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.ppo_mini_batch_size="$TRAIN_PROMPT_BSZ"
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu="$STUDENT_MICRO_BATCH_SIZE_PER_GPU"
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu="$STUDENT_MAX_TOKEN_LEN_PER_GPU"
  actor_rollout_ref.actor.use_dynamic_bsz="$USE_DYNAMIC_BSZ"
  actor_rollout_ref.actor.fsdp_config.param_offload=True
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
  actor_rollout_ref.actor.ulysses_sequence_parallel_size=1
)

ROLLOUT=(
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu="$STUDENT_MICRO_BATCH_SIZE_PER_GPU"
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu="$STUDENT_MAX_TOKEN_LEN_PER_GPU"
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz="$USE_DYNAMIC_BSZ"
  actor_rollout_ref.rollout.tensor_model_parallel_size=1
  actor_rollout_ref.rollout.name="$ROLLOUT_NAME"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.3
  actor_rollout_ref.rollout.calculate_log_probs=False
  actor_rollout_ref.rollout.max_model_len="$MAX_NUM_TOKENS"
  actor_rollout_ref.rollout.max_num_batched_tokens="$MAX_NUM_TOKENS"
  actor_rollout_ref.rollout.max_num_seqs="$MAX_NUM_TOKENS"
  actor_rollout_ref.rollout.n=1
)

ALGORITHM=(
  algorithm.adv_estimator=grpo
  algorithm.use_kl_in_reward=False
)

TRAINER=(
  trainer.logger='["console"]'
  trainer.project_name="$PROJECT_NAME"
  trainer.experiment_name="$EXP_NAME"
  trainer.n_gpus_per_node="$STUDENT_WORLD_SIZE"
  trainer.nnodes=1
  trainer.save_freq=200
  trainer.test_freq=5
  trainer.total_epochs=15
  trainer.val_before_train=True
  trainer.use_legacy_worker_impl=disable
  trainer.resume_mode=disable
  trainer.log_val_generations=5
)

############################ Launch ############################

# Extra CLI overrides given to this script are forwarded verbatim via "$@".
python3 -m verl.trainer.main_ppo \
  --config-path=config \
  --config-name='ppo_trainer.yaml' \
  "${DATA[@]}" \
  "${ALGORITHM[@]}" \
  "${MODEL[@]}" \
  "${DISTILLATION[@]}" \
  "${ROLLOUT[@]}" \
  "${STUDENT[@]}" \
  "${TRAINER[@]}" \
  "$@"