-
Notifications
You must be signed in to change notification settings - Fork 45
[PZ COMPETITION] Competition jouw #171
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| #!/bin/bash | ||
|
|
||
| HOME="/root/workspace" | ||
| MODEL_PATH="/root/workspace/OpenSeek-Small-v1-SFT" | ||
|
|
||
| PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training" | ||
| RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning_v4" | ||
|
|
||
| export CUDA_VISIBLE_DEVICES=0,1,2,3 | ||
| export HYDRA_FULL_ERROR=1 | ||
|
|
||
| export WANDB_API_KEY={WANDB_API_KEY} # need to config | ||
| export WANDB_MODE=online | ||
| export WANDB__DISABLE_IPV6=true | ||
| export WANDB_START_METHOD=thread | ||
| export WANDB_SILENT=false | ||
| export WANDB__SERVICE_WAIT=300 | ||
|
|
||
| PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ | ||
| algorithm.adv_estimator=grpo \ | ||
| data.train_files="$HOME/data/math_reasoning_refine/train.parquet" \ | ||
| data.val_files="$HOME/data/math_reasoning_refine/test.parquet" \ | ||
| data.train_batch_size=256 \ | ||
| data.max_prompt_length=1024 \ | ||
| data.max_response_length=512 \ | ||
| data.filter_overlong_prompts=True \ | ||
| data.truncation='error' \ | ||
| data.trust_remote_code=True \ | ||
| data.shuffle=True \ | ||
| actor_rollout_ref.model.path=$MODEL_PATH \ | ||
| actor_rollout_ref.actor.strategy=fsdp2 \ | ||
| actor_rollout_ref.actor.optim.lr=5e-7 \ | ||
| actor_rollout_ref.actor.optim.lr_warmup_steps=0 \ | ||
| actor_rollout_ref.actor.optim.warmup_style="cosine" \ | ||
| actor_rollout_ref.actor.optim.min_lr_ratio=0.0 \ | ||
| actor_rollout_ref.actor.optim.num_cycles=0.5 \ | ||
| actor_rollout_ref.actor.optim.weight_decay=0.01 \ | ||
| actor_rollout_ref.actor.use_kl_loss=True \ | ||
| actor_rollout_ref.actor.kl_loss_coef=0.001 \ | ||
| actor_rollout_ref.actor.kl_loss_type=low_var_kl \ | ||
| actor_rollout_ref.actor.entropy_coeff=0 \ | ||
| actor_rollout_ref.actor.ppo_mini_batch_size=64 \ | ||
| actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.rollout.name=sglang \ | ||
| actor_rollout_ref.rollout.n=5 \ | ||
| actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path=/root/workspace/OpenSeek-Small-v1-SFT \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \ | ||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \ | ||
| actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ | ||
| actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ | ||
| actor_rollout_ref.model.trust_remote_code=True \ | ||
| critic.strategy=fsdp2 \ | ||
| critic.optim.lr=1e-6 \ | ||
| critic.model.path=$MODEL_PATH \ | ||
| critic.model.trust_remote_code=True \ | ||
| critic.ppo_micro_batch_size_per_gpu=16 \ | ||
| algorithm.use_kl_in_reward=False \ | ||
| trainer.critic_warmup=0 \ | ||
| trainer.logger='["console","wandb"]' \ | ||
| trainer.project_name=$PROJECT_NAME \ | ||
| trainer.experiment_name=$RUN_NAME \ | ||
| trainer.val_before_train=True \ | ||
| trainer.n_gpus_per_node=4 \ | ||
| trainer.nnodes=1 \ | ||
| trainer.save_freq=250 \ | ||
| trainer.test_freq=25 \ | ||
| trainer.total_epochs=10 | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,69 @@ | ||||||
| #!/bin/bash | ||||||
|
|
||||||
| HOME="/root/workspace" | ||||||
| MODEL_PATH="/root/workspace/OpenSeek-Small-v1-SFT" | ||||||
|
|
||||||
| PROJECT_NAME="OpenSeek-Small-v1-SFT_RL_Training" | ||||||
| RUN_NAME="OpenSeek-Small-v1-SFT_math_reasoning" | ||||||
|
|
||||||
| export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||||||
| export HYDRA_FULL_ERROR=1 | ||||||
|
|
||||||
| export WANDB_API_KEY={WANDB_API_KEY} # need to config | ||||||
| export WANDB_MODE=online | ||||||
| export WANDB__DISABLE_IPV6=true | ||||||
| export WANDB_START_METHOD=thread | ||||||
| export WANDB_SILENT=false | ||||||
| export WANDB__SERVICE_WAIT=300 | ||||||
|
|
||||||
| PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ | ||||||
| algorithm.adv_estimator=grpo \ | ||||||
| data.train_files="$HOME/data/math_reasoning/train.parquet" \ | ||||||
| data.val_files="$HOME/data/math_reasoning/test.parquet" \ | ||||||
| data.train_batch_size=256 \ | ||||||
| data.max_prompt_length=2048 \ | ||||||
| data.max_response_length=2048 \ | ||||||
| data.filter_overlong_prompts=True \ | ||||||
| data.truncation='error' \ | ||||||
| data.trust_remote_code=True \ | ||||||
| actor_rollout_ref.model.path=$MODEL_PATH \ | ||||||
| actor_rollout_ref.actor.strategy=fsdp2 \ | ||||||
| actor_rollout_ref.actor.optim.lr=1e-6 \ | ||||||
| actor_rollout_ref.actor.optim.lr_warmup_steps=20 \ | ||||||
| actor_rollout_ref.actor.optim.warmup_style="cosine" \ | ||||||
| actor_rollout_ref.actor.optim.min_lr_ratio=0.1 \ | ||||||
| actor_rollout_ref.actor.optim.num_cycles=0.5 \ | ||||||
| actor_rollout_ref.actor.optim.weight_decay=0.01 \ | ||||||
| actor_rollout_ref.actor.use_kl_loss=True \ | ||||||
| actor_rollout_ref.actor.kl_loss_coef=0.001 \ | ||||||
| actor_rollout_ref.actor.kl_loss_type=low_var_kl \ | ||||||
| actor_rollout_ref.actor.entropy_coeff=0 \ | ||||||
| actor_rollout_ref.actor.ppo_mini_batch_size=64 \ | ||||||
| actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ | ||||||
| actor_rollout_ref.rollout.name=sglang \ | ||||||
| actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ | ||||||
| actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ | ||||||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=flashinfer \ | ||||||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.use_tiktoken=false \ | ||||||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_path=/root/workspace/OpenSeek-Small-v1-SFT \ | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The tokenizer path is hardcoded. It's better to use the MODEL_PATH variable defined at the top of the script for consistency and easier maintenance.
Suggested change
|
||||||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.trust_remote_code=true \ | ||||||
| +actor_rollout_ref.rollout.engine_kwargs.sglang.tokenizer_mode=hf \ | ||||||
| actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ | ||||||
| actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ | ||||||
| actor_rollout_ref.model.trust_remote_code=True \ | ||||||
| critic.strategy=fsdp2 \ | ||||||
| critic.optim.lr=1e-6 \ | ||||||
| critic.model.path=$MODEL_PATH \ | ||||||
| critic.model.trust_remote_code=True \ | ||||||
| critic.ppo_micro_batch_size_per_gpu=4 \ | ||||||
| algorithm.use_kl_in_reward=False \ | ||||||
| trainer.critic_warmup=0 \ | ||||||
| trainer.logger='["console","wandb"]' \ | ||||||
| trainer.project_name=$PROJECT_NAME \ | ||||||
| trainer.experiment_name=$RUN_NAME \ | ||||||
| trainer.val_before_train=True \ | ||||||
| trainer.n_gpus_per_node=8 \ | ||||||
| trainer.nnodes=1 \ | ||||||
| trainer.save_freq=500 \ | ||||||
| trainer.test_freq=100 \ | ||||||
| trainer.total_epochs=10 | ||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,101 @@ | ||||||||||||||||||||||||||||||
| #!/bin/bash | ||||||||||||||||||||||||||||||
| #SBATCH --gpus=4 | ||||||||||||||||||||||||||||||
| #SBATCH -x paraai-n32-h-01-agent-[1,4,8,16,17,25,27,28,29,30,31] | ||||||||||||||||||||||||||||||
| module load compilers/cuda/12.4 | ||||||||||||||||||||||||||||||
| module load cudnn/8.9.5.29_cuda12.x | ||||||||||||||||||||||||||||||
| module load compilers/gcc/12.2.0 | ||||||||||||||||||||||||||||||
| module load cmake/3.26.3 | ||||||||||||||||||||||||||||||
| module load miniforge3/24.1 | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| WANDB_RUN="OpenSeek-Small-v1_tokens-15B-math-exp2" | ||||||||||||||||||||||||||||||
| LOG_DIR="/home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/details/host_0_localhost/20250814_2024" | ||||||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Suggested change
|
||||||||||||||||||||||||||||||
| SAVE_STEPS=720 | ||||||||||||||||||||||||||||||
| EVAL_STEPS=100 | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| export CUDA_VISIBLE_DEVICES=0,1,2,3 | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| export NCCL_DEBUG=INFO | ||||||||||||||||||||||||||||||
| export NCCL_SOCKET_IFNAME=^docker0,lo | ||||||||||||||||||||||||||||||
| export PYTHONUNBUFFERED=1 | ||||||||||||||||||||||||||||||
| export OMP_NUM_THREADS=1 | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| ulimit -n 1048576 | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| source activate flagscale-train | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/pids | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/logs/details | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/tensorboard | ||||||||||||||||||||||||||||||
| mkdir -p /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/wandb | ||||||||||||||||||||||||||||||
|
Comment on lines
+26
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This block has two issues:
It's recommended to use a variable for the base path (e.g.,
Suggested change
|
||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| cd /home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| export PYTHONPATH=/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale/third_party/Megatron-LM:/home/bingxing2/home/scx7353/workspace/OpenSeek/FlagScale:${PYTHONPATH} | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| export http_proxy=http://{PROXY_USER}:{PROXY_PASSWORD}@{PROXY_HOST}:3128 # need to config; never commit real credentials | ||||||
| export https_proxy=http://{PROXY_USER}:{PROXY_PASSWORD}@{PROXY_HOST}:3128 # need to config; never commit real credentials | ||||||
|
Comment on lines
+38
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hardcoded credentials in the proxy URLs are a major security risk. These should be removed from the script and managed securely, for example, by loading them from environment variables which are set outside of version control.
Suggested change
|
||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| export WANDB_API_KEY={WANDB_API_KEY} # need to config | ||||||||||||||||||||||||||||||
| export WANDB_MODE=online | ||||||||||||||||||||||||||||||
| export WANDB__SERVICE_WAIT=300 | ||||||||||||||||||||||||||||||
| export WANDB_SILENT=false | ||||||||||||||||||||||||||||||
| export WANDB_START_METHOD=thread | ||||||||||||||||||||||||||||||
| export WANDB_GROUP=$WANDB_RUN | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| VISIBLE_DEVICES=0,1,2,3 DEVICE_MAX_CONNECTIONS=4 torchrun \ | ||||||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The variable VISIBLE_DEVICES appears to be a typo for CUDA_VISIBLE_DEVICES (which is already exported above) — presumably it has no effect as written; confirm the intended behavior.
Suggested change
|
||||||||||||||||||||||||||||||
| --rdzv_backend static \ | ||||||||||||||||||||||||||||||
| --nnodes 1 \ | ||||||||||||||||||||||||||||||
| --nproc_per_node 4 \ | ||||||||||||||||||||||||||||||
| --rdzv_id default \ | ||||||||||||||||||||||||||||||
| --node_rank 0 \ | ||||||||||||||||||||||||||||||
| --rdzv_endpoint localhost:59249 \ | ||||||||||||||||||||||||||||||
| --log_dir $LOG_DIR \ | ||||||||||||||||||||||||||||||
| --redirects 3 \ | ||||||||||||||||||||||||||||||
| --tee 3 \ | ||||||||||||||||||||||||||||||
| flagscale/train/train_gpt.py \ | ||||||||||||||||||||||||||||||
| --no-load-optim --no-load-rng \ | ||||||||||||||||||||||||||||||
| --recompute-method uniform --recompute-granularity full --recompute-num-layers 6 \ | ||||||||||||||||||||||||||||||
| --moe-router-dtype fp32 --num-workers 1 \ | ||||||||||||||||||||||||||||||
| --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --expert-model-parallel-size 1 --context-parallel-size 1 \ | ||||||||||||||||||||||||||||||
| --disable-bias-linear --reset-position-ids --reset-attention-mask \ | ||||||||||||||||||||||||||||||
| --qk-layernorm --sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather \ | ||||||||||||||||||||||||||||||
| --bf16 --attention-softmax-in-fp32 --accumulate-allreduce-grads-in-fp32 \ | ||||||||||||||||||||||||||||||
| --log-interval 1 --tensorboard-log-interval 1 \ | ||||||||||||||||||||||||||||||
| --wandb-mode online \ | ||||||||||||||||||||||||||||||
| --wandb-api-key {WANDB_API_KEY} \ # need to config; do not hardcode the real key | ||||||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A hardcoded
Suggested change
|
||||||||||||||||||||||||||||||
| --wandb-project OpenSeek-Small-v1 \ | ||||||||||||||||||||||||||||||
| --wandb-exp-name $WANDB_RUN \ | ||||||||||||||||||||||||||||||
| --log-timers-to-tensorboard --log-validation-ppl-to-tensorboard \ | ||||||||||||||||||||||||||||||
| --log-throughput --log-params-norm --log-num-zeros-in-grad --log-memory-to-tensorboard \ | ||||||||||||||||||||||||||||||
| --tensorboard-dir /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/tensorboard \ | ||||||||||||||||||||||||||||||
| --wandb-save-dir /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/wandb \ | ||||||||||||||||||||||||||||||
| --save-interval $SAVE_STEPS \ | ||||||||||||||||||||||||||||||
| --load /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints \ | ||||||||||||||||||||||||||||||
| --ckpt-format torch \ | ||||||||||||||||||||||||||||||
| --save /home/bingxing2/home/scx7353/workspace/OpenSeek/OpenSeek-Small-v1-Baseline/checkpoints \ | ||||||||||||||||||||||||||||||
| --transformer-impl transformer_engine --num-layers 6 --hidden-size 1280 \ | ||||||||||||||||||||||||||||||
| --num-attention-heads 10 --num-query-groups 10 --seq-length 4096 --max-position-embeddings 4096 --norm-epsilon 1e-06 \ | ||||||||||||||||||||||||||||||
| --use-rotary-position-embeddings --rotary-base 1000000 \ | ||||||||||||||||||||||||||||||
| --swiglu --normalization RMSNorm --init-method-std 0.006 --attention-dropout 0.0 --hidden-dropout 0.0 --clip-grad 1.0 \ | ||||||||||||||||||||||||||||||
| --position-embedding-type rope --no-position-embedding --no-rope-fusion --multi-latent-attention \ | ||||||||||||||||||||||||||||||
| --kv-lora-rank 512 --qk-head-dim 128 --qk-pos-emb-head-dim 64 --v-head-dim 128 --ffn-hidden-size 7168 \ | ||||||||||||||||||||||||||||||
| --moe-ffn-hidden-size 896 --moe-grouped-gemm --moe-shared-expert-intermediate-size 1792 \ | ||||||||||||||||||||||||||||||
| --num-experts 64 --moe-router-load-balancing-type seq_aux_loss --moe-router-score-function sigmoid \ | ||||||||||||||||||||||||||||||
| --moe-router-enable-expert-bias --moe-router-bias-update-rate 0.001 --moe-aux-loss-coeff 0.0001 \ | ||||||||||||||||||||||||||||||
| --moe-layer-freq '[0]+[1]*5' --moe-router-num-groups 1 --moe-router-group-topk 1 --moe-router-topk 6 \ | ||||||||||||||||||||||||||||||
| --moe-router-topk-scaling-factor 2.446 --moe-token-dispatcher-type alltoall --seed 42 \ | ||||||||||||||||||||||||||||||
| --micro-batch-size 2 --global-batch-size 1024 \ | ||||||||||||||||||||||||||||||
| --eval-iters 4 --eval-interval $EVAL_STEPS \ | ||||||||||||||||||||||||||||||
| --train-iters 3620 \ | ||||||||||||||||||||||||||||||
| --weight-decay 0.01 \ | ||||||||||||||||||||||||||||||
| --adam-beta1 0.9 --adam-beta2 0.95 \ | ||||||||||||||||||||||||||||||
| --lr 0.00001 --min-lr 0.0000001 \ | ||||||||||||||||||||||||||||||
| --lr-warmup-iters 50 --lr-warmup-samples 0 --lr-decay-style cosine \ | ||||||||||||||||||||||||||||||
| --data-path 1.1068 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-actual-actual-high/part_142_text_document 0.5397 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-diverse_qa_pairs-high/part_244_text_document 0.4616 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-extract_knowledge-high/part_498_text_document 0.261 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/Nemotron-CC-high-synthetic-knowledge_list-high/part_86_text_document 0.6414 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/arxiv/007_00000_text_document 0.4696 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/books/016_00007_text_document 1.0102 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/code-high/part_13_text_document 0.3755 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_CC-high/23_text_document 0.4598 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_code-high/4_text_document 1.3135 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_math-high/12_text_document 0.3536 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_math-mid/5_text_document 0.6314 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis2_wiki-high/5_text_document 0.5074 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis_math-high/11_text_document 0.6406 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/cot_synthesis_math-mid/29_text_document 1.8165 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/math-high/part_04_text_document 1.6311 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/math-mid/part_07_text_document 0.4202 /home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/wiki/012_00000_text_document 1.8171 
/home/bingxing2/home/scx7353/workspace/OpenSeek-Pretrain-100B/zh_cc-high-loss0/part_28_text_document \ | ||||||||||||||||||||||||||||||
| --split 998,1,1 \ | ||||||||||||||||||||||||||||||
| --no-mmap-bin-files \ | ||||||||||||||||||||||||||||||
| --tokenizer-type QwenTokenizerFS --tokenizer-path ../hf_openseek/tokenizer \ | ||||||||||||||||||||||||||||||
| --vocab-size 151851 --make-vocab-size-divisible-by 64 | ||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The tokenizer path is hardcoded. It's better to use the
MODEL_PATHvariable defined at the top of the script for consistency and easier maintenance.