56 commits
606f7db
Set nccl flags
finbarrtimbers Nov 10, 2025
7319bdc
Changed cluster
finbarrtimbers Nov 10, 2025
96bc3ed
testing with a stego HF checkpoint
finbarrtimbers Nov 10, 2025
130d591
uses olmo3 model
finbarrtimbers Nov 10, 2025
6145d82
uses hf checkpoint
finbarrtimbers Nov 10, 2025
cdac2de
now uses olmo-instruct
finbarrtimbers Nov 10, 2025
11401a8
Updated script
finbarrtimbers Nov 10, 2025
60ed4b2
now uses a new image
finbarrtimbers Nov 10, 2025
906369e
Removed unused flags
finbarrtimbers Nov 10, 2025
f7c2736
Added scott's script
finbarrtimbers Nov 10, 2025
6ad7889
changes to smoke test
finbarrtimbers Nov 10, 2025
f37a6fb
removed nccl flags
finbarrtimbers Nov 10, 2025
d09ce77
Added back nccl flags
finbarrtimbers Nov 10, 2025
7f259d2
Set LR to the one we're about to preempt
finbarrtimbers Nov 10, 2025
e868a51
Now uses 16
finbarrtimbers Nov 10, 2025
1cd94c1
matched scott's config
finbarrtimbers Nov 10, 2025
744c48d
Added a cache and plumbed the deepspeed config through
finbarrtimbers Nov 10, 2025
a6e7fad
set zero_stage
finbarrtimbers Nov 10, 2025
1fa8827
Added flags to both
finbarrtimbers Nov 10, 2025
31d85b7
updated scripts to use new image
finbarrtimbers Nov 10, 2025
d87f7c1
removed problematic logging statements
finbarrtimbers Nov 10, 2025
ad47c7b
Set concatenated forward False
finbarrtimbers Nov 10, 2025
a490a51
updated script
finbarrtimbers Nov 10, 2025
5faba63
fixed script with newline
finbarrtimbers Nov 10, 2025
ae90ada
Added zero hpz partition size
finbarrtimbers Nov 10, 2025
5cfb972
use 8 nodes
finbarrtimbers Nov 10, 2025
cf17aff
update code
finbarrtimbers Nov 10, 2025
e3a6f7c
now uses train numbers in the filename
finbarrtimbers Nov 10, 2025
850b9f3
set concat forward false on scottg script
finbarrtimbers Nov 10, 2025
7788513
cleaned up caching logic
finbarrtimbers Nov 10, 2025
ad239ed
now get_wandb_tags won't fail
finbarrtimbers Nov 10, 2025
21042a7
trying with stage 3
finbarrtimbers Nov 10, 2025
1f08436
now we save per rank caches
finbarrtimbers Nov 10, 2025
f166603
set preemptible flag
finbarrtimbers Nov 10, 2025
eede995
now 16 nodes
finbarrtimbers Nov 10, 2025
ef419d6
add seed to ref logprobs cache file, and fail if using more than one …
scottgeng00 Nov 10, 2025
acd3dcd
launch script on unmerged sft
scottgeng00 Nov 10, 2025
c819dea
vibe coded upload ref cache to gs bucket so that other jobs can pull
scottgeng00 Nov 10, 2025
3b1f449
Revert "vibe coded upload ref cache to gs bucket so that other jobs c…
scottgeng00 Nov 10, 2025
a4536d8
add grad norm logging
saurabh111233212 Nov 11, 2025
415de99
scripts
scottgeng00 Nov 11, 2025
f437ba0
fix grad norm
saurabh111233212 Nov 11, 2025
b2b075f
Merge branch 'finbarr/dpo-faster' of github.com:allenai/open-instruct…
scottgeng00 Nov 11, 2025
705dd2b
add tyler implementation of grad norm
scottgeng00 Nov 11, 2025
cb59e86
just one more grad norm fix bro....
saurabh111233212 Nov 11, 2025
f780eae
more debug scripts
scottgeng00 Nov 11, 2025
c44dcf3
hardcoded eval prio, update later
scottgeng00 Nov 11, 2025
3556184
eval workspace fix
scottgeng00 Nov 11, 2025
368ca70
updated code
finbarrtimbers Nov 11, 2025
8ceaf5f
fix eval launch
scottgeng00 Nov 11, 2025
2834f85
Merge branch 'main' into finbarr/dpo-faster
finbarrtimbers Nov 11, 2025
d5adbba
updated code to run with zero_hpz_partition_size
finbarrtimbers Nov 11, 2025
092ae95
set offload optimizer
finbarrtimbers Nov 11, 2025
ccb9aaa
set partition size 1
finbarrtimbers Nov 11, 2025
ad2a9e6
dpo eval workspace, gpu, prio flags, and update utils
scottgeng00 Nov 12, 2025
daf52ee
Merge branch 'main' into finbarr/dpo-faster
finbarrtimbers Nov 12, 2025
290 changes: 271 additions & 19 deletions open_instruct/dpo_tune_cache.py

Large diffs are not rendered by default.
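
The dpo_tune_cache.py diff is too large to render here, but the commit messages ("Added a cache and plumbed the deepspeed config through", "now we save per rank caches", "add seed to ref logprobs cache file") point at per-rank caching of reference-model log probabilities keyed by the training seed. A minimal sketch of that idea, with hypothetical helper names and cache layout (an illustration inferred from the commit messages, not the actual implementation):

# Illustrative sketch only: function names, cache layout, and key fields are
# assumptions inferred from the commit messages, not from the rendered diff.
import hashlib
import os

import torch


def ref_logprobs_cache_path(cache_dir: str, model_name: str, seed: int, rank: int) -> str:
    """Build a cache path that is unique per reference model, seed, and data-parallel rank."""
    key = hashlib.sha256(f"{model_name}-{seed}".encode()).hexdigest()[:16]
    return os.path.join(cache_dir, f"ref_logprobs_{key}_rank{rank}.pt")


def load_or_compute_ref_logprobs(cache_dir, model_name, seed, rank, compute_fn):
    """Reuse cached reference logprobs for this rank if present, else compute and save them."""
    path = ref_logprobs_cache_path(cache_dir, model_name, seed, rank)
    if os.path.exists(path):
        return torch.load(path)
    logprobs = compute_fn()  # one pass over this rank's data shard with the frozen reference model
    os.makedirs(cache_dir, exist_ok=True)
    torch.save(logprobs, path)
    return logprobs

Caching per rank lets each worker reload only its own shard after a preemption, which matches the --ref_logprobs_cache_dir flag used in the launch scripts below.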

20 changes: 11 additions & 9 deletions open_instruct/utils.py
@@ -664,13 +664,15 @@ def get_wandb_tags() -> list[str]:
     if "GIT_COMMIT" in os.environ:
         git_commit = os.environ["GIT_COMMIT"]
         tags.append(f"commit: {git_commit}")
-        # try finding the pull request number on github
-        prs = requests.get(f"https://api.github.com/search/issues?q=repo:allenai/open-instruct+is:pr+{git_commit}")
-        if prs.status_code == 200:
-            prs = prs.json()
-            if len(prs["items"]):
-                pr = prs["items"][0]
-                tags.append(f"pr: {pr['number']}")
+        try:
+            prs = requests.get(f"https://api.github.com/search/issues?q=repo:allenai/open-instruct+is:pr+{git_commit}")
+            if prs.status_code == 200:
+                prs = prs.json()
+                if len(prs["items"]):
+                    pr = prs["items"][0]
+                    tags.append(f"pr: {pr['number']}")
+        except requests.exceptions.ConnectionError as e:
+            logger.warning(f"Failed to fetch PR information from GitHub API: {e}")
     if "GIT_BRANCH" in os.environ:
         tags.append(f"branch: {os.environ['GIT_BRANCH']}")
     tags = [tag[:64] for tag in tags if len(tag) > 64]
@@ -1130,8 +1132,8 @@ def launch_ai2_evals_on_weka(
     oe_eval_tasks: list[str] | None = None,
     stop_strings: list[str] | None = None,
     gs_bucket_path: str | None = None,
-    eval_priority: str | None = "normal",
-    eval_workspace: str | None = "ai2/tulu-3-results",
+    eval_priority: str | None = "urgent",
+    eval_workspace: str | None = "ai2/olmo-instruct",
     beaker_image: str | None = None,
     oe_eval_gpu_multiplier: int | None = None,
 ) -> None:
2 changes: 2 additions & 0 deletions scripts/train/olmo3/32b_dpo_smoke_test.sh
@@ -36,6 +36,8 @@ uv run python mason.py \
     --max_train_samples 150000 \
     --dataset_skip_cache \
     --zero_stage 3 \
+    --zero_hpz_partition_size 1 \
+    --offload_optimizer True \
     --ref_logprobs_cache_dir "/filestore/.cache/" \
     --concatenated_forward False \
     --max_seq_length 16384 \
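
The two added flags enable ZeRO++ hierarchical parameter partitioning and CPU optimizer offload for the 32B smoke test. A minimal sketch of the DeepSpeed options they plausibly map to, assuming the flags are forwarded into the zero_optimization block of the config (the actual plumbing lives in the unrendered dpo_tune_cache.py diff):

# Assumed mapping from the CLI flags to DeepSpeed ZeRO-3 config keys; a sketch,
# not taken from the rendered diff.
ds_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,                              # --zero_stage 3
        "zero_hpz_partition_size": 1,            # --zero_hpz_partition_size 1 (ZeRO++ secondary partition)
        "offload_optimizer": {"device": "cpu"},  # --offload_optimizer True
    },
}

Offloading optimizer state to CPU frees GPU memory for the 32B model, while the hpZ secondary partition trades some parameter memory for cheaper all-gathers.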
63 changes: 63 additions & 0 deletions scripts/train/olmo3/scottg_dpo_sweep_on_unmerged_sft.sh
@@ -0,0 +1,63 @@
BEAKER_IMAGE=$1

MODEL_NAME=/weka/oe-adapt-default/saumyam/checkpoints/olmo2-7B-sft/rl-sft/olmo3-32b-SFT-5e-5/step10790-hf
NUM_NODES=16
# for LR in 8e-8 7e-8 6e-8 9e-8 1e-7 5e-8 2e-7
for LR in 1e-4
do
EXP_NAME="olmo3-32b-5e10790-DPO-deltas-10k-${LR}-3"
uv run python mason.py \
--cluster ai2/augusta \
--gs_model_name olmo3-32b-SFT-5e-5-step10790 \
--workspace ai2/olmo-instruct \
--priority urgent \
--max_retries 5 \
--preemptible \
--image scottg/open_instruct_dev_dpo_faster --pure_docker_mode \
--env NCCL_LIB_DIR=/var/lib/tcpxo/lib64 \
--env LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:$LD_LIBRARY_PATH \
--env NCCL_PROTO=Simple,LL128 \
--env NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config_ll128.textproto \
--env NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config_ll128.textproto \
--num_nodes $NUM_NODES \
--budget ai2/oe-adapt \
--no_auto_dataset_cache \
--gpus 8 -- source /var/lib/tcpxo/lib64/nccl-env-profile.sh \&\& accelerate launch \
--mixed_precision bf16 \
--num_processes 8 \
--use_deepspeed \
--deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
--deepspeed_multinode_launcher standard \
open_instruct/dpo_tune_cache.py \
--exp_name $EXP_NAME \
--model_name_or_path $MODEL_NAME \
--tokenizer_name $MODEL_NAME \
--use_slow_tokenizer False \
--dataset_mixer_list allenai/olmo-3-preference-mix-deltas_reasoning-scottmix-DECON-keyword-filtered 1.0 \
--max_train_samples 10000 \
--dataset_skip_cache \
--zero_stage 3 \
--concatenated_forward False \
--max_seq_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--learning_rate $LR \
--lr_scheduler_type linear \
--warmup_ratio 0.1 \
--weight_decay 0.0 \
--num_train_epochs 1 \
--logging_steps 1 \
--dpo_loss_type dpo_norm \
--dpo_beta 5 \
--use_flash_attn \
--gradient_checkpointing \
--report_to wandb \
--chat_template_name olmo123 \
--with_tracking \
--try_launch_beaker_eval_jobs False \
--log_grad_norm True \
--ref_logprobs_cache_dir "/filestore/.cache/"
done
# --oe_eval_max_length 32768 \
# --oe_eval_tasks "gpqa:0shot_cot::qwen3-instruct,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,minerva_math_500::hamish_zs_reasoning_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_no_think_tags_lite,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,zebralogic::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,mmlu:cot::hamish_zs_reasoning_deepseek,popqa::hamish_zs_reasoning_deepseek"

64 changes: 64 additions & 0 deletions scripts/train/olmo3/scottg_dpo_sweep_on_unmerged_sft_8breject.sh
@@ -0,0 +1,64 @@
BEAKER_IMAGE=$1

MODEL_NAME=/weka/oe-adapt-default/saumyam/checkpoints/olmo2-7B-sft/rl-sft/olmo3-32b-SFT-5e-5/step10790-hf
NUM_NODES=16
# for LR in 8e-8 7e-8 6e-8 9e-8 1e-7 5e-8 2e-7
for LR in 8e-7
do
EXP_NAME="olmo3-32b-5e10790-DPO-deltas-10k-${LR}-8b"
uv run python mason.py \
--cluster ai2/augusta \
--gs_model_name olmo3-32b-SFT-5e-5-step10790 \
--workspace ai2/olmo-instruct \
--priority urgent \
--max_retries 5 \
--preemptible \
--image scottg/open_instruct_dev_dpo_faster --pure_docker_mode \
--env NCCL_LIB_DIR=/var/lib/tcpxo/lib64 \
--env LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:$LD_LIBRARY_PATH \
--env NCCL_PROTO=Simple,LL128 \
--env NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config_ll128.textproto \
--env NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config_ll128.textproto \
--num_nodes $NUM_NODES \
--budget ai2/oe-adapt \
--no_auto_dataset_cache \
--gpus 8 -- source /var/lib/tcpxo/lib64/nccl-env-profile.sh \&\& accelerate launch \
--mixed_precision bf16 \
--num_processes 8 \
--use_deepspeed \
--deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
--deepspeed_multinode_launcher standard \
open_instruct/dpo_tune_cache.py \
--exp_name $EXP_NAME \
--model_name_or_path $MODEL_NAME \
--tokenizer_name $MODEL_NAME \
--use_slow_tokenizer False \
--dataset_mixer_list allenai/olmo-3-preference-mix-deltas_reasoning-8b_reject-scottmix-DECON-keyword-filtered 1.0 \
--max_train_samples 10000 \
--dataset_skip_cache \
--zero_stage 3 \
--concatenated_forward False \
--max_seq_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--learning_rate $LR \
--lr_scheduler_type linear \
--warmup_ratio 0.1 \
--weight_decay 0.0 \
--num_train_epochs 1 \
--logging_steps 1 \
--dpo_loss_type dpo_norm \
--dpo_beta 5 \
--use_flash_attn \
--gradient_checkpointing \
--report_to wandb \
--chat_template_name olmo123 \
--with_tracking \
--try_launch_beaker_eval_jobs False \
--log_grad_norm True \
--ref_logprobs_cache_dir "/filestore/.cache/"
done
# --oe_eval_max_length 32768 \
# --oe_eval_tasks "gpqa:0shot_cot::qwen3-instruct,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,minerva_math_500::hamish_zs_reasoning_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_no_think_tags_lite,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,zebralogic::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,mmlu:cot::hamish_zs_reasoning_deepseek,popqa::hamish_zs_reasoning_deepseek"

# --ref_logprobs_cache_dir "/filestore/.cache/" \
57 changes: 57 additions & 0 deletions scripts/train/olmo3/scottg_dpo_sweep_on_unmerged_sft_jupiter.sh
@@ -0,0 +1,57 @@
BEAKER_IMAGE=$1

MODEL_NAME=/weka/oe-adapt-default/saumyam/checkpoints/olmo2-7B-sft/rl-sft/olmo3-32b-SFT-5e-5/step10790-hf
NUM_NODES=16
# for LR in 8e-8 7e-8 6e-8 9e-8 1e-7 5e-8 2e-7
for LR in 8e-7
do
EXP_NAME="olmo3-32b-5e10790-DPO-deltas-10k-${LR}-j"
uv run python mason.py \
--cluster ai2/jupiter \
--gs_model_name olmo3-32b-SFT-5e-5-step10790 \
--workspace ai2/olmo-instruct \
--priority urgent \
--max_retries 5 \
--preemptible \
--image scottg/open_instruct_dev_dpo_faster --pure_docker_mode \
--num_nodes $NUM_NODES \
--budget ai2/oe-adapt \
--no_auto_dataset_cache \
--gpus 8 -- accelerate launch \
--mixed_precision bf16 \
--num_processes 8 \
--use_deepspeed \
--deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
--deepspeed_multinode_launcher standard \
open_instruct/dpo_tune_cache.py \
--exp_name $EXP_NAME \
--model_name_or_path $MODEL_NAME \
--tokenizer_name $MODEL_NAME \
--use_slow_tokenizer False \
--dataset_mixer_list allenai/olmo-3-preference-mix-deltas_reasoning-scottmix-DECON-keyword-filtered 1.0 \
--max_train_samples 10000 \
--dataset_skip_cache \
--zero_stage 3 \
--concatenated_forward False \
--max_seq_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--learning_rate $LR \
--lr_scheduler_type linear \
--warmup_ratio 0.1 \
--weight_decay 0.0 \
--num_train_epochs 1 \
--logging_steps 1 \
--dpo_loss_type dpo_norm \
--dpo_beta 5 \
--use_flash_attn \
--gradient_checkpointing \
--report_to wandb \
--chat_template_name olmo123 \
--with_tracking \
--try_launch_beaker_eval_jobs False \
--log_grad_norm True
done
# --oe_eval_max_length 32768 \
# --oe_eval_tasks "gpqa:0shot_cot::qwen3-instruct,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,minerva_math_500::hamish_zs_reasoning_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_no_think_tags_lite,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,zebralogic::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,mmlu:cot::hamish_zs_reasoning_deepseek,popqa::hamish_zs_reasoning_deepseek"

65 changes: 65 additions & 0 deletions scripts/train/olmo3/scottg_dpo_sweep_on_unmerged_sft_oldenv.sh
@@ -0,0 +1,65 @@
BEAKER_IMAGE=$1

MODEL_NAME=/weka/oe-adapt-default/saumyam/checkpoints/olmo2-7B-sft/rl-sft/olmo3-32b-SFT-5e-5/step10790-hf
NUM_NODES=16
# for LR in 8e-8 7e-8 6e-8 9e-8 1e-7 5e-8 2e-7
for LR in 8e-7
do
EXP_NAME="olmo3-32b-5e10790-DPO-deltas-10k-${LR}env2"
uv run python mason.py \
--cluster ai2/augusta \
--gs_model_name olmo3-32b-SFT-5e-5-step10790 \
--workspace ai2/olmo-instruct \
--priority urgent \
--max_retries 5 \
--preemptible \
--image scottg/open_instruct_dev_dpo_faster --pure_docker_mode \
--num_nodes $NUM_NODES \
--budget ai2/oe-adapt \
--no_auto_dataset_cache \
--gpus 8 -- accelerate launch \
--mixed_precision bf16 \
--num_processes 8 \
--use_deepspeed \
--deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
--deepspeed_multinode_launcher standard \
open_instruct/dpo_tune_cache.py \
--exp_name $EXP_NAME \
--model_name_or_path $MODEL_NAME \
--tokenizer_name $MODEL_NAME \
--use_slow_tokenizer False \
--dataset_mixer_list allenai/olmo-3-preference-mix-deltas_reasoning-scottmix-DECON-keyword-filtered 1.0 \
--max_train_samples 10000 \
--dataset_skip_cache \
--zero_stage 3 \
--concatenated_forward False \
--max_seq_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--learning_rate $LR \
--lr_scheduler_type linear \
--warmup_ratio 0.1 \
--weight_decay 0.0 \
--num_train_epochs 1 \
--logging_steps 1 \
--dpo_loss_type dpo_norm \
--dpo_beta 5 \
--use_flash_attn \
--gradient_checkpointing \
--report_to wandb \
--chat_template_name olmo123 \
--with_tracking \
--try_launch_beaker_eval_jobs False \
--log_grad_norm True \
--ref_logprobs_cache_dir "/filestore/.cache/"
done
# --oe_eval_max_length 32768 \
# --oe_eval_tasks "gpqa:0shot_cot::qwen3-instruct,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,minerva_math_500::hamish_zs_reasoning_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_no_think_tags_lite,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,zebralogic::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,mmlu:cot::hamish_zs_reasoning_deepseek,popqa::hamish_zs_reasoning_deepseek"



# --env NCCL_LIB_DIR=/var/lib/tcpxo/lib64 \
# --env LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:$LD_LIBRARY_PATH \
# --env NCCL_PROTO=Simple,LL128 \
# --env NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config_ll128.textproto \
# --env NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config_ll128.textproto \
62 changes: 62 additions & 0 deletions scripts/train/olmo3/scottg_dpo_sweep_on_unmerged_sft_overfit.sh
@@ -0,0 +1,62 @@
BEAKER_IMAGE=$1

MODEL_NAME=/weka/oe-adapt-default/saumyam/checkpoints/olmo2-7B-sft/rl-sft/olmo3-32b-SFT-5e-5/step10790-hf
NUM_NODES=16
# for LR in 8e-8 7e-8 6e-8 9e-8 1e-7 5e-8 2e-7
for LR in 1e-6
do
EXP_NAME="olmo3-32b-5e10790-DPO-deltas-${LR}-overfit"
uv run python mason.py \
--cluster ai2/augusta \
--gs_model_name olmo3-32b-SFT-5e-5-step10790 \
--workspace ai2/olmo-instruct \
--priority urgent \
--max_retries 5 \
--preemptible \
--image scottg/open_instruct_dev_dpo_faster --pure_docker_mode \
--env NCCL_LIB_DIR=/var/lib/tcpxo/lib64 \
--env LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:$LD_LIBRARY_PATH \
--env NCCL_PROTO=Simple,LL128 \
--env NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config_ll128.textproto \
--env NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config_ll128.textproto \
--num_nodes $NUM_NODES \
--budget ai2/oe-adapt \
--no_auto_dataset_cache \
--gpus 8 -- source /var/lib/tcpxo/lib64/nccl-env-profile.sh \&\& accelerate launch \
--mixed_precision bf16 \
--num_processes 8 \
--use_deepspeed \
--deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
--deepspeed_multinode_launcher standard \
open_instruct/dpo_tune_cache.py \
--exp_name $EXP_NAME \
--model_name_or_path $MODEL_NAME \
--tokenizer_name $MODEL_NAME \
--use_slow_tokenizer False \
--dataset_mixer_list allenai/olmo-3-preference-mix-deltas_reasoning-scottmix-DECON-keyword-filtered 128 \
--dataset_skip_cache \
--zero_stage 3 \
--concatenated_forward False \
--max_seq_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--learning_rate $LR \
--lr_scheduler_type linear \
--warmup_ratio 0.1 \
--weight_decay 0.0 \
--num_train_epochs 10 \
--logging_steps 1 \
--dpo_loss_type dpo_norm \
--dpo_beta 5 \
--use_flash_attn \
--gradient_checkpointing \
--report_to wandb \
--chat_template_name olmo123 \
--with_tracking \
--try_launch_beaker_eval_jobs False \
--log_grad_norm True
done
# --oe_eval_max_length 32768 \
# --oe_eval_tasks "gpqa:0shot_cot::qwen3-instruct,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,minerva_math_500::hamish_zs_reasoning_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek_no_think_tags_lite,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,zebralogic::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,mmlu:cot::hamish_zs_reasoning_deepseek,popqa::hamish_zs_reasoning_deepseek"

# --ref_logprobs_cache_dir "/filestore/.cache/" \