diff --git a/.github/benchmark/models.json b/.github/benchmark/models.json index 863702228..f0a98a9c4 100644 --- a/.github/benchmark/models.json +++ b/.github/benchmark/models.json @@ -138,22 +138,52 @@ "variants": [{ "label": "", "suffix": "", "conc_max": 256 }] }, { - "display": "MiniMax-M2.7", - "path": "MiniMaxAI/MiniMax-M2.7", - "prefix": "MiniMax-M2.7", + "display": "MiniMax-M3-MXFP8", + "path": "MiniMaxAI/MiniMax-M3-MXFP8", + "prefix": "m3-mxfp8", "runner": "atom-mi355-8gpu.predownload", - "env_vars": "", - "config": { "tp": 2, "kv_cache_dtype": "fp8", "trust_remote_code": true }, - "variants": [{ "label": "", "suffix": "", "conc_max": 256 }] + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_FORCE_ATTN_TRITON=1\nAITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0", + "config": { + "tp": 4, + "kv_cache_dtype": "fp8", + "trust_remote_code": true, + "extra_args": "--gpu-memory-utilization 0.8 --block-size 128 --max-model-len 32768 --max-num-batched-tokens 32768 --no-enable_prefix_caching --online_quant_config '{\"global_quant_config\": \"ptpc_fp8\", \"exclude_layer\": [\"lm_head\", \"model.embed_tokens\", \"vision_tower\", \"multi_modal_projector\", \"patch_merge_mlp\", \"*block_sparse_moe\"]}'" + }, + "variants": [ + { "label": "", "suffix": "", "extra_args": "--max-num-seqs 256" }, + { + "label": "EAGLE3", + "suffix": "-eagle3", + "extra_args": "--max-num-seqs 256 --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3", + "bench_args": "--use-chat-template", + "conc_min": 4, + "conc_max": 256 + } + ] }, { - "display": "MiniMax-M2.7-MXFP4", - "path": "amd/MiniMax-M2.7-MXFP4", - "prefix": "MiniMax-M2.7-MXFP4", + "display": "MiniMax-M3-MXFP4", + "path": "amd/MiniMax-M3-MXFP4", + "prefix": "m3-mxfp4", "runner": "atom-mi355-8gpu.predownload", - "env_vars": "", - "config": { "tp": 1, "kv_cache_dtype": "fp8", "trust_remote_code": true }, - "variants": [{ "label": "", "suffix": "", "conc_max": 256 }] + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_FORCE_ATTN_TRITON=1\nAITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0", + "config": { + "tp": 4, + "kv_cache_dtype": "fp8", + "trust_remote_code": true, + "extra_args": "--gpu-memory-utilization 0.8 --block-size 128 --max-model-len 32768 --max-num-batched-tokens 32768 --no-enable_prefix_caching" + }, + "variants": [ + { "label": "", "suffix": "", "extra_args": "--max-num-seqs 256" }, + { + "label": "EAGLE3", + "suffix": "-eagle3", + "extra_args": "--max-num-seqs 256 --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3", + "bench_args": "--use-chat-template", + "conc_min": 4, + "conc_max": 256 + } + ] }, { "display": "Qwen3.5-397B-A17B-FP8", diff --git a/.github/benchmark/models_accuracy.json b/.github/benchmark/models_accuracy.json index e6c400faa..0f69b04fe 100644 --- a/.github/benchmark/models_accuracy.json +++ b/.github/benchmark/models_accuracy.json @@ -337,6 +337,32 @@ "accuracy_baseline_model": "MiniMaxAI/MiniMax-M2.7", "_baseline_note": "ATOM CI measured BF16=0.9022 (gsm8k 3-shot flexible-extract). HF amd/MiniMax-M2.7-MXFP4: MXFP4=91.89, baseline=91.81 (percentage)." }, + { + "model_name": "MiniMax-M3-MXFP4", + "model_path": "amd/MiniMax-M3-MXFP4", + "extraArgs": "--kv_cache_dtype fp8 -tp 8 --trust-remote-code --gpu-memory-utilization 0.8 --block-size 128 --max-model-len 32768 --max-num-seqs 128 --max-num-batched-tokens 32768 --no-enable_prefix_caching", + "client_command": "lm_eval --model local-chat-completions --apply_chat_template --fewshot_as_multiturn --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/chat/completions,num_concurrent=32,max_retries=3,max_gen_toks=16384,tokenized_requests=False,trust_remote_code=True --tasks gsm8k --num_fewshot 5 --output_path ${OUTPUT_PATH}", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_FORCE_ATTN_TRITON=1\nAITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0", + "runner": "linux-atom-do-mi350x-8", + "test_level": "pr", + "accuracy_threshold": 0.93, + "accuracy_baseline": 0.9363, + "accuracy_baseline_model": "amd/MiniMax-M3-MXFP4", + "_baseline_note": "FP4 M3 tp8. GSM8K 5-shot chat (apply_chat_template + fewshot_as_multiturn, num_concurrent=32, max_gen_toks=16384) per /data/users/zejun/zejun_atom/m3.md: validated MXFP4 flexible-extract=0.9363 ± 0.0067 (measured at tp4). Threshold 0.93 leaves ~0.6pp headroom; refresh after first CI measurement at tp8." + }, + { + "model_name": "MiniMax-M3-MXFP4 Eagle3", + "model_path": "amd/MiniMax-M3-MXFP4", + "extraArgs": "--kv_cache_dtype fp8 -tp 8 --trust-remote-code --gpu-memory-utilization 0.8 --block-size 128 --max-model-len 32768 --max-num-seqs 256 --max-num-batched-tokens 32768 --no-enable_prefix_caching --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3", + "client_command": "lm_eval --model local-chat-completions --apply_chat_template --fewshot_as_multiturn --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/chat/completions,num_concurrent=32,max_retries=3,max_gen_toks=16384,tokenized_requests=False,trust_remote_code=True --tasks gsm8k --num_fewshot 5 --output_path ${OUTPUT_PATH}", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_FORCE_ATTN_TRITON=1\nAITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0", + "runner": "linux-atom-do-mi350x-8", + "test_level": "nightly", + "accuracy_threshold": 0.93, + "accuracy_baseline": 0.9469, + "accuracy_baseline_model": "amd/MiniMax-M3-MXFP4 + Inferact/MiniMax-M3-EAGLE3", + "_baseline_note": "FP4 M3 + EAGLE3 draft (tp8), lossless vs greedy target. GSM8K 5-shot chat per /data/users/zejun/zejun_atom/m3.md: validated fp4_eagle_tp4 flexible-extract=0.9469 ± 0.0062 (accept ratio 73.36%, avg 3.20 toks/fwd, commit 9fc48338); baseline carried over from the tp4 validation since EAGLE3 is lossless wrt the target. Threshold 0.93 leaves ~1.7pp headroom; refresh after first CI measurement at tp8." + }, { "model_name": "MiMo-V2-Flash", "model_path": "XiaomiMiMo/MiMo-V2-Flash", diff --git a/.github/workflows/atom-benchmark.yaml b/.github/workflows/atom-benchmark.yaml index 5fec695b2..96a1b385b 100644 --- a/.github/workflows/atom-benchmark.yaml +++ b/.github/workflows/atom-benchmark.yaml @@ -38,12 +38,12 @@ on: description: "Benchmark Kimi-K2.5-MXFP4" type: boolean default: true - MiniMax-M2.7: - description: "Benchmark MiniMax-M2.7" + m3-mxfp8: + description: "Benchmark MiniMax-M3-MXFP8 (+ EAGLE3)" type: boolean default: true - MiniMax-M2.7-MXFP4: - description: "Benchmark MiniMax-M2.7-MXFP4" + m3-mxfp4: + description: "Benchmark MiniMax-M3-MXFP4 (+ EAGLE3)" type: boolean default: true qwen35-397b-fp8: