7 changes: 7 additions & 0 deletions pyproject.toml
@@ -68,6 +68,13 @@ rl = [
 video = [
     "decord",
 ]
+all = [
+    "decord",
+    "ray[default]",
+    "httpx",
+    "fastapi",
+    "uvicorn",
+]
 
 [tool.mypy]
 ignore_missing_imports = true
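With this extras group, the optional serving and video dependencies install together. Assuming the distribution keeps its current name, something like `pip install "xtuner[all]"` should pull in decord, ray, httpx, fastapi, and uvicorn in one step.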
4 changes: 2 additions & 2 deletions tests/engine/test_dense_train_engine.py
@@ -5,7 +5,7 @@
 import parametrize
 import torch
 import torch.distributed as dist
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import DeterministicDDPTestCase
 from transformers import AutoTokenizer
 
 from xtuner.v1.model.moe.moe import SequenceContext
@@ -25,7 +25,7 @@
 DEVICE = get_device()
 
 
-class TestDenseEngine(DistributedTestBase):
+class TestDenseEngine(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,tp_size,sp_size",
         [
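Throughout this PR, `DistributedTestBase` (a PyTorch-internal test helper) is replaced by `DeterministicDDPTestCase`. The class body is not part of this diff; only its export in `xtuner/_testing/__init__.py` (last file below) is visible. The following is a minimal sketch of what such a class might look like, assuming it seeds and forces deterministic kernels in `setUp` and provides the `_check_loss_curve` helper the engine tests call; the seed value, the default tolerances, and the cosine-similarity check are all assumptions, not the actual implementation:

```python
# Hypothetical sketch of xtuner/_testing/testcase.py; the real class may differ.
import torch
from torch.testing._internal.common_distributed import DistributedTestBase

from .utils import enable_full_determinism


class DeterministicDDPTestCase(DistributedTestBase):
    """DistributedTestBase plus deterministic kernels and a curve-level loss check."""

    def setUp(self) -> None:
        super().setUp()
        # Seed every RNG and force deterministic algorithms so the loss
        # curves asserted below are reproducible run to run.
        enable_full_determinism(seed=42)

    def _check_loss_curve(
        self,
        losses: torch.Tensor,
        losses_ref: torch.Tensor,
        sim_tol: float = 0.02,
        rtol: float = 0.05,
    ) -> None:
        # Shape check: the two loss curves should be nearly collinear.
        sim = torch.cosine_similarity(losses, losses_ref, dim=0)
        self.assertGreater(sim.item(), 1 - sim_tol)
        # Point-wise check: each step stays within a relative tolerance.
        self.assertTrue(torch.allclose(losses, losses_ref, rtol=rtol))
```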
11 changes: 6 additions & 5 deletions tests/engine/test_moe_train_engine.py
@@ -6,7 +6,7 @@
 import parametrize
 import torch
 import torch.distributed as dist
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import DeterministicDDPTestCase
 from transformers import AutoTokenizer
 
 from xtuner.v1.model.moe.moe import SequenceContext
@@ -21,12 +21,13 @@
 from xtuner.v1.utils.device import get_device
 from xtuner.v1.utils.test_utils import init_data_mesh
 
+
 # Qwen3 30B A3
 QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
 DEVICE = get_device()
 
 
-class TestMoEEngine(DistributedTestBase):
+class TestMoEEngine(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,ep_size,sp_size",
         [
@@ -101,9 +102,9 @@ def warmup_fn(x):
             lr_scheduler.step()
             losses.append(loss_log["reduced_llm_loss"])
 
-        losses_ref = [2.44, 2.44, 2.42, 2.41, 2.34, 2.33, 2.16, 2.13, 1.71, 1.55]
-        for loss, loss_ref in zip(losses, losses_ref):
-            self.assertTrue(abs(loss - loss_ref) / loss_ref < 0.02)
+        losses_ref = torch.tensor([2.44, 2.44, 2.42, 2.41, 2.34, 2.33, 2.16, 2.13, 1.71, 1.55])
+        losses = torch.tensor(losses)
+        self._check_loss_curve(losses, losses_ref)
 
         torch.cuda.empty_cache()
         try:
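Switching from the per-step `assertTrue` loop to `_check_loss_curve` turns ten independent point assertions into one curve-level comparison. Under the sketch above, the cosine-similarity bound constrains the overall shape of the loss trajectory while the relative tolerance still bounds each step, which should be less brittle when deterministic kernels shift individual loss values slightly.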
38 changes: 19 additions & 19 deletions tests/engine/test_moe_train_engine_float8.py
@@ -6,7 +6,7 @@
 import parametrize
 import torch
 import torch.distributed as dist
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import DeterministicDDPTestCase
 from transformers import AutoTokenizer
 
 from xtuner.v1.model.moe.moe import SequenceContext
@@ -22,12 +22,13 @@
 from xtuner.v1.model.moe.moe import BalancingLossConfig
 
 
+
 # Qwen3 30B A3
 QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
 DEVICE = get_device()
 
 
-class TestMoEEngineFloat8(DistributedTestBase):
+class TestMoEEngineFloat8(DeterministicDDPTestCase):
 
     @parametrize.parametrize(
         "device,ep_size,hsdp_sharding_size",
@@ -101,17 +102,16 @@ def warmup_fn(x):
             engine.step_optimizer(grad_norm)
             lr_scheduler.step()
             losses.append(loss_log["reduced_llm_loss"])
-        losses_ref = [2.41, 2.41, 1.79, 1.39, 1.02, 0.68, 0.52, 0.31, 0.18, 0.12]
+        losses = torch.tensor(losses)
+        losses_ref = torch.tensor([2.41, 2.41, 1.79, 1.39, 1.02, 0.68, 0.52, 0.31, 0.18, 0.12])
 
-        for loss, loss_ref in zip(losses, losses_ref):
-            self.assertTrue(abs(loss - loss_ref) < 0.2)
-
+        self._check_loss_curve(losses, losses_ref, sim_tol=0.02, rtol=0.2)
         torch.cuda.empty_cache()
         try:
             dist.destroy_process_group(pg)
         except:
             pass
 
     @parametrize.parametrize(
         "device,ep_size,hsdp_sharding_size",
         [
@@ -184,17 +184,18 @@ def warmup_fn(x):
             engine.step_optimizer(grad_norm)
             lr_scheduler.step()
             losses.append(loss_log["reduced_llm_loss"])
-        losses_ref = [2.45, 2.45, 1.78, 1.31, 0.95, 0.67, 0.45, 0.31, 0.18, 0.12]
-
-        for loss, loss_ref in zip(losses, losses_ref):
-            self.assertTrue(abs(loss - loss_ref) < 0.2)
+        losses_ref = torch.tensor([2.45, 2.45, 1.78, 1.31, 0.95, 0.67, 0.45, 0.31, 0.18, 0.12])
+        losses = torch.tensor(losses)
+
+        self._check_loss_curve(losses, losses_ref, sim_tol=0.02, rtol=0.1)
 
         torch.cuda.empty_cache()
         try:
             dist.destroy_process_group(pg)
         except:
             pass
 
     @parametrize.parametrize(
         "device,ep_size,hsdp_sharding_size",
         [
@@ -286,20 +287,19 @@ def warmup_fn(x):
             engine.step_optimizer(grad_norm)
             lr_scheduler.step()
             losses.append(loss_log["reduced_llm_loss"])
-        losses_ref = [2.41, 2.41, 2.47, 2.42, 2.44, 2.44, 2.42, 2.38, 2.31, 2.30]
-
-        for loss, loss_ref in zip(losses, losses_ref):
-            self.assertTrue(abs(loss - loss_ref) < 0.2)
+        losses_ref = torch.tensor([2.41, 2.41, 2.47, 2.42, 2.44, 2.44, 2.42, 2.38, 2.31, 2.30])
+        losses = torch.tensor(losses)
+        self._check_loss_curve(losses, losses_ref)
 
         if dist.get_rank() == 0:
             shutil.rmtree(temp_dir)
 
         torch.cuda.empty_cache()
         try:
             dist.destroy_process_group(pg)
         except:
             pass
 
     @property
     def world_size(self) -> int:
         return int(os.getenv("XTUNER_TEST_WORLD_SIZE", "8"))
@@ -309,7 +309,7 @@ def destroy_pg_upon_exit(self) -> bool:
         return False
 
 
-class TestMoEEngineFloat8Case2(DistributedTestBase):
+class TestMoEEngineFloat8Case2(DeterministicDDPTestCase):
 
     @parametrize.parametrize(
         "device,ep_size,hsdp_sharding_size",
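The float8 variants pass explicit tolerances (`sim_tol=0.02` with `rtol=0.2` or `rtol=0.1`), presumably because fp8 quantization noise perturbs individual loss values more than the bf16 runs do, so the point-wise bound is loosened while the curve-shape bound stays tight. The exact semantics depend on the real `_check_loss_curve`, which this diff does not show.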
6 changes: 4 additions & 2 deletions tests/model/test_gpt_oss_moe.py
@@ -6,7 +6,7 @@
 
 import parametrize
 import torch
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import DeterministicDDPTestCase, patch_hf_rms_norm
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import tempfile
 from pathlib import Path
@@ -30,7 +30,7 @@ def wrapper(self, *args, **kwargs):
     return wrapper
 
 
-class TestGptOss(DistributedTestBase):
+class TestGptOss(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,dispatcher,ep_size,compile,tol,loss_class",
         [
@@ -56,6 +56,7 @@ def test_gpt_oss_run(self, device, dispatcher, ep_size, compile, tol, loss_class
             device_map="cuda"
         )
         hf_model.train()
+        patch_hf_rms_norm((hf_model))
         tokenizer = AutoTokenizer.from_pretrained(GPT_OSS_MINI_PATH)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to("cuda")
         # assert input_ids.size(1) > 128
@@ -117,6 +118,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size):
             config=hf_config,
             device_map="cuda"
         )
+        patch_hf_rms_norm((hf_model))
         hf_model.train()
         tokenizer = AutoTokenizer.from_pretrained(GPT_OSS_MINI_PATH)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to("cuda")
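`patch_hf_rms_norm` is also new in this PR, and its body is not shown here either. One plausible reading, sketched below under that assumption, is that it rebinds each HF RMSNorm module's `forward` to a float32 reference implementation so the HF baseline stays numerically aligned with xtuner's kernels under deterministic mode; the attribute names (`variance_epsilon`, `weight`) follow the HF Llama/Qwen convention and are assumptions:

```python
# Hypothetical sketch of xtuner/_testing/patch_hf.py; the real helper may differ.
import torch
import torch.nn as nn


def patch_hf_rms_norm(model: nn.Module) -> None:
    """Rebind every *RMSNorm module's forward to a float32 reference version."""

    def rms_norm_forward(self: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Normalize in float32 to avoid bf16 accumulation differences,
        # then cast back to the module's working dtype.
        h = hidden_states.to(torch.float32)
        variance = h.pow(2).mean(-1, keepdim=True)
        h = h * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight.to(torch.float32) * h).to(input_dtype)

    for module in model.modules():
        if type(module).__name__.endswith("RMSNorm"):
            module.forward = rms_norm_forward.__get__(module, type(module))
```

Incidentally, the doubled parentheses in `patch_hf_rms_norm((hf_model))` are harmless: the inner pair is redundant grouping, not a tuple.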
8 changes: 6 additions & 2 deletions tests/model/test_intern_s1.py
@@ -2,7 +2,7 @@
 
 import parametrize
 import torch
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import patch_hf_rms_norm, DeterministicDDPTestCase
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import torch.distributed as dist
 import tempfile
@@ -24,7 +24,7 @@
 INTERNS1_DENSE_PATH = os.environ["INTERNS1_DENSE_PATH"]
 
 
-class TestInternS1(DistributedTestBase):
+class TestInternS1(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,tol",
         [
@@ -48,6 +48,7 @@ def test_interns1_text_run(self, device, tol):
             trust_remote_code=True,
             device_map="cuda"
         ).eval()  # avoid open drop_path
+        patch_hf_rms_norm(hf_model)
 
         tokenizer = AutoTokenizer.from_pretrained(INTERNS1_DENSE_PATH, trust_remote_code=True)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to(device)
@@ -118,6 +119,7 @@ def test_interns1_image_run(self, device, sp_size, tol):
             trust_remote_code=True,
             device_map=device
         ).eval()  # avoid open drop_path
+        patch_hf_rms_norm(hf_model)
 
         tokenizer = AutoTokenizer.from_pretrained(INTERNS1_DENSE_PATH, trust_remote_code=True)
 
@@ -233,6 +235,7 @@ def test_fsdp_text_accuracy(self, device, tol):
             trust_remote_code=True,
             device_map="cuda"
         ).eval()  # avoid open drop_path
+        patch_hf_rms_norm(hf_model)
 
         tokenizer = AutoTokenizer.from_pretrained(INTERNS1_DENSE_PATH, trust_remote_code=True)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to("cuda")
@@ -317,6 +320,7 @@ def test_fsdp_image_accuracy(self, device, sp_size, compile, tol):
             trust_remote_code=True,
             device_map="cuda"
         ).eval()  # avoid open drop_path
+        patch_hf_rms_norm(hf_model)
 
         tokenizer = AutoTokenizer.from_pretrained(INTERNS1_DENSE_PATH, trust_remote_code=True)
         conversations = [{"from": "human", "value": '<image>\nPlease describe the image shortly.'}]
10 changes: 6 additions & 4 deletions tests/model/test_moe.py
@@ -7,13 +7,15 @@
 from copy import deepcopy
 from xtuner.v1.loss.ce_loss import CELossContext, CELossConfig, CELossContextInputItem
 
-from torch.testing._internal.common_distributed import DistributedTestBase
+from xtuner._testing import DeterministicDDPTestCase
+from xtuner.v1.utils.compile import maybe_compile
 import parametrize
 
 
 class TestMoE:
     @parametrize.parametrize("dtype,device", [(torch.bfloat16, "cuda")])
     def test_moe_config(self, dtype, device):
+        maybe_compile.clear_compile_targets()
         router_config = NoAuxRouterConfig(
             scoring_func="sigmoid",
             router_scaling_factor=1.0,
@@ -46,7 +48,7 @@
             num_experts_per_tok=2,
             first_k_dense_replace=1,
             hidden_factor=1.0,
-            moe_intermediate_size=256,  # grouped linear kernel need this to be multiple of 256
+            moe_intermediate_size=512,  # TODO: Restriction of triton grouped gemm, should be optimizer
             router=router_config,
         )
         model = MoE(config=config).to(dtype).to(device)
@@ -73,7 +75,7 @@
         model(seq_ctx=seq_ctx, loss_ctx=loss_ctx)
 
 
-class TestDistributedMoE(DistributedTestBase):
+class TestDistributedMoE(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "dtype,device,dispatcher,n_shared_experts,first_k_dense_replace",
         [
@@ -116,7 +118,7 @@ def test_parralel_accuracy(self, dtype, device, dispatcher, n_shared_experts, fi
             num_experts_per_tok=2,
             first_k_dense_replace=first_k_dense_replace,
             hidden_factor=1.0,
-            moe_intermediate_size=256,  # grouped linear kernel need this to be multiple of 256
+            moe_intermediate_size=512,  # TODO: Restriction of triton grouped gemm, should be optimizer
             router=router_config,
         )
         loss_cfg = CELossConfig()
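Two incidental changes ride along here: `maybe_compile.clear_compile_targets()` at the top of the test presumably resets any compile targets registered by earlier tests so `torch.compile` state does not leak across cases, and `moe_intermediate_size` is doubled to 512 to satisfy the triton grouped-GEMM restriction noted in the TODO. Both readings are inferences from the diff, not confirmed by the PR description.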
4 changes: 3 additions & 1 deletion tests/model/test_qwen3_dense.py
@@ -16,12 +16,13 @@
 from xtuner.v1.config import FSDPConfig
 from xtuner.v1.utils.compile import maybe_compile
 from xtuner.v1.loss.ce_loss import CELossConfig, CELossContextInputItem
+from xtuner._testing import patch_hf_rms_norm, DeterministicDDPTestCase
 
 # Qwen3 8B
 QWEN3_PATH = os.environ["QWEN3_PATH"]
 
 
-class TestQwen3Dense(DistributedTestBase):
+class TestQwen3Dense(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,tp_size,compile,tol,loss_class",
         [
@@ -39,6 +40,7 @@ def test_qwen3_dense_run(self, device, tp_size, compile, tol, loss_class):
             torch_dtype=torch.bfloat16,
             device_map="cuda"
         )
+        patch_hf_rms_norm(hf_model)
         tokenizer = AutoTokenizer.from_pretrained(QWEN3_PATH)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to("cuda")
         with torch.no_grad():
16 changes: 5 additions & 11 deletions tests/model/test_qwen3_moe.py
@@ -4,7 +4,6 @@
 
 import parametrize
 import torch
-from torch.testing._internal.common_distributed import DistributedTestBase
 import torch.distributed as dist
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import tempfile
@@ -17,23 +16,18 @@
 from xtuner.v1.config import FSDPConfig
 from xtuner.v1.utils.compile import maybe_compile
 from xtuner.v1.loss.ce_loss import CELossConfig, CELossContextInputItem
+from xtuner._testing import patch_hf_rms_norm, DeterministicDDPTestCase
 
 
 # Qwen3 30B A3
 QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
 
 
-def prepare(fn):
-    @wraps(fn)
-    def wrapper(self, *args, **kwargs):
+class TestQwen3MoE(DeterministicDDPTestCase):
+    def prepare(self):
         self.temp_dir = tempfile.TemporaryDirectory()
-        ret = fn(self, *args, **kwargs)
-        self.temp_dir.cleanup()
-        return ret
-
-    return wrapper
-
-
-class TestQwen3MoE(DistributedTestBase):
     @parametrize.parametrize(
         "device,dispatcher,ep_size,compile,tol,loss_class",
         [
@@ -44,7 +38,6 @@ class TestQwen3MoE(DistributedTestBase):
             ("cuda", None, 1, False, 1e-2, "chunk_cross_entropy"),
         ],
     )
-    @prepare
     def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_class):
         os.environ["TRITON_CACHE_DIR"] = str(Path(self.temp_dir.name) / "triton_cache")
         self.create_pg(device)
@@ -57,6 +50,7 @@
             trust_remote_code=True,
             device_map="cuda"
         )
+        patch_hf_rms_norm(hf_model)
         tokenizer = AutoTokenizer.from_pretrained(QWEN3_MOE_PATH, trust_remote_code=True)
         input_ids = tokenizer("吃葡萄不吐葡萄皮", return_tensors="pt").input_ids.to("cuda")
         with torch.no_grad():
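Note that the `prepare` decorator, which used to create and clean up a `TemporaryDirectory` around each test, becomes a plain method that only creates the directory. Presumably `DeterministicDDPTestCase` now calls `self.prepare()` and handles cleanup in teardown, though the diff does not show where.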
File renamed without changes.
@@ -1,6 +1,6 @@
 import torch
 import random
-from xtuner.v1.ops import grouped_gemm_triton
+from xtuner.v1.ops import group_gemm
 
 
 def grouped_gemm_torch(x, w, tokens_per_expert):
@@ -56,7 +56,7 @@ def test_grouped_gemm_triton():
     x_ref = x.clone().detach().requires_grad_(True)
     w_ref = w.clone().detach().requires_grad_(True)
    out_ref = grouped_gemm_torch(x_ref, w_ref, tokens_per_expert)
-    out = grouped_gemm_triton(x, w, tokens_per_expert)
+    out = group_gemm(x, w, tokens_per_expert)
     out.mean().backward()
     out_ref.mean().backward()
     assert torch.allclose(out, out_ref, rtol=1e-2, atol=1e-2), "Output mismatch between Triton and PyTorch implementations"
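For context, the file's `grouped_gemm_torch` reference (collapsed in this diff) implements the operation the renamed `group_gemm` kernel is checked against. A minimal sketch of such a reference, with assumed shapes `x: (total_tokens, K)` and `w: (num_experts, K, N)`, might look like this; the actual function in the repo may differ:

```python
# Plausible torch reference for a grouped GEMM; shapes and semantics are assumptions.
import torch


def grouped_gemm_torch(x: torch.Tensor, w: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor:
    """x: (total_tokens, K); w: (num_experts, K, N); tokens_per_expert: (num_experts,)."""
    outs = []
    start = 0
    for expert_id, num in enumerate(tokens_per_expert.tolist()):
        # Each expert multiplies its contiguous slice of tokens by its own weight.
        outs.append(x[start:start + num] @ w[expert_id])
        start += num
    return torch.cat(outs, dim=0)
```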
3 changes: 3 additions & 0 deletions xtuner/_testing/__init__.py
@@ -0,0 +1,3 @@
+from .patch_hf import patch_hf_rms_norm
+from .utils import enable_full_determinism
+from .testcase import DeterministicDDPTestCase
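`enable_full_determinism` is exported here but its body is likewise outside this diff. A plausible sketch, modeled on the similarly named helper in `transformers` (every line below is an assumption about xtuner's actual code):

```python
# Hypothetical sketch of xtuner/_testing/utils.py; the real helper may differ.
import os
import random

import numpy as np
import torch


def enable_full_determinism(seed: int = 0) -> None:
    """Seed every RNG and force deterministic kernels, trading speed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Required for deterministic cuBLAS GEMMs on CUDA >= 10.2.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
```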