From 87ccd917aaec8981ff17af06e448e0552f635f17 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:03:47 +0800 Subject: [PATCH 1/6] Fix Python typing and micro_perf arg parsing Fix invalid ClassVar annotation for Python 3.10. Correct moe_gating_gemm to read dst_dtype properly. Minor cleanup in launch.py/server.py initialization order (no behavior change intended). Signed-off-by: Zhang Jing --- projects/micro_perf/launch.py | 3 ++- projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py | 2 +- projects/micro_perf/server.py | 3 ++- src/xpu_perf/model_perf/utils.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/projects/micro_perf/launch.py b/projects/micro_perf/launch.py index 90a0124e..f0db4ba2 100644 --- a/projects/micro_perf/launch.py +++ b/projects/micro_perf/launch.py @@ -4,13 +4,14 @@ import torch.multiprocessing as mp +FILE_DIR = pathlib.Path(__file__).parent.absolute() + from xpu_perf.micro_perf.core.perf_engine import XpuPerfServer from xpu_perf.micro_perf.core.common_utils import logger, setup_logger from xpu_perf.micro_perf.core.common_utils import get_submodules, existing_dir_path, valid_file from xpu_perf.micro_perf.core.common_utils import parse_tasks, parse_workload, export_reports -FILE_DIR = pathlib.Path(__file__).parent.absolute() BYTE_MLPERF_ROOT = FILE_DIR OP_DEFS_DIR = BYTE_MLPERF_ROOT.joinpath("op_defs") diff --git a/projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py b/projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py index 26ff0146..dabc6346 100644 --- a/projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py +++ b/projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py @@ -20,7 +20,7 @@ def prepare_args(self): # 以下参数决定当前 moe_gating_gemm 的具体数据类型 self.dtype = self.args_dict.get("dtype", "float32") self.compute_dtype = self.args_dict.get("compute_dtype", "float32") - self.dst_dtype = self.args_dict.get("dtype", "float32") + self.dst_dtype = self.args_dict.get("dst_dtype", self.dtype) def vendor_parser(self): if self.dtype == "float32" and self.compute_dtype == "float32" and self.dst_dtype == "float32": diff --git a/projects/micro_perf/server.py b/projects/micro_perf/server.py index e0ee6cea..db60720d 100644 --- a/projects/micro_perf/server.py +++ b/projects/micro_perf/server.py @@ -8,6 +8,8 @@ import torch.multiprocessing as mp +FILE_DIR = pathlib.Path(__file__).parent.absolute() + from xpu_perf.micro_perf.core.perf_engine import XpuPerfServer from xpu_perf.micro_perf.core.common_utils import logger, setup_logger from xpu_perf.micro_perf.core.common_utils import get_submodules, existing_dir_path, valid_file @@ -15,7 +17,6 @@ from flask import Flask, request, jsonify, Response, stream_with_context -FILE_DIR = pathlib.Path(__file__).parent.absolute() BYTE_MLPERF_ROOT = FILE_DIR OP_DEFS_DIR = BYTE_MLPERF_ROOT.joinpath("op_defs") diff --git a/src/xpu_perf/model_perf/utils.py b/src/xpu_perf/model_perf/utils.py index 0afc2c1c..436e2965 100644 --- a/src/xpu_perf/model_perf/utils.py +++ b/src/xpu_perf/model_perf/utils.py @@ -233,7 +233,7 @@ def from_parallel_config_dict( """ @dataclass class BenchTestCase: - ALLOWED_KEYS: ClassVar[[Tuple[str, ...]]] = ( + ALLOWED_KEYS: ClassVar[Tuple[str, ...]] = ( "batch_size", "cache_len", "q_len", From 63cf3aad305bff144c86047125f5f25d844873f8 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:08:17 +0800 Subject: [PATCH 2/6] Add DCU backend support Introduce BackendDCU and DCU backend entrypoint under src/ following existing backend structure. Signed-off-by: Zhang Jing --- .../micro_perf/backends/DCU/__init__.py | 1 + .../micro_perf/backends/DCU/backend_dcu.py | 280 ++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 src/xpu_perf/micro_perf/backends/DCU/__init__.py create mode 100644 src/xpu_perf/micro_perf/backends/DCU/backend_dcu.py diff --git a/src/xpu_perf/micro_perf/backends/DCU/__init__.py b/src/xpu_perf/micro_perf/backends/DCU/__init__.py new file mode 100644 index 00000000..682ebed7 --- /dev/null +++ b/src/xpu_perf/micro_perf/backends/DCU/__init__.py @@ -0,0 +1 @@ +from .backend_dcu import BackendDCU # noqa: E402 diff --git a/src/xpu_perf/micro_perf/backends/DCU/backend_dcu.py b/src/xpu_perf/micro_perf/backends/DCU/backend_dcu.py new file mode 100644 index 00000000..33f76c2d --- /dev/null +++ b/src/xpu_perf/micro_perf/backends/DCU/backend_dcu.py @@ -0,0 +1,280 @@ +import json +import os +import pathlib +import random +import shutil +import subprocess +import time +from pathlib import Path +from typing import List, Optional + +import torch +import torch.distributed as dist + +from xpu_perf.micro_perf.core.backend import Backend + + +class BackendDCU(Backend): + def __init__( + self, + backend, + env_file=None, + op_defs: Optional[pathlib.Path] = None, + vendor_ops: Optional[List[pathlib.Path]] = None, + **kwargs, + ): + if vendor_ops is None: + vendor_ops = [] + super().__init__( + backend=backend, + env_file=env_file, + op_defs=op_defs, + vendor_ops=vendor_ops, + **kwargs, + ) + + def get_backend_info(self): + info_dict = {} + device_name = torch.cuda.get_device_name(0) + info_dict["device_name"] = device_name + info_dict["device_count"] = torch.cuda.device_count() + device_properties = torch.cuda.get_device_properties(0) + info_dict["device_memory_mb"] = device_properties.total_memory / (1024**2) + backend_env = self.get_backend_env() + info_dict["torch_version"] = backend_env.get("torch", "") + info_dict["torch_cuda_version"] = getattr(torch.version, "hip", None) or getattr( + torch.version, "cuda", "" + ) + info_dict["dtk_version"] = backend_env.get("dtk_version", "") + info_dict["driver_version"] = backend_env.get("driver", "") + return info_dict + + def perf(self, op_instance): + """ + Keep upstream perf mechanism, but use legacy DCU iteration policy: + fixed 32 iterations (for stable profiling/measurement) with a longer target window. + """ + import math + import time + import traceback + + tensor_size = op_instance.tensor_size + avail_memory = self.get_mem_info()[0] + + assume_avail_bytes = int(avail_memory * 0.9) + assume_cache_size = 1 * (1024**3) + + latency_us = 0.0 + kernel_mapping = {} + + try: + min_test_iters = 32 + max_test_iters = 32 + max_test_time = 1e6 # 1s in us + max_data_cnt = 1 + if not op_instance.is_concurrent: + if tensor_size > assume_avail_bytes: + raise RuntimeError("Not enough memory to run the op") + elif 2 * tensor_size > assume_avail_bytes: + max_data_cnt = 1 + elif tensor_size > assume_cache_size: + max_data_cnt = 2 + else: + max_data_cnt = min( + math.floor(max(assume_avail_bytes, assume_cache_size) / tensor_size), + math.floor(assume_cache_size / tensor_size), + ) + + tensor_list = op_instance.create_tensors(max_data_cnt) + random.shuffle(tensor_list) + + latency_us, _ = self.core_perf(op_instance, 2, 2, tensor_list, profiling=False) + prefer_iters = min(max(math.ceil(max_test_time / latency_us), min_test_iters), max_test_iters) + + if op_instance.group_size > 1: + dist_module = self.get_dist_module() + prefer_iters_list = [None for _ in range(op_instance.group_size)] + dist_module.all_gather_object(prefer_iters_list, prefer_iters, group=op_instance.op_group) + prefer_iters = max(prefer_iters_list) + + time.sleep(0.2) + + # DCU policy: enable profiling by default, unless explicitly disabled per-case. + # This keeps upstream's vendor-controlled flag available as an override. + require_profiling = op_instance.args_dict.get("require_profiling", True) + op_instance.require_profiling = bool(require_profiling) + actual_profiling = self.enable_profiling and bool(require_profiling) + latency_us, kernel_mapping = self.core_perf( + op_instance, 2, prefer_iters, tensor_list, profiling=actual_profiling + ) + + del tensor_list + self.empty_cache() + except Exception: + traceback.print_exc() + + return op_instance.summary(latency_us, kernel_mapping) + + def clean_extra_files(self): + prof_dir = pathlib.Path.cwd().joinpath("profiling") + if prof_dir.exists() and not getattr(self, "keep_traces", False): + shutil.rmtree(prof_dir) + + def get_torch_device_name(self): + return "cuda" + + def get_device_name(self, index=0): + return torch.cuda.get_device_name(index) + + def get_device_properties(self, index=0): + return torch.cuda.get_device_properties(index) + + def get_mem_info(self, index=0): + total_memory = torch.cuda.get_device_properties(index).total_memory + allocated_memory = torch.cuda.memory_allocated(index) + free_memory = total_memory - allocated_memory + return (free_memory, total_memory) + + def get_device_count(self): + device_count = torch.cuda.device_count() + return device_count, list(range(device_count)) + + def set_device(self, device_index: int): + torch.cuda.set_device(device_index) + + def get_device(self): + return torch.cuda.current_device() + + def device_synchronize(self): + torch.cuda.synchronize() + + def empty_cache(self): + torch.cuda.empty_cache() + + def get_rocm_version(self): + hipcc_path = subprocess.run( + ["which", "hipcc"], stdout=subprocess.PIPE, text=True + ).stdout.strip() + if not hipcc_path: + return "N/A" + dtk_root = str(Path(hipcc_path).parent.parent) + version_path = os.path.join(dtk_root, ".info/rocm_version") + try: + with open(version_path, "r", encoding="utf-8") as f: + return f.read().strip() + except OSError: + return "N/A" + + def get_backend_env(self): + __torch_version = torch.__version__ + __dtk_version = self.get_rocm_version() + __driver_version = "" + rocm_smi = subprocess.run( + ["rocm-smi", "--showdriverversion"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + ) + if rocm_smi.returncode == 0: + for line in rocm_smi.stdout.split("\n"): + if "Driver Version" in line: + __driver_version = line.split(":", 1)[1].strip() + break + return { + "torch": __torch_version, + "dtk_version": __dtk_version, + "driver": __driver_version, + } + + def get_dist_module(self): + return dist + + def get_dist_backend(self): + return "nccl" + + def core_perf( + self, + op_instance, + warmup_iterations, + prefer_iterations, + tensor_list, + profiling=True, + ): + op_group = op_instance.op_group + group_size = op_instance.group_size + + if not op_instance.is_concurrent and profiling: + process_id = os.getpid() + prof_dir = pathlib.Path.cwd().joinpath("profiling", f"{process_id}") + prof_dir.mkdir(parents=True, exist_ok=True) + if getattr(self, "keep_traces", False): + trace_file = prof_dir.joinpath( + f"trace_{op_instance.__class__.__name__}_{int(time.time() * 1000)}.json" + ) + else: + trace_file = prof_dir.joinpath("trace.json") + + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=0, + warmup=warmup_iterations, + active=prefer_iterations, + repeat=1, + ), + on_trace_ready=lambda prof: prof.export_chrome_trace(str(trace_file)), + ) as prof: + for i in range(prefer_iterations + warmup_iterations): + op_instance.core_run(tensor_list[i % len(tensor_list)]) + self.device_synchronize() + prof.step() + + average_latency = 0.0 + kernel_latency_list = {} + if prof_dir.exists(): + json_files = list(prof_dir.glob("*.json")) + if json_files: + profiling_data = json.load(open(json_files[0], encoding="utf-8")) + for event in profiling_data.get("traceEvents", []): + if event.get("cat", None) in ["kernel", "gpu_memcpy"]: + kernel_name = event["name"] + kernel_latency = event["dur"] + kernel_latency_list.setdefault(kernel_name, []).append( + kernel_latency + ) + take_iters = prefer_iterations // 2 + iters_offset = prefer_iterations - take_iters + removed_keys = [] + for kernel in list(kernel_latency_list.keys()): + if len(kernel_latency_list[kernel]) != prefer_iterations: + removed_keys.append(kernel) + else: + average_latency += sum( + kernel_latency_list[kernel][iters_offset:] + ) + for k in removed_keys: + kernel_latency_list.pop(k, None) + if take_iters: + average_latency /= take_iters + if not getattr(self, "keep_traces", False): + try: + trace_file.unlink() + except OSError: + pass + return average_latency, list(kernel_latency_list.keys()) + + for i in range(warmup_iterations): + index = random.randint(0, len(tensor_list) - 1) + op_instance.core_run(tensor_list[index]) + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + self.device_synchronize() + self.op_group_barrier(op_group=op_group, group_size=group_size) + start_event.record() + for i in range(prefer_iterations): + op_instance.core_run(tensor_list[i % len(tensor_list)]) + end_event.record() + end_event.synchronize() + latency_us = start_event.elapsed_time(end_event) * 1e3 / prefer_iterations + return latency_us, [] From 1e3ea034006bab589334b0ab608bc3ea74269827 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:14:21 +0800 Subject: [PATCH 3/6] Add base op defs for moe_quant_group_gemm up/down Signed-off-by: Zhang Jing Co-authored-by: Zhu Fuzhu --- .../llm_ops/moe_quant_group_gemm_down.py | 67 +++++++++++++++++++ .../llm_ops/moe_quant_group_gemm_up.py | 67 +++++++++++++++++++ .../xpu_oj/llm_sim/model_zoo/op_templates.py | 3 + 3 files changed, 137 insertions(+) create mode 100644 projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_down.py create mode 100644 projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_up.py diff --git a/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_down.py b/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_down.py new file mode 100644 index 00000000..c59fedd8 --- /dev/null +++ b/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_down.py @@ -0,0 +1,67 @@ +"""LLM op: moe_quant_group_gemm_down (base definition). + +This exists for compatibility with legacy split-op vendor implementations. +The semantic definition is intentionally identical to `moe_quant_group_gemm`; +vendor providers may use different kernel/packing for up/down. +""" + +from ._common import * + + +@ProviderRegistry.register_base_impl("moe_quant_group_gemm_down", "ComputeEngine") +class MoeQuantGroupGemmDownOp(BasicOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + def prepare_args(self): + self.arg_type = self.args_dict["arg_type"] + if self.arg_type not in ["llm"]: + raise ValueError( + f"{type(self).__name__} only supports llm arg_type, but got {self.arg_type}" + ) + + # predefined attrs + self.num_tokens = self.args_dict["num_tokens"] + self.hidden_size = self.args_dict["hidden_size"] + self.new_hidden_size = self.args_dict["new_hidden_size"] + + # moe info + self.num_experts = self.args_dict["num_experts"] + self.topk = self.args_dict["topk"] + + # parallel info + self.ep_size = self.args_dict.get("ep_size", 1) + self.ep_rank = self.args_dict.get("ep_rank", 0) + + # get moe token dispatch info + ( + self.num_scatter_tokens, + self.num_scatter_tokens_per_rank, + self.num_experts_per_rank, + self.experts_start_idx, + self.experts_end_idx, + self.all_select_experts, + self.all_select_weights, + self.dispatch_tokens, + self.used_src_tokens, + self.expert_dispatch_tokens, + self.expert_dispatch_weights, + self.scatter_token_id, + self.scatter_token_weight, + self.expert_dispatch_token_count, + self.expert_dispatch_token_offset, + ) = get_moe_tokens_info( + self.num_tokens, + self.num_experts, + self.topk, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + ) + + # dtype tuple + self.dtype = self.args_dict.get("dtype", "int8") + self.w_dtype = self.args_dict.get("w_dtype", "int8") + self.compute_dtype = self.args_dict.get("compute_dtype", "int8") + self.dst_dtype = self.args_dict.get("dst_dtype", "bfloat16") + + diff --git a/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_up.py b/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_up.py new file mode 100644 index 00000000..9b714a39 --- /dev/null +++ b/projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_up.py @@ -0,0 +1,67 @@ +"""LLM op: moe_quant_group_gemm_up (base definition). + +This exists for compatibility with legacy split-op vendor implementations. +The semantic definition is intentionally identical to `moe_quant_group_gemm`; +vendor providers may use different kernel/packing for up/down. +""" + +from ._common import * + + +@ProviderRegistry.register_base_impl("moe_quant_group_gemm_up", "ComputeEngine") +class MoeQuantGroupGemmUpOp(BasicOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + def prepare_args(self): + self.arg_type = self.args_dict["arg_type"] + if self.arg_type not in ["llm"]: + raise ValueError( + f"{type(self).__name__} only supports llm arg_type, but got {self.arg_type}" + ) + + # predefined attrs + self.num_tokens = self.args_dict["num_tokens"] + self.hidden_size = self.args_dict["hidden_size"] + self.new_hidden_size = self.args_dict["new_hidden_size"] + + # moe info + self.num_experts = self.args_dict["num_experts"] + self.topk = self.args_dict["topk"] + + # parallel info + self.ep_size = self.args_dict.get("ep_size", 1) + self.ep_rank = self.args_dict.get("ep_rank", 0) + + # get moe token dispatch info + ( + self.num_scatter_tokens, + self.num_scatter_tokens_per_rank, + self.num_experts_per_rank, + self.experts_start_idx, + self.experts_end_idx, + self.all_select_experts, + self.all_select_weights, + self.dispatch_tokens, + self.used_src_tokens, + self.expert_dispatch_tokens, + self.expert_dispatch_weights, + self.scatter_token_id, + self.scatter_token_weight, + self.expert_dispatch_token_count, + self.expert_dispatch_token_offset, + ) = get_moe_tokens_info( + self.num_tokens, + self.num_experts, + self.topk, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + ) + + # dtype tuple + self.dtype = self.args_dict.get("dtype", "int8") + self.w_dtype = self.args_dict.get("w_dtype", "int8") + self.compute_dtype = self.args_dict.get("compute_dtype", "int8") + self.dst_dtype = self.args_dict.get("dst_dtype", "bfloat16") + + diff --git a/projects/xpu_oj/llm_sim/model_zoo/op_templates.py b/projects/xpu_oj/llm_sim/model_zoo/op_templates.py index 62d49ffe..570a9e4c 100644 --- a/projects/xpu_oj/llm_sim/model_zoo/op_templates.py +++ b/projects/xpu_oj/llm_sim/model_zoo/op_templates.py @@ -77,6 +77,9 @@ def mode_bs_cache_q_set_template( "moe_gating_gemm": num_tokens_set_template, "quant_matmul": num_tokens_set_template, "moe_quant_group_gemm": num_tokens_set_template, + # Compatibility split-ops for DCU/lightop tp-ep templates. + "moe_quant_group_gemm_up": num_tokens_set_template, + "moe_quant_group_gemm_down": num_tokens_set_template, "moe_quant_group_gemm_combine": num_tokens_set_template, "quant_group_gemm_reduce_sum": num_tokens_set_template, From 17fd88b25f56bb65b1e164945c94249e0a0a5574 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:09:43 +0800 Subject: [PATCH 4/6] DCU vendor ops: add providers and implementations Add DCU vendor provider packages (lightop/custom_ops/lmslim/flash_attn/torch) and their op implementations. Fix vendor ops to follow upstream lifecycle (vendor_parser/vendor_impl/vendor_impl_run) and ensure correct tensor mapping/output allocation where required. Signed-off-by: Zhang Jing Co-authored-by: Zhu Fuzhu Co-authored-by: Wang Sen Co-authored-by: Zhuang Luo --- .../vendor_ops/DCU/ops/custom_ops/__init__.py | 12 + .../DCU/ops/custom_ops/add_rms_norm.py | 32 ++ .../custom_ops/add_rms_norm_dynamic_quant.py | 48 +++ .../custom_ops/moe_scatter_dynamic_quant.py | 78 ++++ .../DCU/ops/custom_ops/rotary_embedding.py | 53 +++ .../DCU/ops/custom_ops/store_kv_cache.py | 309 ++++++++++++++++ .../ops/custom_ops/swiglu_dynamic_quant.py | 44 +++ .../vendor_ops/DCU/ops/flash_attn/__init__.py | 35 ++ .../DCU/ops/flash_attn/flash_attention.py | 151 ++++++++ .../vendor_ops/DCU/ops/lightop/__init__.py | 12 + .../DCU/ops/lightop/add_rms_norm.py | 31 ++ .../ops/lightop/add_rms_norm_dynamic_quant.py | 50 +++ .../vendor_ops/DCU/ops/lightop/moe_gather.py | 349 ++++++++++++++++++ .../DCU/ops/lightop/moe_quant_group_gemm.py | 187 ++++++++++ .../ops/lightop/moe_quant_group_gemm_down.py | 14 + .../ops/lightop/moe_quant_group_gemm_up.py | 14 + .../DCU/ops/lightop/moe_softmax_topk.py | 47 +++ .../ops/lightop/moe_swiglu_dynamic_quant.py | 115 ++++++ .../vendor_ops/DCU/ops/lightop/qk_rms_norm.py | 26 ++ .../DCU/ops/lightop/rotary_embedding.py | 54 +++ .../vendor_ops/DCU/ops/lmslim/__init__.py | 12 + .../vendor_ops/DCU/ops/lmslim/quant_matmul.py | 44 +++ .../vendor_ops/DCU/ops/torch/__init__.py | 1 + .../vendor_ops/DCU/ops/torch/all_reduce.py | 9 + .../DCU/ops/torch/moe_gating_gemm.py | 55 +++ .../DCU/ops/torch/swiglu_dynamic_quant.py | 5 + 26 files changed, 1787 insertions(+) create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/__init__.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm_dynamic_quant.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/moe_scatter_dynamic_quant.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/rotary_embedding.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/store_kv_cache.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/custom_ops/swiglu_dynamic_quant.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/flash_attn/__init__.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/flash_attn/flash_attention.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/__init__.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm_dynamic_quant.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_gather.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_down.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_up.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_softmax_topk.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_swiglu_dynamic_quant.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/qk_rms_norm.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lightop/rotary_embedding.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lmslim/__init__.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/lmslim/quant_matmul.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/torch/__init__.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/torch/all_reduce.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/torch/moe_gating_gemm.py create mode 100644 projects/micro_perf/vendor_ops/DCU/ops/torch/swiglu_dynamic_quant.py diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/__init__.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/__init__.py new file mode 100644 index 00000000..386231e9 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/__init__.py @@ -0,0 +1,12 @@ +import importlib.metadata + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +PROVIDER_NAME = "dcu_custom_ops" + +try: + ProviderRegistry.register_provider_info( + "custom_ops", {"custom_ops": importlib.metadata.version("custom_ops")} + ) +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm.py new file mode 100644 index 00000000..24ab0a19 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm.py @@ -0,0 +1,32 @@ +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.add_rms_norm import AddRmsNormOp +from xpu_perf.micro_perf.core.utils import calc_tensor_size + +try: + from custom_ops import addrmsnorm + @ProviderRegistry.register_vendor_impl("add_rms_norm", "custom_ops") + class CustomopsAddRMSNormop(AddRmsNormOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["custom_ops"] + + def vendor_impl(self): + # Keep base semantic tensor definitions, only swap run function. + super().vendor_impl() + if "output" in self.output_tensor_info: + self.write_bytes = calc_tensor_size(self.output_tensor_info["output"]) + self.io_bytes = self.read_bytes + self.write_bytes + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + src = tensor_mapping["hidden_states"] + weight = tensor_mapping["norm_weight"] + residual = tensor_mapping["residual"] + + dst = addrmsnorm(src, residual, weight, self.eps) + + return dst + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm_dynamic_quant.py new file mode 100644 index 00000000..623cb473 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm_dynamic_quant.py @@ -0,0 +1,48 @@ +from functools import partial + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.add_rms_norm_dynamic_quant import AddRmsNormDynamicQuantOp + +try: + from custom_ops import addrmsnormdynamicquant + @ProviderRegistry.register_vendor_impl("add_rms_norm_dynamic_quant", "custom_ops") + class CustomopsAddRMSNormDynamicQuantOp(AddRmsNormDynamicQuantOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["custom_ops"] + + def vendor_impl(self): + # custom_ops kernel writes into preallocated output tensors, so we must + # create outputs in the tensor mapping (base impl uses create_outputs=False). + super().vendor_impl() + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True, + ) + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + + src = tensor_mapping["hidden_states"] + weight = tensor_mapping["norm_weight"] + smoothScale = tensor_mapping["smooth_scale"] + residual = tensor_mapping["residual"] + per_token_scale = tensor_mapping["per_token_scale"] + dst = tensor_mapping["quant_tokens"] + + if self.output_mode == "none": + addrmsnormdynamicquant(src,weight,smoothScale,residual,dst,per_token_scale,None,None,0,self.eps) + return dst, per_token_scale + elif self.output_mode == "res": + after_res = tensor_mapping["after_res"] + addrmsnormdynamicquant(src,weight,smoothScale,residual,dst,per_token_scale,after_res,None,1,self.eps) + return dst, per_token_scale, after_res + elif self.output_mode == "norm": + after_norm = tensor_mapping["after_norm"] + addrmsnormdynamicquant(src,weight,smoothScale,residual,dst,per_token_scale,None,after_norm,2,self.eps) + return dst, per_token_scale, after_norm + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/moe_scatter_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/moe_scatter_dynamic_quant.py new file mode 100644 index 00000000..d6b09729 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/moe_scatter_dynamic_quant.py @@ -0,0 +1,78 @@ +from functools import partial +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.moe_scatter_dynamic_quant import MoeScatterDynamicQuantOp +from xpu_perf.micro_perf.core.utils import static_quant + +try: + from custom_ops import moe_scatter_dynamic_quant + + @ProviderRegistry.register_vendor_impl("moe_scatter_dynamic_quant", "custom_ops") + class CustomOpsMoeScatterDynamicQuantOp(MoeScatterDynamicQuantOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["custom_ops"] + + def vendor_impl(self): + # Keep base semantic tensor definitions, only swap run function. + super().vendor_impl() + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + # get pre-allocated input tensors + hidden_states = tensor_mapping["hidden_states"] + experts_smooth_scale = tensor_mapping["experts_smooth_scale"] + selected_experts = tensor_mapping["selected_experts"] + moe_weights = tensor_mapping["moe_weights"] + + # get pre-allocated output tensors + scatter_tokens = tensor_mapping["scatter_tokens"] + scatter_per_token_scale = tensor_mapping["scatter_per_token_scale"] + + # For ease of reference in code demonstration, + # all the following tensors are precomputed. + # Vendors are required to implement the corresponding computation logic during integration. + scatter_token_id = tensor_mapping["scatter_token_id"] + scatter_token_weight = tensor_mapping["scatter_token_weight"] + experts_token_count = tensor_mapping["experts_token_count"] + experts_token_offset = tensor_mapping["experts_token_offset"] + + #import traceback + #traceback.print_stack() + + if experts_smooth_scale.shape[0] == self.num_experts: + experts_smooth_scale_per_rank = experts_smooth_scale[self.experts_start_idx:self.experts_end_idx] + else: + experts_smooth_scale_per_rank = experts_smooth_scale + + result = moe_scatter_dynamic_quant( + hidden_states=hidden_states, + experts_smooth_scale=experts_smooth_scale_per_rank, + selected_experts=selected_experts, + moe_weights=moe_weights, + scatter_tokens=scatter_tokens, + scatter_per_token_scale=scatter_per_token_scale, + scatter_token_id=scatter_token_id, + scatter_token_weight=scatter_token_weight, + experts_token_count=experts_token_count, + experts_token_offset=experts_token_offset, + topk=self.topk, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + dst_dtype=self.dst_torch_dtype, + balanced=True, + ) + + if isinstance(result, tuple) and len(result) == 6: + scatter_tokens, scatter_per_token_scale, \ + scatter_token_id, scatter_token_weight, \ + experts_token_count, experts_token_offset = result + + return scatter_tokens, scatter_per_token_scale, \ + scatter_token_id, scatter_token_weight, \ + experts_token_count, experts_token_offset + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/rotary_embedding.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/rotary_embedding.py new file mode 100644 index 00000000..760d5f8e --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/rotary_embedding.py @@ -0,0 +1,53 @@ +from itertools import chain +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.rotary_embedding import RotaryEmbeddingOp + +try: + # from vllm import _custom_ops as ops + from custom_ops import rotary_embedding + + @ProviderRegistry.register_vendor_impl("rotary_embedding", "custom_ops") + class LightopRotaryEmbeddingOp(RotaryEmbeddingOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["custom_ops"] + self.require_profiling = True + + def vendor_impl_run(self, tensor_mapping): + packed_qkv = tensor_mapping["packed_qkv"] + q_lens = tensor_mapping["q_lens"] + accum_q_lens = tensor_mapping["accum_q_lens"] + cache_lens = tensor_mapping["cache_lens"] + cos = tensor_mapping["cos"] + sin = tensor_mapping["sin"] + + cos_sin_cache = torch.cat([cos, sin], dim=-1).contiguous() + + dim_start = self.rope_offset + dim_end = self.rope_offset + self.rope_dim + + positions = getattr(self, "positions", None) + if positions is None or positions.numel() != self.num_tokens: + # Use self.cache_lens / self.q_lens to avoid indexing GPU cache_lens (can trigger sync). + positions_list = [ + self.cache_lens[b] + j + for b in range(self.batch_size) + for j in range(self.q_lens[b]) + ] + positions = torch.tensor(positions_list, dtype=torch.int64, device=packed_qkv.device) + + q,k = rotary_embedding( + positions=positions, + query=packed_qkv[:, :self.q_head_num, dim_start:dim_end].view(packed_qkv.size(0), -1), + key=packed_qkv[:, self.q_head_num:self.q_head_num + self.kv_head_num, dim_start:dim_end].view(packed_qkv.size(0), -1), + cos_sin_cache=cos_sin_cache, + head_size=self.rope_dim, + q_head_num=self.q_head_num, + kv_head_num=self.kv_head_num, + is_neox=1, + ) + return packed_qkv +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/store_kv_cache.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/store_kv_cache.py new file mode 100644 index 00000000..b6018c7e --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/store_kv_cache.py @@ -0,0 +1,309 @@ +from functools import partial +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.store_kv_cache import StoreKVCacheOp +from xpu_perf.micro_perf.core.utils import get_torch_dtype, static_quant +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size, get_torch_dtype, get_torch_dtype_size + +try: + from custom_ops import store_kv_cache as _store_kv_cache + + @ProviderRegistry.register_vendor_impl("store_kv_cache", "custom_ops") + class CustomopStoreKVCacheOp(StoreKVCacheOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["custom_ops"] + + def _run_paged_fallback(self, tensor_mapping): + """ + First paged version in Python (correctness-first). + Custom HIP kernel path is still linear-only. + """ + packed_qkv = tensor_mapping["packed_qkv"] + q_lens = tensor_mapping["q_lens"] + accum_q_lens = tensor_mapping["accum_q_lens"] + cache_lens = tensor_mapping["cache_lens"] + block_table = tensor_mapping["block_table"] + k_cache = tensor_mapping["k_cache"] + v_cache = tensor_mapping["v_cache"] + k_scale = tensor_mapping.get("k_scale", None) + v_scale = tensor_mapping.get("v_scale", None) + + k_head_start = self.q_head_num + k_head_end = self.q_head_num + self.kv_head_num + v_head_start = self.q_head_num + self.kv_head_num + v_head_end = self.q_head_num + self.kv_head_num * 2 + + # Prefer runtime tensor values so generated inputs and attrs stay consistent. + for batch_idx in range(self.batch_size): + q_len = int(q_lens[batch_idx].item()) + if q_len <= 0: + continue + q_offset = int(accum_q_lens[batch_idx].item()) + cache_len = int(cache_lens[batch_idx].item()) + + src_k = packed_qkv[q_offset:q_offset + q_len, k_head_start:k_head_end, :] + src_v = packed_qkv[q_offset:q_offset + q_len, v_head_start:v_head_end, :] + + if self.use_quant: + src_k = static_quant(src_k, k_scale, self.cache_torch_dtype) + src_v = static_quant(src_v, v_scale, self.cache_torch_dtype) + + # [q_len, kv_head_num, head_dim] -> [kv_head_num, q_len, head_dim] + src_k = src_k.contiguous().transpose(0, 1) + src_v = src_v.contiguous().transpose(0, 1) + + for t in range(q_len): + token_pos = cache_len + t + block_idx = token_pos // self.block_size + offset_in_block = token_pos % self.block_size + physical_block = int(block_table[batch_idx, block_idx].item()) + if physical_block < 0: + continue + k_cache[physical_block, :, offset_in_block, :].copy_(src_k[:, t, :]) + v_cache[physical_block, :, offset_in_block, :].copy_(src_v[:, t, :]) + + return k_cache, v_cache + + def vendor_impl(self): + self.torch_dtype = get_torch_dtype(self.dtype) + self.cache_torch_dtype = get_torch_dtype(self.cache_dtype) + + self.input_tensor_info = {} + self.output_tensor_info = {} + + """ + Input QKV in packed / unsplitted layout. + """ + self.input_tensor_info["packed_qkv"] = OpTensorInfo( + shape=[self.num_tokens, self.total_head_num, self.head_dim], + dtype=self.torch_dtype, + device=self.backend.get_torch_device_name(), + ) + + + """ + Build tensors describing how current num_tokens is composed (q/cache/kv lens metadata). + """ + self.attn_info_tensors = { + "q_lens": OpTensorInfo( + shape=[self.batch_size], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.q_lens, dtype=dtype, device=device) + ), + "cache_lens": OpTensorInfo( + shape=[self.batch_size], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.cache_lens, dtype=dtype, device=device) + ), + "kv_lens": OpTensorInfo( + shape=[self.batch_size], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.kv_lens, dtype=dtype, device=device) + ), + "accum_q_lens": OpTensorInfo( + shape=[self.batch_size + 1], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.accum_q_lens, dtype=dtype, device=device) + ), + "accum_cache_lens": OpTensorInfo( + shape=[self.batch_size + 1], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.accum_cache_lens, dtype=dtype, device=device) + ), + "accum_kv_lens": OpTensorInfo( + shape=[self.batch_size + 1], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor(self.accum_kv_lens, dtype=dtype, device=device) + ), + } + self.input_tensor_info.update(self.attn_info_tensors) + + + """ + KV cache tensors; linear vs paged is determined by block_size / cache_type. + """ + if self.cache_type == "linear": + self.input_tensor_info["slot_mapping"] = OpTensorInfo( + shape=[self.batch_size], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: \ + torch.tensor(self.slot_mapping, dtype=dtype, device=device) + ) + cache_shape = [self.batch_size, self.kv_head_num, self.max_kv_len, self.head_dim] + elif self.cache_type == "paged": + self.input_tensor_info["block_table"] = OpTensorInfo( + shape=[self.target_batch_size, self.target_per_seq_num_block], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: \ + torch.tensor(self.block_table, dtype=dtype, device=device) + ) + cache_shape = [self.total_cache_blocks, self.kv_head_num, self.block_size, self.head_dim] + self.input_tensor_info["k_cache"] = OpTensorInfo( + shape=cache_shape, + dtype=self.cache_torch_dtype, + device=self.backend.get_torch_device_name(), + creator=torch.empty + ) + self.input_tensor_info["v_cache"] = OpTensorInfo( + shape=cache_shape, + dtype=self.cache_torch_dtype, + device=self.backend.get_torch_device_name(), + creator=torch.empty + ) + + """ + Quantization parameters (only if kv cache is quantized). + """ + if self.use_quant: + quant_scale_shape = [self.kv_head_num, self.head_dim] + self.input_tensor_info["k_scale"] = OpTensorInfo( + shape=quant_scale_shape, + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + creator=torch.ones + ) + self.input_tensor_info["v_scale"] = OpTensorInfo( + shape=quant_scale_shape, + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + creator=torch.ones + ) + + # calculator + self.input_tensor_size = sum( + [calc_tensor_size(info) for info in self.input_tensor_info.values()] + ) + self.output_tensor_size = sum([calc_tensor_size(info) for info in self.output_tensor_info.values()]) + self.tensor_size = self.input_tensor_size + self.output_tensor_size + + """ + Bandwidth accounting (aligned with bytemlperf semantics): + - read_bytes: K/V components read from packed_qkv + indexing/length metadata + quant scales. + Do NOT count the entire k/v cache as a "full read" (avoids double counting / omissions when + mixed with tensor_size). + - write_bytes: bytes actually written to k_cache / v_cache (scaled for linear/paged). Previously + output_tensor_info was empty, which made write_bytes=0 and systematically under-reported mem_bw. + """ + pq = self.input_tensor_info["packed_qkv"] + self.read_bytes = ( + calc_tensor_size(pq) / self.total_head_num * (2 * self.kv_head_num) + + calc_tensor_size(self.input_tensor_info["q_lens"]) + + calc_tensor_size(self.input_tensor_info["cache_lens"]) + + calc_tensor_size(self.input_tensor_info["accum_q_lens"]) + + calc_tensor_size(self.input_tensor_info["kv_lens"]) + + calc_tensor_size(self.input_tensor_info["accum_cache_lens"]) + + calc_tensor_size(self.input_tensor_info["accum_kv_lens"]) + ) + if self.cache_type == "linear": + self.read_bytes += calc_tensor_size(self.input_tensor_info["slot_mapping"]) + elif self.cache_type == "paged": + self.read_bytes += calc_tensor_size(self.input_tensor_info["block_table"]) + + if self.use_quant: + self.read_bytes += ( + calc_tensor_size(self.input_tensor_info["k_scale"]) + + calc_tensor_size(self.input_tensor_info["v_scale"]) + ) + + sz_k = calc_tensor_size(self.input_tensor_info["k_cache"]) + sz_v = calc_tensor_size(self.input_tensor_info["v_cache"]) + if self.cache_type == "linear": + self.write_bytes = ( + sz_k / self.batch_size / self.max_kv_len * self.num_tokens + + sz_v / self.batch_size / self.max_kv_len * self.num_tokens + ) + elif self.cache_type == "paged": + self.write_bytes = ( + sz_k / self.num_kv_blocks / self.block_size * self.num_tokens + + sz_v / self.num_kv_blocks / self.block_size * self.num_tokens + + calc_tensor_size(self.input_tensor_info["block_table"]) + / self.batch_size + / self.max_block_num_per_seq + * self.num_q_blocks + ) + + self.io_bytes = self.read_bytes + self.write_bytes + + + # creator func + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=False + ) + + # run func + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + # get pre-allocated input tensors + packed_qkv = tensor_mapping["packed_qkv"] + k_cache = tensor_mapping["k_cache"] + v_cache = tensor_mapping["v_cache"] + + use_paged = self.cache_type == "paged" + block_size = self.block_size if use_paged else 0 + + if self.cache_type == "linear": + slot_mapping = tensor_mapping["slot_mapping"] + block_table = None + elif self.cache_type == "paged": + block_table = tensor_mapping["block_table"] + slot_mapping = None + + try: + _store_kv_cache( + packed_qkv, + k_cache, + v_cache, + tensor_mapping["q_lens"], + tensor_mapping["accum_q_lens"], + tensor_mapping["cache_lens"], + slot_mapping, + block_table, + use_paged, + block_size, + tensor_mapping.get("k_scale"), + tensor_mapping.get("v_scale"), + use_quant=self.use_quant, + q_head_num=self.q_head_num, + kv_head_num=self.kv_head_num, + head_dim=self.head_dim, + total_head_num=self.total_head_num, + max_kv_len=self.max_kv_len, + ) + except TypeError: + # Backward compatibility with older extension signature. + if use_paged: + return self._run_paged_fallback(tensor_mapping) + _store_kv_cache( + packed_qkv, + k_cache, + v_cache, + tensor_mapping["q_lens"], + tensor_mapping["accum_q_lens"], + tensor_mapping["cache_lens"], + slot_mapping, + tensor_mapping.get("k_scale"), + tensor_mapping.get("v_scale"), + use_quant=self.use_quant, + q_head_num=self.q_head_num, + kv_head_num=self.kv_head_num, + head_dim=self.head_dim, + total_head_num=self.total_head_num, + max_kv_len=self.max_kv_len, + ) + return k_cache, v_cache +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/swiglu_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/swiglu_dynamic_quant.py new file mode 100644 index 00000000..d5eac018 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/custom_ops/swiglu_dynamic_quant.py @@ -0,0 +1,44 @@ +from functools import partial + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.swiglu_dynamic_quant import SwigluDynamicQuantOp + +try: + from custom_ops import swiglu_dynamic_quant + + @ProviderRegistry.register_vendor_impl("swiglu_dynamic_quant", "custom_ops") + class CustomOpsSwigluDynamicQuantOp(SwigluDynamicQuantOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["custom_ops"] + + def vendor_impl(self): + # custom_ops kernel writes into preallocated output tensors, so we must + # create outputs in the tensor mapping (base impl uses create_outputs=False). + super().vendor_impl() + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True, + ) + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + # get pre-allocated input tensors + hidden_states = tensor_mapping["hidden_states"] + smooth_scale = tensor_mapping["smooth_scale"] + quant_tokens = tensor_mapping["quant_tokens"] + per_token_scale = tensor_mapping["per_token_scale"] + + quant_tokens, per_token_scale = swiglu_dynamic_quant( + hidden_states, + smooth_scale, + quant_tokens, + per_token_scale, + self.num_tokens, + self.hidden_size) + return quant_tokens, per_token_scale + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/__init__.py b/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/__init__.py new file mode 100644 index 00000000..9ee82745 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/__init__.py @@ -0,0 +1,35 @@ +import importlib.metadata + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +PROVIDER_NAME = "dcu_flash_attn" + +# Mirror GPU vendor + DCU environment package probes +try: + import flash_attn # noqa: F401 + + ProviderRegistry.register_provider_info( + "flash_attn_v2", {"flash_attn": importlib.metadata.version("flash_attn")} + ) +except Exception: + pass +try: + import flash_attn_interface # noqa: F401 + + ProviderRegistry.register_provider_info( + "flash_attn_v3", {"flash_attn": importlib.metadata.version("flash_attn")} + ) +except Exception: + pass +try: + ProviderRegistry.register_provider_info( + "vllm", {"vllm": importlib.metadata.version("vllm")} + ) +except Exception: + pass +try: + ProviderRegistry.register_provider_info( + "flashinfer", {"flashinfer": importlib.metadata.version("flashinfer-python")} + ) +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/flash_attention.py b/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/flash_attention.py new file mode 100644 index 00000000..8f20373d --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/flash_attn/flash_attention.py @@ -0,0 +1,151 @@ +from functools import partial +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.flash_attention import FlashAttentionOp +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size + + + +try: + from flash_attn import flash_attn_func, flash_attn_with_kvcache + + # https://github.com/Dao-AILab/flash-attention + @ProviderRegistry.register_vendor_impl("flash_attention", "fa2") + class FA2Op(FlashAttentionOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + def vendor_parser(self): + super().vendor_parser() + + _allowed_dtypes = ("bfloat16", "float16") + _allowed_block_sizes = (0, 64) + block_size = self.args_dict.get("block_size", 0) + if block_size not in _allowed_block_sizes: + raise ValueError( + f"{type(self).__name__} only supports block_size in {_allowed_block_sizes}, got {block_size}." + ) + + if self.attn_mode == "prefill": + if all(d in _allowed_dtypes for d in ( + self.dtype, self.dst_dtype, self.cache_dtype, + self.qk_compute_dtype, self.pv_compute_dtype + )): + pass + else: + raise ValueError( + f"{type(self).__name__} prefill not support this combination." + ) + + elif self.attn_mode == "decode": + if all(d in _allowed_dtypes for d in ( + self.dtype, self.dst_dtype, self.cache_dtype, + self.qk_compute_dtype, self.pv_compute_dtype + )): + pass + else: + raise ValueError( + f"{type(self).__name__} decode not support this combination." + ) + if self.cache_type == "linear": + kv_lens_set = set(self.kv_lens) + if len(kv_lens_set) != 1: + raise ValueError( + f"{type(self).__name__} decode linear cache requires all kv_lens equal, got {self.kv_lens}." + ) + q_lens_set = set(self.q_lens) + if len(q_lens_set) != 1: + raise ValueError( + f"{type(self).__name__} decode only support q_lens == q_lens[0]." + ) + + else: + raise ValueError( + f"{type(self).__name__} not support this attn_mode: {self.attn_mode}." + ) + + def vendor_impl(self): + super().vendor_impl() + self._run_func = self.vendor_impl_run + + + def vendor_impl_run(self, tensor_mapping): + if self.attn_mode == "prefill": + return self.prefill_run(tensor_mapping) + if self.attn_mode == "decode": + return self.decode_run(tensor_mapping) + raise ValueError( + f"{type(self).__name__} not support this attn_mode: {self.attn_mode}." + ) + + + + def prefill_run(self, tensor_mapping): + q = tensor_mapping["q"].view(self.batch_size, self.num_tokens, self.q_head_num, self.head_dim) + kv_len = self.kv_lens[0] if self.batch_size == 1 else max(self.kv_lens) + k_raw = tensor_mapping["k_cache"] + v_raw = tensor_mapping["v_cache"] + + if self.cache_type == "linear": + k_cache = k_raw[:, :, :kv_len, :].permute(0, 2, 1, 3) + v_cache = v_raw[:, :, :kv_len, :].permute(0, 2, 1, 3) + else: + # paged: k_raw [total_blocks, kv_head_num, block_size, head_dim], reassemble by block_table into + # [1, kv_len, kv_head_num, head_dim] + block_table = tensor_mapping["block_table"] + num_blocks = (kv_len + self.block_size - 1) // self.block_size + k_parts, v_parts = [], [] + for b in range(num_blocks): + phys = block_table[0, b].item() + start = b * self.block_size + length = min(self.block_size, kv_len - start) + k_parts.append(k_raw[phys, :, :length, :]) + v_parts.append(v_raw[phys, :, :length, :]) + k_cat = torch.cat(k_parts, dim=1) + v_cat = torch.cat(v_parts, dim=1) + k_cache = k_cat.permute(1, 0, 2).unsqueeze(0) + v_cache = v_cat.permute(1, 0, 2).unsqueeze(0) + + out = flash_attn_func( + q, k_cache, v_cache, + causal=self.is_causal + ) + return out + + + def decode_run(self, tensor_mapping): + q = tensor_mapping["q"].view(self.batch_size, self.max_q_len, self.q_head_num, self.head_dim) + k_raw = tensor_mapping["k_cache"] + v_raw = tensor_mapping["v_cache"] + kv_lens = tensor_mapping["kv_lens"] + + if self.cache_type == "linear": + # linear: k_cache [batch_size, kv_head_num, max_kv_len, head_dim] -> flash_attn_func expects [B, kv_len, H, D] + kv_len = int(kv_lens[0].item()) + k_cache = k_raw[:, :, :kv_len, :].permute(0, 2, 1, 3) + v_cache = v_raw[:, :, :kv_len, :].permute(0, 2, 1, 3) + out = flash_attn_func(q, k_cache, v_cache, causal=self.is_causal) + else: + # paged: k_cache [total_blocks, kv_head_num, block_size, head_dim] -> [total_blocks, block_size, kv_head_num, head_dim] + k_cache = k_raw.permute(0, 2, 1, 3) + v_cache = v_raw.permute(0, 2, 1, 3) + block_table = tensor_mapping["block_table"] + out = flash_attn_with_kvcache( + q, k_cache, v_cache, + cache_seqlens=kv_lens, cache_batch_idx=None, + block_table=block_table, causal=self.is_causal + ) + return out + +except: + pass + +# Only enable FA2; keep FA3 disabled (FA2 is sufficient for current use). +# try: +# from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_with_kvcache +# @ProviderRegistry.register_vendor_impl("flash_attention", "fa3") +# class FA3Op(FA2Op): +# ... +# except: +# pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/__init__.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/__init__.py new file mode 100644 index 00000000..449b5937 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/__init__.py @@ -0,0 +1,12 @@ +import importlib.metadata + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +PROVIDER_NAME = "dcu_lightop" + +try: + ProviderRegistry.register_provider_info( + "lightop", {"lightop": importlib.metadata.version("lightop")} + ) +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm.py new file mode 100644 index 00000000..35bcf638 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm.py @@ -0,0 +1,31 @@ +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.add_rms_norm import AddRmsNormOp +from xpu_perf.micro_perf.core.utils import calc_tensor_size + +try: + from lightop import op + @ProviderRegistry.register_vendor_impl("add_rms_norm", "lightop") + class LightopAddRMSNormop(AddRmsNormOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["lightop"] + + def vendor_impl(self): + # Keep base tensor semantics; adjust IO bookkeeping to match legacy behavior. + super().vendor_impl() + if "output" in self.output_tensor_info: + self.write_bytes = calc_tensor_size(self.output_tensor_info["output"]) + self.io_bytes = self.read_bytes + self.write_bytes + + def add_rms_norm_run(self, tensor_mapping): + src = tensor_mapping["hidden_states"] + weight = tensor_mapping["norm_weight"] + residual = tensor_mapping["residual"] + + dst = op.fused_add_rms_norm_opt(src, residual, weight, self.eps) + + return dst + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm_dynamic_quant.py new file mode 100644 index 00000000..aba59db7 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/add_rms_norm_dynamic_quant.py @@ -0,0 +1,50 @@ +from functools import partial + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.add_rms_norm_dynamic_quant import AddRmsNormDynamicQuantOp + +try: + from lightop import op + @ProviderRegistry.register_vendor_impl("add_rms_norm_dynamic_quant", "lightop") + class LightopAddRMSNormDynamicQuantOp(AddRmsNormDynamicQuantOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["lightop"] + + def vendor_impl(self): + # lightop kernel writes into preallocated output tensors, so we must + # create outputs in the tensor mapping (base impl uses create_outputs=False). + super().vendor_impl() + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True, + ) + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + + src = tensor_mapping["hidden_states"] + weight = tensor_mapping["norm_weight"] + smoothScale = tensor_mapping["smooth_scale"] + residual = tensor_mapping["residual"] + per_token_scale = tensor_mapping["per_token_scale"] + dst = tensor_mapping["quant_tokens"] + + # after_res, dst, per_token_scale = op.miopen_add_rms_norm_dynamic_quant(src, residual, weight, smoothScale, self.eps) + if self.output_mode == "none": + op.rms_norm_smooth_per_token_dynamic_quant(dst, src, weight, smoothScale, per_token_scale,self.eps, residual, None,None,False,False) + return dst, per_token_scale + + elif self.output_mode == "res": + after_res = tensor_mapping["after_res"] + op.rms_norm_smooth_per_token_dynamic_quant(dst, src, weight, smoothScale, per_token_scale,self.eps, residual, None,after_res,False,False) + return dst, per_token_scale, after_res + elif self.output_mode == "norm": + after_norm = tensor_mapping["after_norm"] + op.rms_norm_smooth_per_token_dynamic_quant(dst, src, weight, smoothScale, per_token_scale,self.eps, residual, after_norm,None,False,False) + return dst, per_token_scale, after_norm + +except: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_gather.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_gather.py new file mode 100644 index 00000000..63c6d0cf --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_gather.py @@ -0,0 +1,349 @@ +import torch +from functools import partial + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.moe_gather import MoeGatherOp +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size +import itertools + + +def count_expert_num_tokens_with_map(topk_ids, num_local_experts, expert_map): + + flat_ids = topk_ids.view(-1) + local_ids = expert_map[flat_ids] + mask = local_ids >= 0 + return torch.bincount(local_ids[mask], minlength=num_local_experts).to(torch.int32) + +def get_expert_offsets(expert_counts, alignment): + padded_counts = (expert_counts + alignment - 1) // alignment * alignment + offsets = torch.zeros_like(expert_counts) + offsets[1:] = torch.cumsum(padded_counts[:-1], dim=0) + return offsets, padded_counts.sum().item() + + +def get_moe_tokens_info( + num_tokens, num_experts, topk, + ep_size=1, ep_rank=0 +): + # split tokens / experts + num_scatter_tokens = num_tokens * topk + num_scatter_tokens_per_rank = num_scatter_tokens // ep_size + num_experts_per_rank = num_experts // ep_size + + experts_start_idx = ep_rank * num_experts_per_rank + experts_end_idx = experts_start_idx + num_experts_per_rank + + expert_map = torch.full((num_experts,), -1, dtype=torch.int32) + + local_expert_ids = torch.arange(num_experts_per_rank, dtype=torch.int32) + + expert_map[experts_start_idx:experts_end_idx] = local_expert_ids + + experts_idx_for_each_rank = [] + for rank_idx in range(ep_size): + start_idx = rank_idx * num_experts_per_rank + end_idx = start_idx + num_experts_per_rank + experts_idx_for_each_rank.append(list(range(start_idx, end_idx))) + transpose_experts = [list(row) for row in zip(*experts_idx_for_each_rank)] + experts_array = [num for row in transpose_experts for num in row] + + all_select_experts = [] + all_select_weights = [] + + cur_expert = 0 + for token_idx in range(num_tokens): + cur_token_selections = [] + for topk_idx in range(topk): + cur_token_selections.append(experts_array[cur_expert]) + cur_expert += 1 + if cur_expert >= num_experts: + cur_expert = 0 + all_select_experts.append(cur_token_selections) + all_select_weights.append([1 / topk for _ in range(topk)]) + + all_select_experts_tensor = torch.tensor(all_select_experts, dtype=torch.long) + expert_counts = count_expert_num_tokens_with_map(all_select_experts_tensor, num_experts_per_rank, expert_map) + expert_offsets, total_rows = get_expert_offsets(expert_counts, 16) + + cur_rank_tokens = {} + cur_rank_weights = {} + dispatch_tokens = 0 + + for token_idx in range(num_tokens): + cur_token_dispatch_experts = [] + cur_token_dispatch_weights = [] + for expert_idx, expert_weight in zip(all_select_experts[token_idx], all_select_weights[token_idx]): + if expert_idx >= experts_start_idx and expert_idx < experts_end_idx: + cur_token_dispatch_experts.append(expert_idx) + cur_token_dispatch_weights.append(expert_weight) + + if cur_token_dispatch_experts: + cur_rank_tokens[token_idx] = cur_token_dispatch_experts + cur_rank_weights[token_idx] = cur_token_dispatch_weights + dispatch_tokens += len(cur_token_dispatch_experts) + + used_src_tokens = len(cur_rank_tokens) + + expert_dispatch_tokens = [[] for _ in range(experts_start_idx, experts_end_idx)] + expert_dispatch_weights = [[] for _ in range(experts_start_idx, experts_end_idx)] + expert_dispatch_token_count = [0 for _ in range(experts_start_idx, experts_end_idx)] + expert_dispatch_token_offset = [0 for _ in range(experts_start_idx, experts_end_idx)] + + for token_idx in cur_rank_tokens: + for topk_idx, expert_idx in enumerate(cur_rank_tokens[token_idx]): + expert_dispatch_tokens[expert_idx - experts_start_idx].append(token_idx) + expert_dispatch_weights[expert_idx - experts_start_idx].append(cur_rank_weights[token_idx][topk_idx]) + expert_dispatch_token_count[expert_idx - experts_start_idx] += 1 + expert_dispatch_token_offset = ([0] + list(itertools.accumulate(expert_dispatch_token_count)))[:num_experts_per_rank] + + expert_dispatch_tokens_flatten = [token for tokens in expert_dispatch_tokens for token in tokens] + expert_dispatch_weights_flatten = [weight for weights in expert_dispatch_weights for weight in weights] + + return ( + num_scatter_tokens, + num_scatter_tokens_per_rank, + num_experts_per_rank, + experts_start_idx, + experts_end_idx, + all_select_experts, + all_select_weights, + dispatch_tokens, + used_src_tokens, + expert_dispatch_tokens, + expert_dispatch_weights, + expert_dispatch_tokens_flatten, + expert_dispatch_weights_flatten, + expert_dispatch_token_count, + expert_dispatch_token_offset, + expert_map, + expert_offsets, + total_rows, + all_select_experts_tensor + ) + + +def compute_inv_perm_ep_gather( + num_tokens: int, + topk: int, + experts_start_idx: int, + expert_dispatch_tokens: list, + expert_offsets: torch.Tensor, + all_select_experts: list, +) -> torch.Tensor: + """ + Build inv_perm [num_tokens, topk] for lightop ep_gather: row index in the padded expert buffer + `scatter_tokens_lightop` for each (token, k), or -1 if not routed to this EP rank. + + Layout matches expert-major scatter: expert e uses rows + [expert_offsets[e], expert_offsets[e] + count_e) inside [0, total_rows). + """ + inv = torch.full((num_tokens, topk), -1, dtype=torch.int32) + for e, tok_list in enumerate(expert_dispatch_tokens): + global_expert = experts_start_idx + e + base = int(expert_offsets[e].item()) + for j, tok in enumerate(tok_list): + row = base + j + for k in range(topk): + if all_select_experts[tok][k] == global_expert: + inv[tok, k] = row + break + return inv + + +def fill_scatter_tokens_lightop_from_dense( + scatter_tokens_lightop: torch.Tensor, + scatter_tokens: torch.Tensor, + expert_offsets: torch.Tensor, + expert_dispatch_token_count: list, + num_experts_per_rank: int, +) -> None: + """Place dense [dispatch_tokens, H] expert outputs into padded [total_rows, H] buffer (expert-major).""" + scatter_tokens_lightop.zero_() + off = 0 + for e in range(num_experts_per_rank): + cnt = expert_dispatch_token_count[e] + if cnt == 0: + continue + base = int(expert_offsets[e].item()) + scatter_tokens_lightop[base : base + cnt].copy_(scatter_tokens[off : off + cnt]) + off += cnt + + +try: + from lightop import op + + @ProviderRegistry.register_vendor_impl("moe_gather", "lightop") + class LightopMoeGatherOp(MoeGatherOp): + """ + MoE gather via lightop `op.ep_gather`, aligned with the same dispatch metadata as base `MoeGatherOp`: + - `scatter_tokens`: dense expert outputs [dispatch_tokens, H] (expert-major flatten order). + - `scatter_tokens_lightop`: padded buffer [total_rows, H] filled from `scatter_tokens` before the kernel. + - `inv_perm_lightop`: per (token, k) source row in `scatter_tokens_lightop` (not random). + - `selected_experts` / `moe_weights` / `expert_map`: arguments to `ep_gather` (global expert ids + map). + """ + + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["lightop"] + + def prepare_args(self): + # Keep base op semantic args parsing, then add lightop-specific dispatch metadata. + super().prepare_args() + + ( + self.num_scatter_tokens, + self.num_scatter_tokens_per_rank, + self.num_experts_per_rank, + self.experts_start_idx, + self.experts_end_idx, + self.all_select_experts, + self.all_select_weights, + self.dispatch_tokens, + self.used_src_tokens, + self.expert_dispatch_tokens, + self.expert_dispatch_weights, + self.expert_dispatch_tokens_flatten, + self.expert_dispatch_weights_flatten, + self.expert_dispatch_token_count, + self.expert_dispatch_token_offset, + self.expert_map, + self.expert_offsets, + self.total_rows, + self.all_select_experts_tensor, + ) = get_moe_tokens_info( + self.num_tokens, + self.num_experts, + self.topk, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + ) + + self.inv_perm_cpu = compute_inv_perm_ep_gather( + self.num_tokens, + self.topk, + self.experts_start_idx, + self.expert_dispatch_tokens, + self.expert_offsets, + self.all_select_experts, + ) + + def vendor_parser(self): + if self.dtype not in ["bfloat16"]: + raise ValueError( + f"{type(self).__name__} only supports bfloat16 dtype, got {self.dtype}" + ) + + def vendor_impl(self): + self.torch_dtype = getattr(torch, self.dtype) + dev = self.backend.get_torch_device_name() + + inv_perm_stored = self.inv_perm_cpu + + self.input_tensor_info = { + "scatter_tokens": OpTensorInfo( + # Feed `ep_gather` the padded expert-major buffer directly to avoid + # an extra scatter_tokens -> scatter_tokens_lightop fill/copy. + shape=[self.total_rows, self.hidden_size], + dtype=self.torch_dtype, + device=dev, + ), + "selected_experts": OpTensorInfo( + shape=[self.num_tokens, self.topk], + dtype=torch.long, + device=dev, + creator=lambda size, dtype, device: torch.tensor( + self.all_select_experts, dtype=dtype, device=device + ), + ), + "moe_weights": OpTensorInfo( + shape=[self.num_tokens, self.topk], + dtype=torch.float32, + device=dev, + creator=lambda size, dtype, device: torch.tensor( + self.all_select_weights, dtype=dtype, device=device + ), + ), + "inv_perm_lightop": OpTensorInfo( + shape=[self.num_tokens, self.topk], + dtype=torch.int32, + device=dev, + creator=lambda size, dtype, device: torch.randint(low=0, high=self.total_rows, size=(self.num_tokens, self.topk), dtype=dtype, device=device), + ), + + "expert_map": OpTensorInfo( + shape=[self.num_experts], + dtype=torch.int32, + device=dev, + creator=lambda size, dtype, device: self.expert_map.clone().detach().to(dtype=dtype, device=device), + ), + } + if getattr(self, "res_scale", 0.0) != 0.0: + self.input_tensor_info["residual_tokens"] = OpTensorInfo( + shape=[self.num_res_tokens_per_rank, self.hidden_size], + dtype=self.torch_dtype, + device=dev, + ) + + self.output_tensor_info = { + "convergent_tokens": OpTensorInfo( + shape=[self.num_tokens, self.hidden_size], + dtype=self.torch_dtype, + device=dev, + creator=torch.zeros + ), + } + + # calculator (keep consistent with base MoeGatherOp accounting) + self.input_tensor_size = sum( + calc_tensor_size(info) for info in self.input_tensor_info.values() + ) + self.output_tensor_size = sum( + calc_tensor_size(info) for info in self.output_tensor_info.values() + ) + self.tensor_size = self.input_tensor_size + self.output_tensor_size + + self.read_bytes = \ + calc_tensor_size(self.input_tensor_info["scatter_tokens"]) * (self.num_tokens / 2) / self.total_rows + \ + calc_tensor_size(self.input_tensor_info["selected_experts"]) + \ + calc_tensor_size(self.input_tensor_info["moe_weights"]) + \ + calc_tensor_size(self.input_tensor_info["inv_perm_lightop"]) + \ + calc_tensor_size(self.input_tensor_info["expert_map"]) + + if self.res_scale: + self.read_bytes += calc_tensor_size(self.input_tensor_info["residual_tokens"]) + # index_add dst + self.write_bytes = calc_tensor_size(self.output_tensor_info["convergent_tokens"]) + self.io_bytes = self.read_bytes + self.write_bytes + + + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True, + ) + self._run_func = self.vendor_impl_run + + def vendor_impl_run(self, tensor_mapping): + scatter_tokens = tensor_mapping["scatter_tokens"] + selected_experts = tensor_mapping["selected_experts"] + expert_map = tensor_mapping["expert_map"] + inv_perm_lightop = tensor_mapping["inv_perm_lightop"] + moe_weights = tensor_mapping["moe_weights"] + convergent_tokens = tensor_mapping["convergent_tokens"] + + if getattr(self, "res_scale", 0.0) != 0.0: + residual_tokens = tensor_mapping["residual_tokens"] + convergent_tokens[self.res_token_start:self.res_token_end] += residual_tokens * self.res_scale + + convergent_tokens = op.ep_gather( + scatter_tokens, + selected_experts, + moe_weights, + inv_perm_lightop, + expert_map, + convergent_tokens, + ) + return convergent_tokens + +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm.py new file mode 100644 index 00000000..bd57ae2a --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm.py @@ -0,0 +1,187 @@ +from functools import partial +import os + +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.moe_quant_group_gemm import MoeQuantGroupGemmOp +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size, get_torch_dtype + +from lightop import moe_groupgemm_marlin_w4a8, get_moe_groupgemm_config_w4a8 + + +@ProviderRegistry.register_vendor_impl("moe_quant_group_gemm", "lightop") +class LightopMoeQuantGroupGemmOp(MoeQuantGroupGemmOp): + def vendor_parser(self): + """ + Bring legacy MoeQuantGroupGemmUp semantics into vendor implementation. + K1/K2 and tile_k/tile_n are kernel/padding/packing-related parameters and + therefore belong to the vendor layer (not the base op). + """ + if self.dtype != "int8" or self.compute_dtype != "int8" or self.dst_dtype != "bfloat16": + raise ValueError( + f"{type(self).__name__} only supports int8/int8 -> bfloat16, " + f"got dtype={self.dtype}, compute_dtype={self.compute_dtype}, dst_dtype={self.dst_dtype}" + ) + + # Match legacy xpu-perf MoeQuantGroupGemmUpOp behavior: + # lightop w4a8 marlin kernels expect packed weights stored as int32. + if self.w_dtype != "int32": + raise ValueError( + f"{type(self).__name__} only supports w_dtype=int32 (packed w4a8 weights), got {self.w_dtype}" + ) + + self.K1 = int(self.args_dict.get("K1", self.hidden_size // 2)) + self.K2 = int(self.args_dict.get("K2", self.new_hidden_size // 4)) + # For marlin w4a8 groupgemm kernels, the variant name often encodes tiling + self.tile_k = int(self.args_dict.get("tile_k", 32)) + self.tile_n = int(self.args_dict.get("tile_n", 64)) + # Many lightop MoE groupgemm kernels assume each expert's token segment is aligned + # (e.g., for vectorized memory access). Keep this vendor-specific. + # Default to legacy xpu-perf behavior (no padding). You can opt-in per-case. + self.token_align = int(self.args_dict.get("token_align", 1)) + if self.token_align <= 0: + raise ValueError(f"{type(self).__name__} requires token_align > 0, got {self.token_align}") + + if self.hidden_size % self.tile_k != 0: + raise ValueError( + f"{type(self).__name__} requires hidden_size % tile_k == 0, got hidden_size={self.hidden_size}, tile_k={self.tile_k}" + ) + if self.tile_n > 0 and (self.new_hidden_size % self.tile_n != 0): + raise ValueError( + f"{type(self).__name__} requires new_hidden_size % tile_n == 0 when tile_n>0, " + f"got new_hidden_size={self.new_hidden_size}, tile_n={self.tile_n}" + ) + + def vendor_impl(self): + self.extra_providers = ["lightop"] + # Must be available during vendor_impl (called inside BasicOp.__init__). + self.num_cus = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count + + self.torch_dtype = get_torch_dtype(self.dtype) + self.w_torch_dtype = get_torch_dtype(self.w_dtype) + self.compute_torch_dtype = get_torch_dtype(self.compute_dtype) + self.dst_torch_dtype = get_torch_dtype(self.dst_dtype) + + # packed tensor layout (copied from legacy MoeQuantGroupGemmUpOp) + self.input_tensor_info = {} + self.output_tensor_info = {} + + # Match legacy xpu-perf MoeQuantGroupGemmUpOp tensor semantics: + # - no padding / alignment in generated token counts/offsets + # - use dispatch_tokens as the leading dimension for input/output + self.expert_dispatch_token_count = list(self.expert_dispatch_token_count) + # lightop moe_groupgemm_marlin_w4a8 expects experts_offsets as a prefix-sum array. + # Build a strict (E+1) offsets array from counts to avoid ambiguity. + # offsets[i]..offsets[i+1] is expert i's segment. + _offsets = [0] + for c in self.expert_dispatch_token_count: + _offsets.append(_offsets[-1] + int(c)) + self.expert_dispatch_token_offset = _offsets + + self.input_tensor_info["scatter_tokens"] = OpTensorInfo( + shape=[self.dispatch_tokens, self.hidden_size], + dtype=self.torch_dtype, + device=self.backend.get_torch_device_name(), + creator=torch.zeros, + ) + self.input_tensor_info["per_token_scale"] = OpTensorInfo( + shape=[self.dispatch_tokens, 1], + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + creator=torch.ones, + ) + + self.input_tensor_info["experts_weight"] = OpTensorInfo( + shape=[ + self.num_experts_per_rank, + self.hidden_size // self.tile_k, + self.new_hidden_size * self.tile_k // 8, + ], + dtype=self.w_torch_dtype, + device=self.backend.get_torch_device_name(), + creator=torch.zeros, + ) + self.input_tensor_info["experts_scale"] = OpTensorInfo( + shape=[self.num_experts_per_rank, self.new_hidden_size, 1], + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + creator=torch.ones, + ) + self.input_tensor_info["experts_token_count"] = OpTensorInfo( + shape=[self.num_experts_per_rank], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor( + self.expert_dispatch_token_count, dtype=dtype, device=device + ), + ) + self.input_tensor_info["experts_token_offset"] = OpTensorInfo( + shape=[self.num_experts_per_rank + 1], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor( + self.expert_dispatch_token_offset, dtype=dtype, device=device + ), + ) + + self.output_tensor_info["y"] = OpTensorInfo( + shape=[self.dispatch_tokens, self.new_hidden_size], + dtype=self.dst_torch_dtype, + device=self.backend.get_torch_device_name(), + ) + + self.input_tensor_size = sum(calc_tensor_size(info) for info in self.input_tensor_info.values()) + self.output_tensor_size = sum(calc_tensor_size(info) for info in self.output_tensor_info.values()) + self.tensor_size = self.input_tensor_size + self.output_tensor_size + + self.read_bytes = self.input_tensor_size + self.write_bytes = self.output_tensor_size + self.io_bytes = self.read_bytes + self.write_bytes + self.calc_flops = 2 * self.dispatch_tokens * self.hidden_size * self.new_hidden_size + + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True, + ) + self._run_func = self.vendor_impl_run + + # kernel configs used by lightop implementation + self.config1, self.config2, self.status = get_moe_groupgemm_config_w4a8( + self.num_experts_per_rank, + self.num_tokens, + self.new_hidden_size, + self.K1, + self.hidden_size, + self.K2, + "DCU", + str(self.num_cus), + self.dst_dtype, + self.tile_n if self.tile_n > 0 else None, + self.tile_k if self.tile_k > 0 else None, + ) + + + def vendor_impl_run(self, tensor_mapping): + + scatter_tokens = tensor_mapping["scatter_tokens"] + experts_weight = tensor_mapping["experts_weight"] + per_token_scale = tensor_mapping["per_token_scale"] + experts_token_count = tensor_mapping["experts_token_count"] + experts_scale = tensor_mapping["experts_scale"] + experts_token_start = tensor_mapping["experts_token_offset"] + gemm1_out = tensor_mapping["y"] + + moe_groupgemm_marlin_w4a8(scatter_tokens, + experts_weight, + gemm1_out, + per_token_scale, + experts_scale, + experts_token_count, + experts_token_start, + self.config1 + ) + + + return gemm1_out diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_down.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_down.py new file mode 100644 index 00000000..4c89b636 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_down.py @@ -0,0 +1,14 @@ +"""lightop vendor: moe_quant_group_gemm_down + +Compatibility provider for legacy split-op workloads. +Currently it reuses the unified `moe_quant_group_gemm` lightop implementation. +""" + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +from .moe_quant_group_gemm import LightopMoeQuantGroupGemmOp as _UnifiedLightopMoeQuantGroupGemmOp + + +@ProviderRegistry.register_vendor_impl("moe_quant_group_gemm_down", "lightop") +class LightopMoeQuantGroupGemmDownOp(_UnifiedLightopMoeQuantGroupGemmOp): + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_up.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_up.py new file mode 100644 index 00000000..caae1267 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_quant_group_gemm_up.py @@ -0,0 +1,14 @@ +"""lightop vendor: moe_quant_group_gemm_up + +Compatibility provider for legacy split-op workloads. +Currently it reuses the unified `moe_quant_group_gemm` lightop implementation. +""" + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +from .moe_quant_group_gemm import LightopMoeQuantGroupGemmOp as _UnifiedLightopMoeQuantGroupGemmOp + + +@ProviderRegistry.register_vendor_impl("moe_quant_group_gemm_up", "lightop") +class LightopMoeQuantGroupGemmUpOp(_UnifiedLightopMoeQuantGroupGemmOp): + pass \ No newline at end of file diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_softmax_topk.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_softmax_topk.py new file mode 100644 index 00000000..a4274520 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_softmax_topk.py @@ -0,0 +1,47 @@ +import torch + +from xpu_perf_provider_base_ops.llm_ops.moe_softmax_topk import MoeSoftmaxTopkOp +from xpu_perf.micro_perf.core.op import ProviderRegistry + +from lightop import op + +@ProviderRegistry.register_vendor_impl("moe_softmax_topk", "lightop") +class LightopMoeSoftmaxTopkOp(MoeSoftmaxTopkOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["lightop"] + + def vendor_impl_run(self, tensor_mapping): + # get pre-allocated input tensors + gating_output = tensor_mapping["gating_output"] + + if self.compute_mode == "pre-softmax": + + + M, _ = gating_output.shape + + topk_weights = torch.empty(M, + self.topk, + dtype=torch.float32, + device=gating_output.device) + topk_ids = torch.empty(M, + self.topk, + dtype=torch.int32, + device=gating_output.device) + token_expert_indicies = torch.empty(M, + self.topk, + dtype=torch.int32, + device=gating_output.device) + + + op.topk_softmax( + topk_weights, topk_ids, + token_expert_indicies, gating_output,True) + + del token_expert_indicies # Not used. Will be used in the future. + + + return topk_weights, topk_ids + else: + raise NotImplementedError diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_swiglu_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_swiglu_dynamic_quant.py new file mode 100644 index 00000000..f9956362 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/moe_swiglu_dynamic_quant.py @@ -0,0 +1,115 @@ +""" +Lightop implementation of MoeSwigluDynamicQuant, calling lightop.moe_swiglu_dynamic_quant +(6 tensors + 1 float; outputs are written into pre-allocated quant_tokens / per_token_scale). +""" +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size, get_torch_dtype, get_torch_dtype_size +from functools import partial +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.moe_swiglu_dynamic_quant import MoeSwigluDynamicQuantOp + +try: + from lightop import op + + @ProviderRegistry.register_vendor_impl("moe_swiglu_dynamic_quant", "lightop") + class LightopMoeSwigluDynamicQuantOp(MoeSwigluDynamicQuantOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["lightop"] + self.require_profiling = True + + def vendor_impl(self): + self.torch_dtype = get_torch_dtype(self.dtype) + self.dst_torch_dtype = get_torch_dtype(self.dst_dtype) + + # input/output tensors + self.input_tensor_info = { + "scatter_tokens": OpTensorInfo( + shape=[self.dispatch_tokens, self.hidden_size * 2], + dtype=self.torch_dtype, + device=self.backend.get_torch_device_name(), + ), + "experts_smooth_scale": OpTensorInfo( + shape=[self.num_experts_per_rank, self.hidden_size], + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + creator=torch.ones + ), + "experts_token_count": OpTensorInfo( + shape=[self.num_experts_per_rank], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor( + self.expert_dispatch_token_count, dtype=dtype, device=device) + ), + "experts_token_offset": OpTensorInfo( + shape=[self.num_experts_per_rank], + dtype=torch.int32, + device=self.backend.get_torch_device_name(), + creator=lambda size, dtype, device: torch.tensor( + self.expert_dispatch_token_offset, dtype=dtype, device=device) + ) + } + self.output_tensor_info = { + "quant_tokens": OpTensorInfo( + shape=[self.dispatch_tokens, self.hidden_size], + dtype=self.dst_torch_dtype, + device=self.backend.get_torch_device_name(), + ), + "per_token_scale": OpTensorInfo( + shape=[self.dispatch_tokens], + dtype=torch.float32, + device=self.backend.get_torch_device_name(), + ), + } + + # calculator + self.input_tensor_size = sum([ + calc_tensor_size(info) for info in self.input_tensor_info.values() + ]) + self.output_tensor_size = sum([ + calc_tensor_size(info) for info in self.output_tensor_info.values() + ]) + self.tensor_size = self.input_tensor_size + self.output_tensor_size + + self.read_bytes = self.input_tensor_size + self.write_bytes = self.output_tensor_size + self.io_bytes = self.read_bytes + self.write_bytes + + # creator func + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=True + ) + + # run func + self._run_func = self.vendor_impl_run + + + def vendor_impl_run(self, tensor_mapping): + scatter_tokens = tensor_mapping["scatter_tokens"] + experts_smooth_scale = tensor_mapping["experts_smooth_scale"] + experts_token_count = tensor_mapping["experts_token_count"] + experts_token_offset = tensor_mapping["experts_token_offset"] + # Pre-allocated by framework; kernel writes in-place and returns None. + quant_tokens = tensor_mapping["quant_tokens"] + per_token_scale = tensor_mapping["per_token_scale"] + + # Signature: (scatter, smooth_scale, token_count, token_offset, + # quant_out, scale_out, float_param) -> None + op.moe_swiglu_dynamic_quant( + scatter_tokens, + experts_smooth_scale, + experts_token_count, + experts_token_offset, + quant_tokens, + per_token_scale, + 1.0, + ) + return quant_tokens, per_token_scale + +except Exception: + # lightop is optional; if unavailable we just don't register this provider. + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/qk_rms_norm.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/qk_rms_norm.py new file mode 100644 index 00000000..574d9723 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/qk_rms_norm.py @@ -0,0 +1,26 @@ +from functools import partial + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.qk_rms_norm import QKRMSNormOp + + +from lightop import op +@ProviderRegistry.register_vendor_impl("qk_rms_norm", "lightop") + +class LightopQKRMSNormOp(QKRMSNormOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + + self.extra_providers = ["lightop"] + + def vendor_impl_run(self, tensor_mapping): + token_data = tensor_mapping["token_data"] + q_norm_weight = tensor_mapping["q_norm_weight"] + k_norm_weight = tensor_mapping["k_norm_weight"] + + op.fuse_qkv_head_rms_norm(token_data, q_norm_weight, k_norm_weight, self.q_head_num, self.kv_head_num, self.qk_head_dim, self.eps) + + return token_data + + + diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lightop/rotary_embedding.py b/projects/micro_perf/vendor_ops/DCU/ops/lightop/rotary_embedding.py new file mode 100644 index 00000000..73254f27 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lightop/rotary_embedding.py @@ -0,0 +1,54 @@ +""" +Lightop implementation of RotaryEmbedding, following the style of VllmRotaryEmbeddingOp in llm_ops. +""" +from itertools import chain +import torch + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.rotary_embedding import RotaryEmbeddingOp + +try: + # from vllm import _custom_ops as ops + from lightop import op + + @ProviderRegistry.register_vendor_impl("rotary_embedding", "lightop") + class LightopRotaryEmbeddingOp(RotaryEmbeddingOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["lightop"] + self.require_profiling = True + + def vendor_impl_run(self, tensor_mapping): + packed_qkv = tensor_mapping["packed_qkv"] + q_lens = tensor_mapping["q_lens"] + accum_q_lens = tensor_mapping["accum_q_lens"] + cache_lens = tensor_mapping["cache_lens"] + cos = tensor_mapping["cos"] + sin = tensor_mapping["sin"] + + cos_sin_cache = torch.cat([cos, sin], dim=-1).contiguous() + + dim_start = self.rope_offset + dim_end = self.rope_offset + self.rope_dim + + positions = getattr(self, "positions", None) + if positions is None or positions.numel() != self.num_tokens: + # Use self.cache_lens / self.q_lens to avoid indexing GPU cache_lens (can trigger sync). + positions_list = [ + self.cache_lens[b] + j + for b in range(self.batch_size) + for j in range(self.q_lens[b]) + ] + positions = torch.tensor(positions_list, dtype=torch.int64, device=packed_qkv.device) + + op.rotary_embedding_fuse( + positions=positions, + query=packed_qkv[:, :self.q_head_num, dim_start:dim_end].view(packed_qkv.size(0), -1), + key=packed_qkv[:, self.q_head_num:self.q_head_num + self.kv_head_num, dim_start:dim_end].view(packed_qkv.size(0), -1), + head_size=self.rope_dim, + cos_sin_cache=cos_sin_cache, + is_neox=True, + ) + return packed_qkv +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lmslim/__init__.py b/projects/micro_perf/vendor_ops/DCU/ops/lmslim/__init__.py new file mode 100644 index 00000000..1fb94cbe --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lmslim/__init__.py @@ -0,0 +1,12 @@ +import importlib.metadata + +from xpu_perf.micro_perf.core.op import ProviderRegistry + +PROVIDER_NAME = "dcu_lmslim" + +try: + ProviderRegistry.register_provider_info( + "lmslim", {"lmslim": importlib.metadata.version("lmslim")} + ) +except Exception: + pass diff --git a/projects/micro_perf/vendor_ops/DCU/ops/lmslim/quant_matmul.py b/projects/micro_perf/vendor_ops/DCU/ops/lmslim/quant_matmul.py new file mode 100644 index 00000000..ddff4cf4 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/lmslim/quant_matmul.py @@ -0,0 +1,44 @@ +import torch +from typing import Optional, List + +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.quant_matmul import QuantMatmulOp + +try: + from lmslim.quantize.quant_ops import triton_scaled_mm + from lmslim import quant_ops + + @ProviderRegistry.register_vendor_impl("quant_matmul", "lmslim") + class LmslimQuantMatmulOp(QuantMatmulOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["lmslim"] + + def blaslt_scaled_mm( + self, + a_int8: torch.Tensor, # int8 [M, K] + b_int8: torch.Tensor, # int8 [K, N] + scale_a: torch.Tensor, # fp32 scale for A + scale_b: torch.Tensor, # fp32 scale for B + out_dtype: torch.dtype, + bias: Optional[torch.Tensor] = None + ) -> torch.Tensor: + m = a_int8.shape[0] + k = a_int8.shape[1] + n = b_int8.shape[0] + _, out = quant_ops.hipblaslt_w8a8_gemm(a_int8, b_int8, scale_a, scale_b, m, n, k, "NT", out_dtype,bias) + return out + + def vendor_impl_run(self, tensor_mapping): + # get pre-allocated input tensors, require hidden_states contiguous, expert_weight not + hidden_states = tensor_mapping["hidden_states"] + per_token_scale = tensor_mapping["per_token_scale"] + expert_weight = tensor_mapping["expert_weight"]#.transpose(0,1) + expert_scale = tensor_mapping["expert_scale"] + + out = self.blaslt_scaled_mm(hidden_states, expert_weight, per_token_scale, expert_scale, out_dtype=self.dst_torch_dtype, bias=None) + + return out +except Exception as e: + import traceback + traceback.print_exc() diff --git a/projects/micro_perf/vendor_ops/DCU/ops/torch/__init__.py b/projects/micro_perf/vendor_ops/DCU/ops/torch/__init__.py new file mode 100644 index 00000000..a96e788d --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/torch/__init__.py @@ -0,0 +1 @@ +PROVIDER_NAME = "dcu_torch" diff --git a/projects/micro_perf/vendor_ops/DCU/ops/torch/all_reduce.py b/projects/micro_perf/vendor_ops/DCU/ops/torch/all_reduce.py new file mode 100644 index 00000000..46c1f886 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/torch/all_reduce.py @@ -0,0 +1,9 @@ +from functools import partial +import torch +import torch.distributed as dist + +from xpu_perf_provider_base_ops.basic_ops.xccl_ops import AllReduceOp + +OP_MAPPING = { + "torch": AllReduceOp +} diff --git a/projects/micro_perf/vendor_ops/DCU/ops/torch/moe_gating_gemm.py b/projects/micro_perf/vendor_ops/DCU/ops/torch/moe_gating_gemm.py new file mode 100644 index 00000000..c17ca4c9 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/torch/moe_gating_gemm.py @@ -0,0 +1,55 @@ +""" +Torch implementation of MoeGatingGemm: during create_tensors, convert gating_weight to a +[hidden_size, num_experts] contiguous layout to avoid repeated transpose on the matmul hot path. +""" +import torch +from functools import partial +from xpu_perf.micro_perf.core.op import ProviderRegistry +from xpu_perf_provider_base_ops.llm_ops.moe_gating_gemm import MoeGatingGemmOp +from xpu_perf.micro_perf.core.utils import OpTensorInfo, calc_tensor_size + + +@ProviderRegistry.register_vendor_impl("moe_gating_gemm", "torch") +class TorchMoeGatingGemmOp(MoeGatingGemmOp): + def __init__(self, args_dict, backend, *args, **kwargs): + super().__init__(args_dict, backend, *args, **kwargs) + self.extra_providers = ["torch"] + self.require_profiling = True + + def vendor_parser(self): + if self.dtype in ("float32", "bfloat16") and self.compute_dtype in ("float32", "bfloat16") and self.dst_dtype == "float32": + pass + else: + raise ValueError(f"MoeGatingGemmOp only support float32-->float32, but got {self.dtype}--> {self.dst_dtype}") + + def vendor_impl(self): + # Reuse base setup and then switch gating_weight layout to [hidden_size, num_experts]. + # This avoids per-run transpose in the hot mm path. + super().vendor_impl() + + self.input_tensor_info["gating_weight"] = OpTensorInfo( + shape=[self.hidden_size, self.num_experts], + dtype=self.torch_dtype, + device=self.backend.get_torch_device_name(), + ) + + # Recompute io stats to match updated tensor shape bookkeeping. + self.input_tensor_size = sum([ + calc_tensor_size(info) for info in self.input_tensor_info.values() + ]) + self.tensor_size = self.input_tensor_size + self.output_tensor_size + self.read_bytes = self.input_tensor_size + self.io_bytes = self.read_bytes + self.write_bytes + + self._create_tensors_func = partial( + self._create_in_out_tensors, + create_inputs=True, + create_outputs=False, + ) + + def vendor_impl_run(self, tensor_mapping): + gating_output = torch.mm( + tensor_mapping["hidden_states"], + tensor_mapping["gating_weight"] + ).to(self.dst_torch_dtype) + return gating_output diff --git a/projects/micro_perf/vendor_ops/DCU/ops/torch/swiglu_dynamic_quant.py b/projects/micro_perf/vendor_ops/DCU/ops/torch/swiglu_dynamic_quant.py new file mode 100644 index 00000000..1eb555b6 --- /dev/null +++ b/projects/micro_perf/vendor_ops/DCU/ops/torch/swiglu_dynamic_quant.py @@ -0,0 +1,5 @@ +from xpu_perf_provider_base_ops.llm_ops.swiglu_dynamic_quant import SwigluDynamicQuantOp + +OP_MAPPING = { + "torch": SwigluDynamicQuantOp +} From cfaa3ecfe3357ea3a23bc2be0d2210e5462d4fa1 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:16:38 +0800 Subject: [PATCH 5/6] llm_sim: add DCU deployment configs Add DCU-specific deploy configs for seed-oss-36b, qwen3-32b, and qwen3-235b-a22b tp-ep. Signed-off-by: Zhang Jing Co-authored-by: Zhu Fuzhu Co-authored-by: Wang Sen --- .../qwen3_dense/qwen3-32b/deploys/tp_dcu.json | 31 + .../qwen3-235b-a22b/deploys/tp_ep_dcu.json | 44 ++ .../qwen3-235b-a22b/deploys/tp_ep_dcu.py | 568 ++++++++++++++++++ .../seed_oss/seed-oss-36b/deploys/tp_dcu.json | 31 + 4 files changed, 674 insertions(+) create mode 100644 projects/xpu_oj/llm_sim/model_zoo/qwen3_dense/qwen3-32b/deploys/tp_dcu.json create mode 100644 projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.json create mode 100644 projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.py create mode 100644 projects/xpu_oj/llm_sim/model_zoo/seed_oss/seed-oss-36b/deploys/tp_dcu.json diff --git a/projects/xpu_oj/llm_sim/model_zoo/qwen3_dense/qwen3-32b/deploys/tp_dcu.json b/projects/xpu_oj/llm_sim/model_zoo/qwen3_dense/qwen3-32b/deploys/tp_dcu.json new file mode 100644 index 00000000..f09cfce4 --- /dev/null +++ b/projects/xpu_oj/llm_sim/model_zoo/qwen3_dense/qwen3-32b/deploys/tp_dcu.json @@ -0,0 +1,31 @@ +{ + "base_model_name": "qwen3_dense", + "model_name": "qwen3-32b", + "infer_dtype": "gemm{w8a8}_fa{c8}", + "dtype_config": { + "default_dtype": "bfloat16", + "qkvo": { + "dtype": "int8", + "w_dtype": "int8", + "compute_dtype": "int8" + }, + "attn": { + "dtype": "bfloat16", + "cache_dtype": "bfloat16", + "qk_compute_dtype": "bfloat16", + "pv_compute_dtype": "bfloat16" + }, + "mlp": { + "dtype": "int8", + "w_dtype": "int8", + "compute_dtype": "int8" + } + }, + "parallel_config": { + "device_num": 4, + "tp_size": 4 + }, + "extra_config": {}, + "template": "model_zoo.qwen3_dense.qwen3-32b.deploys.tp" +} + diff --git a/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.json b/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.json new file mode 100644 index 00000000..0b66a35c --- /dev/null +++ b/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.json @@ -0,0 +1,44 @@ +{ + "base_model_name": "qwen3_moe", + "model_name": "qwen3-235b-a22b", + "infer_dtype": "gemm{w8a8}_fa{c8}", + "dtype_config": { + "default_dtype": "bfloat16", + "qkvo": { + "dtype": "int8", + "w_dtype": "int8", + "compute_dtype": "int8" + }, + "attn": { + "dtype": "bfloat16", + "cache_dtype": "bfloat16", + "qk_compute_dtype": "bfloat16", + "pv_compute_dtype": "bfloat16" + }, + "gating": { + "dtype": "bfloat16", + "compute_dtype": "bfloat16", + "dst_dtype": "float32" + }, + "mlp": { + "dtype": "int8", + "w_dtype": "int32", + "compute_dtype": "int8" + } + }, + "parallel_config": { + "device_num": 8, + "sp_size": 1, + "tp_size": 8, + "ep_size": 8 + }, + "extra_config": { + "topk": 8, + "num_experts": 128, + "compute_mode": "pre-softmax", + "tile_k": 32, + "tile_n": 64, + "moe_gather_res_scale": 0.0 + }, + "template": "model_zoo.qwen3_moe.qwen3-235b-a22b.deploys.tp_ep_dcu" +} diff --git a/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.py b/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.py new file mode 100644 index 00000000..de3bacd9 --- /dev/null +++ b/projects/xpu_oj/llm_sim/model_zoo/qwen3_moe/qwen3-235b-a22b/deploys/tp_ep_dcu.py @@ -0,0 +1,568 @@ +import os +from typing import Dict + +from transformers import Qwen3MoeConfig + +from model_zoo.topology import OpTopologyDAG +from xpu_perf.model_perf.utils import DistributionInfo + +""" +tp-ep 模式 (ep_size == tp_size) +Attention 部分使用纯 TP 并行,MoE 部分在 TP 基础上加入 EP 并行通信 +""" + + +def generate( + model_config: Qwen3MoeConfig, + bench_config: Dict, +): + # parse model params + hidden_size = model_config.hidden_size + q_head_num = model_config.num_attention_heads + kv_head_num = model_config.num_key_value_heads + head_dim = model_config.head_dim + + attention_bias = model_config.attention_bias + + moe_intermediate_size = model_config.moe_intermediate_size + num_experts = model_config.num_experts + num_experts_per_tok = model_config.num_experts_per_tok + + # parse distribution info + dist_info = DistributionInfo.from_bench_config(bench_config["parallel_config"]) + + split_q_head_num = q_head_num // dist_info.tp_size if q_head_num >= dist_info.tp_size else 1 + split_kv_head_num = kv_head_num // dist_info.tp_size if kv_head_num >= dist_info.tp_size else 1 + + # 获取默认数据类型 + default_dtype = bench_config.get("dtype_config", {}).get("default_dtype", "bfloat16") + + qkvo_config = bench_config["dtype_config"]["qkvo"] + attn_config = bench_config["dtype_config"]["attn"] + gating_config = bench_config["dtype_config"]["gating"] + mlp_config = bench_config["dtype_config"]["mlp"] + extra_config = bench_config.get("extra_config", {}) + moe_gather_res_scale = float(extra_config.get("moe_gather_res_scale", 1.0)) + + model_topo = OpTopologyDAG() + + # ============================================================ + # Attention 部分 (纯 TP 并行) + # ============================================================ + model_topo.op_process_wrapper( + "add_rms_norm_dynamic_quant", + "add_rms_norm_0", + { + "dtype": default_dtype, + "dst_dtype": qkvo_config["dtype"], + "hidden_size": hidden_size, + "add_residual": True, + "output_mode": "res", + }, + ) + + model_topo.op_process_wrapper( + "quant_matmul", + "qkv_gemm", + { + "dtype": qkvo_config["dtype"], + "w_dtype": qkvo_config["w_dtype"], + "compute_dtype": qkvo_config["compute_dtype"], + "dst_dtype": default_dtype, + "has_bias": attention_bias, + "hidden_size": hidden_size, + "new_hidden_size": (split_q_head_num + 2 * split_kv_head_num) * head_dim, + }, + ) + + model_topo.op_process_wrapper( + "qk_rms_norm", + "qk_norm", + { + "dtype": default_dtype, + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "qk_head_dim": head_dim, + "v_head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "rotary_embedding", + "rotary_embedding", + { + "dtype": default_dtype, + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + "rope_offset": 0, + "rope_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "store_kv_cache", + "store_kv_cache", + { + "dtype": default_dtype, + "cache_dtype": attn_config["cache_dtype"], + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "flash_attention", + "flash_attention", + { + "dtype": default_dtype, + "cache_dtype": attn_config["cache_dtype"], + "qk_compute_dtype": attn_config["qk_compute_dtype"], + "pv_compute_dtype": attn_config["pv_compute_dtype"], + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "quant_matmul", + "attn_out_gemm", + { + "dtype": qkvo_config["dtype"], + "w_dtype": qkvo_config["w_dtype"], + "compute_dtype": qkvo_config["compute_dtype"], + "dst_dtype": default_dtype, + "has_bias": attention_bias, + "hidden_size": split_q_head_num * head_dim, + "new_hidden_size": hidden_size, + }, + ) + + model_topo.op_process_wrapper( + "all_reduce", + "all_reduce_0", + { + "world_size": dist_info.tp_size, + "dtype": default_dtype, + "hidden_size": hidden_size, + }, + ) + + # ============================================================ + # MoE 部分 (TP + EP 并行) + # ============================================================ + pre_moe = model_topo.op_process_wrapper( + "add_rms_norm", + "qwen3_pre_moe_norm", + {"dtype": default_dtype, "hidden_size": hidden_size}, + ) + + a2a0_node = model_topo.op_process_wrapper( + "all_to_all", + "qwen3_moe_a2a0", + { + "dtype": default_dtype, + "world_size": dist_info.ep_size, + "hidden_size": hidden_size, + }, + src=pre_moe, + ) + + model_topo.op_process_wrapper( + "moe_gating_gemm", + "qwen3_moe_gating", + { + "dtype": gating_config["dtype"], + "compute_dtype": gating_config["compute_dtype"], + "dst_dtype": gating_config.get("dst_dtype", "float32"), + "num_experts": num_experts, + "hidden_size": hidden_size, + }, + src=pre_moe, + ) + + topk_node = model_topo.op_process_wrapper( + "moe_softmax_topk", + "qwen3_moe_softmax_topk", + { + "dtype": "float32", + "num_experts": num_experts, + "topk": num_experts_per_tok, + "compute_mode": extra_config.get("compute_mode", "pre-softmax"), + }, + ) + + scatter_node = model_topo.op_process_wrapper( + "moe_scatter_dynamic_quant", + "qwen3_moe_scatter", + { + "dtype": default_dtype, + "dst_dtype": mlp_config["dtype"], + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": hidden_size, + }, + src=[a2a0_node, topk_node], + ) + + tile_k = int(extra_config.get("tile_k", 0)) + tile_n = int(extra_config.get("tile_n", 0)) + + model_topo.op_process_wrapper( + "moe_quant_group_gemm_up", + "qwen3_moe_up_gemm", + { + "dtype": mlp_config["dtype"], + "w_dtype": mlp_config["w_dtype"], + "compute_dtype": mlp_config["compute_dtype"], + "dst_dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "tile_k": tile_k, + "tile_n": tile_n, + "hidden_size": hidden_size, + "new_hidden_size": moe_intermediate_size * 2, + }, + src=scatter_node, + ) + + model_topo.op_process_wrapper( + "moe_swiglu_dynamic_quant", + "qwen3_moe_swiglu", + { + "dtype": default_dtype, + "dst_dtype": mlp_config["dtype"], + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": moe_intermediate_size, + }, + ) + + model_topo.op_process_wrapper( + "moe_quant_group_gemm_down", + "qwen3_moe_down_gemm", + { + "dtype": mlp_config["dtype"], + "w_dtype": mlp_config["w_dtype"], + "compute_dtype": mlp_config["compute_dtype"], + "dst_dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "tile_k": tile_k, + "tile_n": tile_n, + "hidden_size": moe_intermediate_size, + "new_hidden_size": hidden_size, + }, + ) + + model_topo.op_process_wrapper( + "all_to_all", + "qwen3_moe_a2a1", + {"dtype": default_dtype, "world_size": dist_info.ep_size, "hidden_size": hidden_size}, + ) + + model_topo.op_process_wrapper( + "moe_gather", + "qwen3_moe_gather", + { + "dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": hidden_size, + "res_scale": moe_gather_res_scale, + }, + ) + + return model_topo + +import os +from typing import Dict + +from transformers import Qwen3MoeConfig + +from model_zoo.topology import OpTopologyDAG +from xpu_perf.model_perf.utils import DistributionInfo + +""" +tp-ep 模式 (ep_size == tp_size) +Attention 部分使用纯 TP 并行,MoE 部分在 TP 基础上加入 EP 并行通信 +""" + + +def generate( + model_config: Qwen3MoeConfig, + bench_config: Dict, +): + # parse model params + hidden_size = model_config.hidden_size + q_head_num = model_config.num_attention_heads + kv_head_num = model_config.num_key_value_heads + head_dim = model_config.head_dim + + attention_bias = model_config.attention_bias + + moe_intermediate_size = model_config.moe_intermediate_size + num_experts = model_config.num_experts + num_experts_per_tok = model_config.num_experts_per_tok + + # parse distribution info + dist_info = DistributionInfo.from_bench_config(bench_config["parallel_config"]) + + split_q_head_num = q_head_num // dist_info.tp_size if q_head_num >= dist_info.tp_size else 1 + split_kv_head_num = kv_head_num // dist_info.tp_size if kv_head_num >= dist_info.tp_size else 1 + + # 获取默认数据类型 + default_dtype = bench_config.get("dtype_config", {}).get("default_dtype", "bfloat16") + + qkvo_config = bench_config["dtype_config"]["qkvo"] + attn_config = bench_config["dtype_config"]["attn"] + gating_config = bench_config["dtype_config"]["gating"] + mlp_config = bench_config["dtype_config"]["mlp"] + extra_config = bench_config.get("extra_config", {}) + moe_gather_res_scale = float(extra_config.get("moe_gather_res_scale", 1.0)) + + model_topo = OpTopologyDAG() + + # ============================================================ + # Attention 部分 (纯 TP 并行) + # ============================================================ + model_topo.op_process_wrapper( + "add_rms_norm_dynamic_quant", + "add_rms_norm_0", + { + "dtype": default_dtype, + "dst_dtype": qkvo_config["dtype"], + "hidden_size": hidden_size, + "add_residual": True, + "output_mode": "res", + }, + ) + + model_topo.op_process_wrapper( + "quant_matmul", + "qkv_gemm", + { + "dtype": qkvo_config["dtype"], + "w_dtype": qkvo_config["w_dtype"], + "compute_dtype": qkvo_config["compute_dtype"], + "dst_dtype": default_dtype, + "has_bias": attention_bias, + "hidden_size": hidden_size, + "new_hidden_size": (split_q_head_num + 2 * split_kv_head_num) * head_dim, + }, + ) + + model_topo.op_process_wrapper( + "qk_rms_norm", + "qk_norm", + { + "dtype": default_dtype, + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "qk_head_dim": head_dim, + "v_head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "rotary_embedding", + "rotary_embedding", + { + "dtype": default_dtype, + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + "rope_offset": 0, + "rope_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "store_kv_cache", + "store_kv_cache", + { + "dtype": default_dtype, + "cache_dtype": attn_config["cache_dtype"], + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "flash_attention", + "flash_attention", + { + "dtype": default_dtype, + "cache_dtype": attn_config["cache_dtype"], + "qk_compute_dtype": attn_config["qk_compute_dtype"], + "pv_compute_dtype": attn_config["pv_compute_dtype"], + "q_head_num": split_q_head_num, + "kv_head_num": split_kv_head_num, + "head_dim": head_dim, + }, + ) + + model_topo.op_process_wrapper( + "quant_matmul", + "attn_out_gemm", + { + "dtype": qkvo_config["dtype"], + "w_dtype": qkvo_config["w_dtype"], + "compute_dtype": qkvo_config["compute_dtype"], + "dst_dtype": default_dtype, + "has_bias": attention_bias, + "hidden_size": split_q_head_num * head_dim, + "new_hidden_size": hidden_size, + }, + ) + + model_topo.op_process_wrapper( + "all_reduce", + "all_reduce_0", + { + "world_size": dist_info.tp_size, + "dtype": default_dtype, + "hidden_size": hidden_size, + }, + ) + + # ============================================================ + # MoE 部分 (TP + EP 并行) + # ============================================================ + pre_moe = model_topo.op_process_wrapper( + "add_rms_norm", + "qwen3_pre_moe_norm", + {"dtype": default_dtype, "hidden_size": hidden_size}, + ) + + a2a0_node = model_topo.op_process_wrapper( + "all_to_all", + "qwen3_moe_a2a0", + { + "dtype": default_dtype, + "world_size": dist_info.ep_size, + "hidden_size": hidden_size, + }, + src=pre_moe, + ) + + model_topo.op_process_wrapper( + "moe_gating_gemm", + "qwen3_moe_gating", + { + "dtype": gating_config["dtype"], + "compute_dtype": gating_config["compute_dtype"], + "dst_dtype": gating_config.get("dst_dtype", "float32"), + "num_experts": num_experts, + "hidden_size": hidden_size, + }, + src=pre_moe, + ) + + topk_node = model_topo.op_process_wrapper( + "moe_softmax_topk", + "qwen3_moe_softmax_topk", + { + "dtype": "float32", + "num_experts": num_experts, + "topk": num_experts_per_tok, + "compute_mode": extra_config.get("compute_mode", "pre-softmax"), + }, + ) + + scatter_node = model_topo.op_process_wrapper( + "moe_scatter_dynamic_quant", + "qwen3_moe_scatter", + { + "dtype": default_dtype, + "dst_dtype": mlp_config["dtype"], + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": hidden_size, + }, + src=[a2a0_node, topk_node], + ) + + tile_k = int(extra_config.get("tile_k", 0)) + tile_n = int(extra_config.get("tile_n", 0)) + + model_topo.op_process_wrapper( + "moe_quant_group_gemm_up", + "qwen3_moe_up_gemm", + { + "dtype": mlp_config["dtype"], + "w_dtype": mlp_config["w_dtype"], + "compute_dtype": mlp_config["compute_dtype"], + "dst_dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "tile_k": tile_k, + "tile_n": tile_n, + "hidden_size": hidden_size, + "new_hidden_size": moe_intermediate_size * 2, + }, + src=scatter_node, + ) + + model_topo.op_process_wrapper( + "moe_swiglu_dynamic_quant", + "qwen3_moe_swiglu", + { + "dtype": default_dtype, + "dst_dtype": mlp_config["dtype"], + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": moe_intermediate_size, + }, + ) + + model_topo.op_process_wrapper( + "moe_quant_group_gemm_down", + "qwen3_moe_down_gemm", + { + "dtype": mlp_config["dtype"], + "w_dtype": mlp_config["w_dtype"], + "compute_dtype": mlp_config["compute_dtype"], + "dst_dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "tile_k": tile_k, + "tile_n": tile_n, + "hidden_size": moe_intermediate_size, + "new_hidden_size": hidden_size, + }, + ) + + model_topo.op_process_wrapper( + "all_to_all", + "qwen3_moe_a2a1", + {"dtype": default_dtype, "world_size": dist_info.ep_size, "hidden_size": hidden_size}, + ) + + model_topo.op_process_wrapper( + "moe_gather", + "qwen3_moe_gather", + { + "dtype": default_dtype, + "ep_size": dist_info.ep_size, + "num_experts": num_experts, + "topk": num_experts_per_tok, + "hidden_size": hidden_size, + "res_scale": moe_gather_res_scale, + }, + ) + + return model_topo + diff --git a/projects/xpu_oj/llm_sim/model_zoo/seed_oss/seed-oss-36b/deploys/tp_dcu.json b/projects/xpu_oj/llm_sim/model_zoo/seed_oss/seed-oss-36b/deploys/tp_dcu.json new file mode 100644 index 00000000..c061e6e5 --- /dev/null +++ b/projects/xpu_oj/llm_sim/model_zoo/seed_oss/seed-oss-36b/deploys/tp_dcu.json @@ -0,0 +1,31 @@ +{ + "base_model_name": "seed_oss", + "model_name": "seed-oss-36b", + "infer_dtype": "gemm{w8a8}_fa{c8}", + "dtype_config": { + "default_dtype": "bfloat16", + "qkvo": { + "dtype": "int8", + "w_dtype": "int8", + "compute_dtype": "int8" + }, + "attn": { + "dtype": "bfloat16", + "cache_dtype": "bfloat16", + "qk_compute_dtype": "bfloat16", + "pv_compute_dtype": "bfloat16" + }, + "mlp": { + "dtype": "int8", + "w_dtype": "int8", + "compute_dtype": "int8" + } + }, + "parallel_config": { + "device_num": 4, + "tp_size": 4 + }, + "extra_config": {}, + "template": "model_zoo.seed_oss.seed-oss-36b.deploys.tp" +} + From 8addf97f74ce9342ca8d4c052e234150510b2679 Mon Sep 17 00:00:00 2001 From: Zhang Jing Date: Mon, 27 Apr 2026 21:18:48 +0800 Subject: [PATCH 6/6] Add DCU vendor_test workloads Signed-off-by: Zhang Jing Co-authored-by: Zhu Fuzhu Co-authored-by: Wang Sen --- .../llm/vendor_test_DCU/flash_attention.json | 25 +++++++++++++++++++ .../vendor_test_DCU/moe_quant_group_gemm.json | 25 +++++++++++++++++++ .../llm/vendor_test_DCU/quant_matmul.json | 22 ++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 projects/micro_perf/workloads/llm/vendor_test_DCU/flash_attention.json create mode 100644 projects/micro_perf/workloads/llm/vendor_test_DCU/moe_quant_group_gemm.json create mode 100644 projects/micro_perf/workloads/llm/vendor_test_DCU/quant_matmul.json diff --git a/projects/micro_perf/workloads/llm/vendor_test_DCU/flash_attention.json b/projects/micro_perf/workloads/llm/vendor_test_DCU/flash_attention.json new file mode 100644 index 00000000..246e0890 --- /dev/null +++ b/projects/micro_perf/workloads/llm/vendor_test_DCU/flash_attention.json @@ -0,0 +1,25 @@ +{ + "flash_attention": [ + { + "arg_type": "llm", + "attn_mode": "prefill", + "dtype.cache_dtype.qk_compute_dtype.pv_compute_dtype.dst_dtype": [ + ["bfloat16", "bfloat16", "bfloat16", "bfloat16", "bfloat16"] + ], + "q_head_num.kv_head_num.head_dim": [ + [64, 8, 128], + [32, 4, 128], + [16, 2, 128], + [8, 1, 128] + ], + "block_size": 0, + "batch_size.cache_len": [ + [1, 0] + ], + "q_len": [ + 128, 256, 512, 768, 1024, 2048, 4096, 6192, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720, 32768 + ] + } + ] +} diff --git a/projects/micro_perf/workloads/llm/vendor_test_DCU/moe_quant_group_gemm.json b/projects/micro_perf/workloads/llm/vendor_test_DCU/moe_quant_group_gemm.json new file mode 100644 index 00000000..e268bbd7 --- /dev/null +++ b/projects/micro_perf/workloads/llm/vendor_test_DCU/moe_quant_group_gemm.json @@ -0,0 +1,25 @@ +{ + "moe_quant_group_gemm": [ + { + "arg_type": "llm", + "dtype.w_dtype.compute_dtype.dst_dtype": [ + ["int8", "int32", "int8", "bfloat16"] + ], + "ep_size": [ + 4, 8, 16 + ], + "num_experts.topk.hidden_size.new_hidden_size": [ + [16, 8, 4096, 3072], + [16, 8, 7168, 4096], + [128, 8, 4096, 3072], + [128, 8, 7168, 4096] + ], + "num_tokens": [ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024, 1280, 1536, 1792, + 2048, 4096, 6192, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, + 28672, 30720, 32768, 40960, 49152, 57344, 65536, 73728, 81920, 90112, 98304, 106496, + 114688, 122880, 131072 + ] + } + ] +} diff --git a/projects/micro_perf/workloads/llm/vendor_test_DCU/quant_matmul.json b/projects/micro_perf/workloads/llm/vendor_test_DCU/quant_matmul.json new file mode 100644 index 00000000..245936c7 --- /dev/null +++ b/projects/micro_perf/workloads/llm/vendor_test_DCU/quant_matmul.json @@ -0,0 +1,22 @@ +{ + "quant_matmul": [ + { + "arg_type": "llm", + "dtype.w_dtype.compute_dtype.dst_dtype": [ + ["int8", "int8", "int8", "bfloat16"] + ], + "sp_size.hidden_size.new_hidden_size": [ + [1, 4096, 4096], + [1, 8192, 8192], + [1, 8192, 1024], + [1, 1024, 8192] + ], + "num_tokens": [ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024, 1280, 1536, 1792, + 2048, 4096, 6192, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, + 28672, 30720, 32768, 40960, 49152, 57344, 65536, 73728, 81920, 90112, 98304, 106496, + 114688, 122880, 131072 + ] + } + ] +}