Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion projects/micro_perf/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@

import torch.multiprocessing as mp

FILE_DIR = pathlib.Path(__file__).parent.absolute()

from xpu_perf.micro_perf.core.perf_engine import XpuPerfServer
from xpu_perf.micro_perf.core.common_utils import logger, setup_logger
from xpu_perf.micro_perf.core.common_utils import get_submodules, existing_dir_path, valid_file
from xpu_perf.micro_perf.core.common_utils import parse_tasks, parse_workload, export_reports


FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR
OP_DEFS_DIR = BYTE_MLPERF_ROOT.joinpath("op_defs")

Expand Down
2 changes: 1 addition & 1 deletion projects/micro_perf/op_defs/llm_ops/moe_gating_gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def prepare_args(self):
# 以下参数决定当前 moe_gating_gemm 的具体数据类型
self.dtype = self.args_dict.get("dtype", "float32")
self.compute_dtype = self.args_dict.get("compute_dtype", "float32")
self.dst_dtype = self.args_dict.get("dtype", "float32")
self.dst_dtype = self.args_dict.get("dst_dtype", self.dtype)

def vendor_parser(self):
if self.dtype == "float32" and self.compute_dtype == "float32" and self.dst_dtype == "float32":
Expand Down
67 changes: 67 additions & 0 deletions projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_down.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""LLM op: moe_quant_group_gemm_down (base definition).

This exists for compatibility with legacy split-op vendor implementations.
The semantic definition is intentionally identical to `moe_quant_group_gemm`;
vendor providers may use different kernel/packing for up/down.
"""

from ._common import *


@ProviderRegistry.register_base_impl("moe_quant_group_gemm_down", "ComputeEngine")
class MoeQuantGroupGemmDownOp(BasicOp):
    """Base definition of the ``moe_quant_group_gemm_down`` op.

    This class only parses arguments: problem sizes, MoE routing settings,
    expert-parallel settings and the dtype tuple are read from ``args_dict``
    and stored as instance attributes.
    """

    def __init__(self, args_dict, backend, *args, **kwargs):
        """Forward every constructor argument unchanged to ``BasicOp``."""
        super().__init__(args_dict, backend, *args, **kwargs)

    def prepare_args(self):
        """Populate the op attributes from ``self.args_dict``.

        Raises:
            ValueError: if ``arg_type`` is not ``"llm"``.

        Missing required keys (``arg_type``, ``num_tokens``, ``hidden_size``,
        ``new_hidden_size``, ``num_experts``, ``topk``) are not handled here;
        the lookup error of ``args_dict`` propagates to the caller.
        """
        cfg = self.args_dict
        supported_arg_types = ("llm",)

        self.arg_type = cfg["arg_type"]
        if self.arg_type not in supported_arg_types:
            raise ValueError(
                f"{type(self).__name__} only supports llm arg_type, but got {self.arg_type}"
            )

        # Required problem sizes.
        self.num_tokens = cfg["num_tokens"]
        self.hidden_size = cfg["hidden_size"]
        self.new_hidden_size = cfg["new_hidden_size"]

        # Required MoE routing settings.
        self.num_experts = cfg["num_experts"]
        self.topk = cfg["topk"]

        # Optional expert-parallel settings; defaults describe a single rank.
        self.ep_size = cfg.get("ep_size", 1)
        self.ep_rank = cfg.get("ep_rank", 0)

        # The helper returns 15 values; they are unpacked positionally below,
        # so the attribute order must match the helper's return order.
        token_info = get_moe_tokens_info(
            self.num_tokens,
            self.num_experts,
            self.topk,
            ep_size=self.ep_size,
            ep_rank=self.ep_rank,
        )
        (
            self.num_scatter_tokens,
            self.num_scatter_tokens_per_rank,
            self.num_experts_per_rank,
            self.experts_start_idx,
            self.experts_end_idx,
            self.all_select_experts,
            self.all_select_weights,
            self.dispatch_tokens,
            self.used_src_tokens,
            self.expert_dispatch_tokens,
            self.expert_dispatch_weights,
            self.scatter_token_id,
            self.scatter_token_weight,
            self.expert_dispatch_token_count,
            self.expert_dispatch_token_offset,
        ) = token_info

        # Dtype tuple; every entry is optional.
        self.dtype = cfg.get("dtype", "int8")
        self.w_dtype = cfg.get("w_dtype", "int8")
        self.compute_dtype = cfg.get("compute_dtype", "int8")
        self.dst_dtype = cfg.get("dst_dtype", "bfloat16")


67 changes: 67 additions & 0 deletions projects/micro_perf/op_defs/llm_ops/moe_quant_group_gemm_up.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""LLM op: moe_quant_group_gemm_up (base definition).

This exists for compatibility with legacy split-op vendor implementations.
The semantic definition is intentionally identical to `moe_quant_group_gemm`;
vendor providers may use different kernel/packing for up/down.
"""

from ._common import *


@ProviderRegistry.register_base_impl("moe_quant_group_gemm_up", "ComputeEngine")
class MoeQuantGroupGemmUpOp(BasicOp):
    """Base definition of the ``moe_quant_group_gemm_up`` op.

    This class only parses arguments: problem sizes, MoE routing settings,
    expert-parallel settings and the dtype tuple are read from ``args_dict``
    and stored as instance attributes.
    """

    def __init__(self, args_dict, backend, *args, **kwargs):
        """Forward every constructor argument unchanged to ``BasicOp``."""
        super().__init__(args_dict, backend, *args, **kwargs)

    def prepare_args(self):
        """Populate the op attributes from ``self.args_dict``.

        Raises:
            ValueError: if ``arg_type`` is not ``"llm"``.

        Missing required keys (``arg_type``, ``num_tokens``, ``hidden_size``,
        ``new_hidden_size``, ``num_experts``, ``topk``) are not handled here;
        the lookup error of ``args_dict`` propagates to the caller.
        """
        cfg = self.args_dict
        supported_arg_types = ("llm",)

        self.arg_type = cfg["arg_type"]
        if self.arg_type not in supported_arg_types:
            raise ValueError(
                f"{type(self).__name__} only supports llm arg_type, but got {self.arg_type}"
            )

        # Required problem sizes.
        self.num_tokens = cfg["num_tokens"]
        self.hidden_size = cfg["hidden_size"]
        self.new_hidden_size = cfg["new_hidden_size"]

        # Required MoE routing settings.
        self.num_experts = cfg["num_experts"]
        self.topk = cfg["topk"]

        # Optional expert-parallel settings; defaults describe a single rank.
        self.ep_size = cfg.get("ep_size", 1)
        self.ep_rank = cfg.get("ep_rank", 0)

        # The helper returns 15 values; they are unpacked positionally below,
        # so the attribute order must match the helper's return order.
        token_info = get_moe_tokens_info(
            self.num_tokens,
            self.num_experts,
            self.topk,
            ep_size=self.ep_size,
            ep_rank=self.ep_rank,
        )
        (
            self.num_scatter_tokens,
            self.num_scatter_tokens_per_rank,
            self.num_experts_per_rank,
            self.experts_start_idx,
            self.experts_end_idx,
            self.all_select_experts,
            self.all_select_weights,
            self.dispatch_tokens,
            self.used_src_tokens,
            self.expert_dispatch_tokens,
            self.expert_dispatch_weights,
            self.scatter_token_id,
            self.scatter_token_weight,
            self.expert_dispatch_token_count,
            self.expert_dispatch_token_offset,
        ) = token_info

        # Dtype tuple; every entry is optional.
        self.dtype = cfg.get("dtype", "int8")
        self.w_dtype = cfg.get("w_dtype", "int8")
        self.compute_dtype = cfg.get("compute_dtype", "int8")
        self.dst_dtype = cfg.get("dst_dtype", "bfloat16")


3 changes: 2 additions & 1 deletion projects/micro_perf/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@

import torch.multiprocessing as mp

FILE_DIR = pathlib.Path(__file__).parent.absolute()

from xpu_perf.micro_perf.core.perf_engine import XpuPerfServer
from xpu_perf.micro_perf.core.common_utils import logger, setup_logger
from xpu_perf.micro_perf.core.common_utils import get_submodules, existing_dir_path, valid_file

from flask import Flask, request, jsonify, Response, stream_with_context


FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR
OP_DEFS_DIR = BYTE_MLPERF_ROOT.joinpath("op_defs")

Expand Down
12 changes: 12 additions & 0 deletions projects/micro_perf/vendor_ops/DCU/ops/custom_ops/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import importlib.metadata

from xpu_perf.micro_perf.core.op import ProviderRegistry

# NOTE(review): the provider is registered below under the key "custom_ops",
# while this constant is "dcu_custom_ops" -- confirm which name consumers use.
PROVIDER_NAME = "dcu_custom_ops"

# Best-effort: report the installed custom_ops version to the registry.
# Importing this package must never fail because of it.
try:
    ProviderRegistry.register_provider_info(
        "custom_ops", {"custom_ops": importlib.metadata.version("custom_ops")}
    )
except importlib.metadata.PackageNotFoundError:
    # custom_ops is not installed on this host; nothing to report.
    pass
except Exception:
    # Anything else is unexpected (e.g. a registry error). Stay best-effort,
    # but leave a trace instead of hiding the failure completely.
    import logging

    logging.getLogger(__name__).warning(
        "Failed to register custom_ops provider info", exc_info=True
    )
32 changes: 32 additions & 0 deletions projects/micro_perf/vendor_ops/DCU/ops/custom_ops/add_rms_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from xpu_perf.micro_perf.core.op import ProviderRegistry
from xpu_perf_provider_base_ops.llm_ops.add_rms_norm import AddRmsNormOp
from xpu_perf.micro_perf.core.utils import calc_tensor_size

# The vendor implementation is optional: it is registered only when the
# custom_ops package can be imported.
try:
    from custom_ops import addrmsnorm

    @ProviderRegistry.register_vendor_impl("add_rms_norm", "custom_ops")
    class CustomopsAddRMSNormop(AddRmsNormOp):
        """``add_rms_norm`` implementation that calls ``custom_ops.addrmsnorm``."""

        def __init__(self, args_dict, backend, *args, **kwargs):
            super().__init__(args_dict, backend, *args, **kwargs)

            self.extra_providers = ["custom_ops"]

        def vendor_impl(self):
            """Reuse the base tensor definitions and install the custom run function."""
            # Keep base semantic tensor definitions, only swap run function.
            super().vendor_impl()
            if "output" in self.output_tensor_info:
                # Count only the "output" tensor as written bytes, then refresh
                # the total I/O volume accordingly.
                self.write_bytes = calc_tensor_size(self.output_tensor_info["output"])
                self.io_bytes = self.read_bytes + self.write_bytes
            self._run_func = self.vendor_impl_run

        def vendor_impl_run(self, tensor_mapping):
            """Run the kernel on the tensors in ``tensor_mapping`` and return its result."""
            src = tensor_mapping["hidden_states"]
            weight = tensor_mapping["norm_weight"]
            residual = tensor_mapping["residual"]

            dst = addrmsnorm(src, residual, weight, self.eps)

            return dst

except ImportError:
    # custom_ops is not available on this host; skip this provider silently.
    pass
except Exception:
    # Stay best-effort so one broken provider cannot take down module loading,
    # but no longer swallow KeyboardInterrupt/SystemExit, and leave a trace.
    import logging

    logging.getLogger(__name__).warning(
        "Failed to register custom_ops implementation of add_rms_norm",
        exc_info=True,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from functools import partial

from xpu_perf.micro_perf.core.op import ProviderRegistry
from xpu_perf_provider_base_ops.llm_ops.add_rms_norm_dynamic_quant import AddRmsNormDynamicQuantOp

# The vendor implementation is optional: it is registered only when the
# custom_ops package can be imported.
try:
    from custom_ops import addrmsnormdynamicquant

    @ProviderRegistry.register_vendor_impl("add_rms_norm_dynamic_quant", "custom_ops")
    class CustomopsAddRMSNormDynamicQuantOp(AddRmsNormDynamicQuantOp):
        """``add_rms_norm_dynamic_quant`` implementation that calls
        ``custom_ops.addrmsnormdynamicquant``.
        """

        def __init__(self, args_dict, backend, *args, **kwargs):
            super().__init__(args_dict, backend, *args, **kwargs)

            self.extra_providers = ["custom_ops"]

        def vendor_impl(self):
            """Reuse the base definitions, pre-allocate outputs, install the run function."""
            # custom_ops kernel writes into preallocated output tensors, so we must
            # create outputs in the tensor mapping (base impl uses create_outputs=False).
            super().vendor_impl()
            self._create_tensors_func = partial(
                self._create_in_out_tensors,
                create_inputs=True,
                create_outputs=True,
            )
            self._run_func = self.vendor_impl_run

        def vendor_impl_run(self, tensor_mapping):
            """Run the kernel for the configured ``self.output_mode``.

            Returns ``(quant_tokens, per_token_scale)`` for mode ``"none"``, and
            the same pair followed by ``after_res`` / ``after_norm`` for modes
            ``"res"`` / ``"norm"``.
            """
            src = tensor_mapping["hidden_states"]
            weight = tensor_mapping["norm_weight"]
            smooth_scale = tensor_mapping["smooth_scale"]
            residual = tensor_mapping["residual"]
            per_token_scale = tensor_mapping["per_token_scale"]
            dst = tensor_mapping["quant_tokens"]

            # The kernel is called positionally; the 7th/8th arguments are the
            # optional extra outputs and the 9th is a mode code (0, 1 or 2).
            # NOTE(review): the meaning of the mode code is defined by
            # custom_ops and is not visible here -- confirm against its API.
            if self.output_mode == "none":
                addrmsnormdynamicquant(
                    src, weight, smooth_scale, residual,
                    dst, per_token_scale, None, None, 0, self.eps,
                )
                return dst, per_token_scale
            elif self.output_mode == "res":
                after_res = tensor_mapping["after_res"]
                addrmsnormdynamicquant(
                    src, weight, smooth_scale, residual,
                    dst, per_token_scale, after_res, None, 1, self.eps,
                )
                return dst, per_token_scale, after_res
            elif self.output_mode == "norm":
                after_norm = tensor_mapping["after_norm"]
                addrmsnormdynamicquant(
                    src, weight, smooth_scale, residual,
                    dst, per_token_scale, None, after_norm, 2, self.eps,
                )
                return dst, per_token_scale, after_norm
            # NOTE(review): any other output_mode falls through and returns
            # None without running the kernel -- presumably the base class
            # validates output_mode beforehand; confirm.

except ImportError:
    # custom_ops is not available on this host; skip this provider silently.
    pass
except Exception:
    # Stay best-effort so one broken provider cannot take down module loading,
    # but no longer swallow KeyboardInterrupt/SystemExit, and leave a trace.
    import logging

    logging.getLogger(__name__).warning(
        "Failed to register custom_ops implementation of add_rms_norm_dynamic_quant",
        exc_info=True,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from functools import partial
import torch

from xpu_perf.micro_perf.core.op import ProviderRegistry
from xpu_perf_provider_base_ops.llm_ops.moe_scatter_dynamic_quant import MoeScatterDynamicQuantOp
from xpu_perf.micro_perf.core.utils import static_quant

# The vendor implementation is optional: it is registered only when the
# custom_ops package can be imported.
try:
    from custom_ops import moe_scatter_dynamic_quant

    @ProviderRegistry.register_vendor_impl("moe_scatter_dynamic_quant", "custom_ops")
    class CustomOpsMoeScatterDynamicQuantOp(MoeScatterDynamicQuantOp):
        """``moe_scatter_dynamic_quant`` implementation that calls
        ``custom_ops.moe_scatter_dynamic_quant``.
        """

        def __init__(self, args_dict, backend, *args, **kwargs):
            super().__init__(args_dict, backend, *args, **kwargs)

            self.extra_providers = ["custom_ops"]

        def vendor_impl(self):
            """Reuse the base tensor definitions and install the custom run function."""
            # Keep base semantic tensor definitions, only swap run function.
            super().vendor_impl()
            self._run_func = self.vendor_impl_run

        def vendor_impl_run(self, tensor_mapping):
            """Run the kernel and return the six scatter/quant result tensors.

            When the kernel returns a 6-tuple, its elements are returned;
            otherwise the pre-allocated tensors taken from ``tensor_mapping``
            are returned.
            """
            # get pre-allocated input tensors
            hidden_states = tensor_mapping["hidden_states"]
            experts_smooth_scale = tensor_mapping["experts_smooth_scale"]
            selected_experts = tensor_mapping["selected_experts"]
            moe_weights = tensor_mapping["moe_weights"]

            # get pre-allocated output tensors
            scatter_tokens = tensor_mapping["scatter_tokens"]
            scatter_per_token_scale = tensor_mapping["scatter_per_token_scale"]

            # For ease of reference in code demonstration,
            # all the following tensors are precomputed.
            # Vendors are required to implement the corresponding computation
            # logic during integration.
            scatter_token_id = tensor_mapping["scatter_token_id"]
            scatter_token_weight = tensor_mapping["scatter_token_weight"]
            experts_token_count = tensor_mapping["experts_token_count"]
            experts_token_offset = tensor_mapping["experts_token_offset"]

            # When the smooth scale has one leading entry per expert, keep only
            # the [experts_start_idx, experts_end_idx) slice; otherwise pass it
            # through unchanged.
            if experts_smooth_scale.shape[0] == self.num_experts:
                experts_smooth_scale_per_rank = experts_smooth_scale[
                    self.experts_start_idx:self.experts_end_idx
                ]
            else:
                experts_smooth_scale_per_rank = experts_smooth_scale

            # NOTE(review): the meaning of ``balanced=True`` is defined by
            # custom_ops and is not visible here -- confirm against its API.
            result = moe_scatter_dynamic_quant(
                hidden_states=hidden_states,
                experts_smooth_scale=experts_smooth_scale_per_rank,
                selected_experts=selected_experts,
                moe_weights=moe_weights,
                scatter_tokens=scatter_tokens,
                scatter_per_token_scale=scatter_per_token_scale,
                scatter_token_id=scatter_token_id,
                scatter_token_weight=scatter_token_weight,
                experts_token_count=experts_token_count,
                experts_token_offset=experts_token_offset,
                topk=self.topk,
                ep_size=self.ep_size,
                ep_rank=self.ep_rank,
                dst_dtype=self.dst_torch_dtype,
                balanced=True,
            )

            # Prefer the tensors returned by the kernel when it returns all six.
            if isinstance(result, tuple) and len(result) == 6:
                (
                    scatter_tokens,
                    scatter_per_token_scale,
                    scatter_token_id,
                    scatter_token_weight,
                    experts_token_count,
                    experts_token_offset,
                ) = result

            return (
                scatter_tokens,
                scatter_per_token_scale,
                scatter_token_id,
                scatter_token_weight,
                experts_token_count,
                experts_token_offset,
            )

except ImportError:
    # custom_ops is not available on this host; skip this provider silently.
    pass
except Exception:
    # Stay best-effort so one broken provider cannot take down module loading,
    # but no longer swallow KeyboardInterrupt/SystemExit, and leave a trace.
    import logging

    logging.getLogger(__name__).warning(
        "Failed to register custom_ops implementation of moe_scatter_dynamic_quant",
        exc_info=True,
    )
Loading