diff --git a/requirements.txt b/requirements.txt
index 3e71feab0..b05282668 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ pandas>=2.2.3
 numba>=0.58.0
 numpy>=1.26.0
 transformers>=4.1,<4.56.0
+neural-compressor-pt @ file:///software/users/dsemiat/dynamic_quant/vllm/neural_compressor_pt-3.5.dev4+ga3204838c-py3-none-any.whl
diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 4223a4219..078b01451 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -54,12 +54,15 @@ def block2batch(tensor, block_mapping, matmul_op=torch.matmul):
 
 def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, batch_size, matmul_av_op, batch2block_matmul_op,
                  block2batch_matmul_op):
+    # print(f"Danny running pipelined_pa {get_config().fused_block_softmax=} {get_config().fused_block_softmax_adjustment=}")
     # When fp32_softmax is enabled attn is left in fp32 after Q@K
     # We can return to native dtype after we renormalize and calculate the adjustments
     if block_bias is not None and attn.dtype != block_bias.dtype:
         block_bias = block_bias.to(dtype=attn.dtype)
     # TODO: w/a with 5D req as the block_softmax kernel does not support 4D attn tensor, which is used in e.g. Granite-3B
     if get_config().fused_block_softmax and get_config().fused_block_softmax_adjustment and attn.dim() == 5:
+        # print("Danny Warning: fused_block_softmax with 5D attn tensor is not supported yet, falling back to unfused path.")
+        # print("Danny using fused_block_softmax without calling the nn.Module member.")
         attn, block_max, block_sums = torch.ops.hpu.block_softmax(attn, block_bias, block_groups)
         if attn.dtype == torch.float32:
             attn = attn.to(value.dtype)
@@ -74,6 +77,7 @@ def pipelined_pa(attn, value, block_bias, block_groups, block_mapping, batch_siz
         block_sums = attn.sum(dim=-1, keepdim=True)
     attn = matmul_av_op(attn, value)
     if get_config().fused_block_softmax_adjustment:
+        # print("Danny using block_softmax_adjustment without calling the nn.Module member.")
         out_shape = list(attn.shape[:3]) + [1] * (attn.dim() - 3)
         rescale = torch.ops.hpu.block_softmax_adjustment(block_max, block_sums.to(block_max.dtype), block_groups,
                                                          batch_size, out_shape).to(attn.dtype)
@@ -476,6 +480,7 @@ def forward(self, hidden_states, expert_routing_table, router_weights, permuted_
         w2_list = [self.w2_list[i].weight.squeeze() for i in experts_range]
 
         if self.moe_n_slice == 1:
+            print(f"VllmMixtureOfExpertsOp: running with single slice, calling moe with {hidden_states.shape=}, {expert_routing_table.shape=}, {router_weights.shape=}, {activation=}")
             return torch.ops.hpu.mixture_of_experts(hidden_states=hidden_states,
                                                     expert_routing_table=expert_routing_table,
                                                     router_weights=router_weights,
diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py
index e5630c0ff..895503cf0 100644
--- a/vllm_gaudi/extension/utils.py
+++ b/vllm_gaudi/extension/utils.py
@@ -26,8 +26,8 @@ class Matmul(torch.nn.Module):
     def __init__(self):
         super(Matmul, self).__init__()
 
-    def forward(self, x, y):
-        return torch.matmul(x, y)
+    def forward(self, x, y, *args, **kwargs):
+        return torch.matmul(x, y, *args, **kwargs)
 
 
 class Softmax(torch.nn.Module):
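Reviewer note (not part of the patch): the debug comments in `pipelined_pa` point out that the fused `torch.ops.hpu.block_softmax` / `block_softmax_adjustment` ops are called directly rather than through patchable `nn.Module` members. As I read the code, what those ops fuse is the standard block-wise softmax recombination: each block keeps its own running max and partial sum, and a per-block rescale factor turns the partial results into a global softmax. Below is a plain-PyTorch sketch of that identity on a single 1-D score vector, for illustration only; it does not claim to match the HPU ops' actual signatures.

```python
import torch

torch.manual_seed(0)
scores = torch.randn(12)              # attention scores for one query
blocks = scores.view(3, 4)            # split into 3 blocks of 4 keys each

block_max = blocks.amax(dim=-1, keepdim=True)      # per-block max
exp_blocks = (blocks - block_max).exp()            # per-block unnormalized softmax
block_sums = exp_blocks.sum(dim=-1, keepdim=True)  # per-block partial sums

global_max = block_max.max()
adjust = (block_max - global_max).exp()            # per-block rescale factor
denom = (block_sums * adjust).sum()                # global softmax denominator
combined = (exp_blocks * adjust / denom).flatten()

# The block-wise result matches a direct softmax over the full score vector.
assert torch.allclose(combined, torch.softmax(scores, dim=-1))
```

In `pipelined_pa`, `block_groups` appears to determine which blocks belong to which sequence, so the reduction above would happen per group rather than over the whole tensor, with the rescale applied after `matmul_av_op(attn, value)`.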
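Reviewer note (not part of the patch): the `Matmul` change in `vllm_gaudi/extension/utils.py` widens the wrapper's signature so that any extra positional or keyword arguments are passed straight through to `torch.matmul`. This keeps call sites working when the module is swapped for a patched replacement (e.g. a quantized matmul installed by neural-compressor) that needs extra arguments. A minimal sketch of the behavior, assuming plain PyTorch; the `ScaledMatmul` subclass is hypothetical and only illustrates why the wider signature helps.

```python
import torch


class Matmul(torch.nn.Module):
    """Thin wrapper so matmul call sites stay patchable as nn.Module members."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y, *args, **kwargs):
        # Any extra arguments are handed straight to torch.matmul (e.g. out=...).
        return torch.matmul(x, y, *args, **kwargs)


class ScaledMatmul(Matmul):
    """Hypothetical drop-in replacement that consumes an extra keyword."""

    def forward(self, x, y, *args, scale=1.0, **kwargs):
        return torch.matmul(x, y, *args, **kwargs) * scale


a, b = torch.randn(2, 4), torch.randn(4, 3)

out = torch.empty(2, 3)
Matmul()(a, b, out=out)                       # extra kwarg reaches torch.matmul
print(ScaledMatmul()(a, b, scale=0.5).shape)  # torch.Size([2, 3])
```

Because `nn.Module.__call__` forwards `*args`/`**kwargs` to `forward`, existing two-argument calls are unaffected by the widened signature.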