Skip to content
Open
36 changes: 32 additions & 4 deletions src/backend/cuda/codegen/codegen_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,10 @@ void CodeGenTileLangCUDA::PrintVecBinaryOp(const std::string &op, DataType t,
tl_func = "min2";
else if (op == "max")
tl_func = "max2";
else if (op == "min_nan")
tl_func = "min2_nan";
else if (op == "max_nan")
tl_func = "max2_nan";

if (!tl_func.empty()) {
// Decompose into lanes/2 independent x2 packed operations.
Expand Down Expand Up @@ -3768,6 +3772,7 @@ bool CodeGenTileLangCUDA::HandleLateIntrinsicCall(const CallNode *op,
} else if (op->op.same_as(tl::add2()) || op->op.same_as(tl::sub2()) ||
op->op.same_as(tl::mul2()) || op->op.same_as(tl::fma2()) ||
op->op.same_as(tl::max2()) || op->op.same_as(tl::min2()) ||
op->op.same_as(tl::max2_nan()) || op->op.same_as(tl::min2_nan()) ||
op->op.same_as(tl::abs2())) {
// Packed x2 element-wise math intrinsics.
//
Expand All @@ -3790,6 +3795,10 @@ bool CodeGenTileLangCUDA::HandleLateIntrinsicCall(const CallNode *op,
op_name = "max2";
else if (op->op.same_as(tl::min2()))
op_name = "min2";
else if (op->op.same_as(tl::max2_nan()))
op_name = "max2_nan";
else if (op->op.same_as(tl::min2_nan()))
op_name = "min2_nan";
else
op_name = "abs2";

Expand Down Expand Up @@ -4427,10 +4436,29 @@ void CodeGenTileLangCUDA::VisitExpr_(const ShuffleNode *op,
os << "uint1{__pack_nv_bfloat162(" << e0 << ", " << e1 << ")}";
} else {
enable_fp16_ = true;
// __pack_half2 returns __half2 which is 32-bit.
// Reinterpret via aggregate initialisation.
os << "uint1{*(unsigned*)&(__pack_half2((__half)(" << e0 << "), (__half)("
<< e1 << ")))}";
os << "uint1{tl::pack_half2(" << e0 << ", " << e1 << ")}";
}
Comment on lines +4439 to +4440
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
rg -n "pack_half2|uint1\\{tl::pack_half2" src/target/codegen_cuda.cc src/tl_templates/cuda/common.h -C2

Repository: tile-ai/tilelang

Length of output: 2857


Remove the extra uint1{} around tl::pack_half2.

tl::pack_half2 at line 680 in src/tl_templates/cuda/common.h already returns uint1, so wrapping it again produces nested initialization uint1{uint1{...}} in the generated code. Other pack operations in the same file use __pack_half2 without extra wrapping.

Proposed fix
-      os << "uint1{tl::pack_half2(" << e0 << ", " << e1 << ")}";
+      os << "tl::pack_half2(" << e0 << ", " << e1 << ")";
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
os << "uint1{tl::pack_half2(" << e0 << ", " << e1 << ")}";
}
os << "tl::pack_half2(" << e0 << ", " << e1 << ")";
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/target/codegen_cuda.cc` around lines 4439 - 4440, The generated code
wraps tl::pack_half2 in an extra uint1{...} causing nested uint1 initializers;
update the emitter in codegen_cuda.cc that currently does os <<
"uint1{tl::pack_half2(" << e0 << ", " << e1 << ")}"; to instead emit os <<
"tl::pack_half2(" << e0 << ", " << e1 << ")"; so the output uses the single
uint1 returned by tl::pack_half2 (also check similar pack emitters to match the
__pack_half2 style).

return;
}
// Handle ExtractElement: extract a scalar lane from a bfloat16x2 / float16x2
// vector (produced by packed reduction, etc.). The vector is stored as an
// opaque uint1 in the lowered code, but semantically it is a packed pair.
DataType vec_t =
op->vectors.size() == 1 ? op->vectors[0].dtype() : DataType();
bool vec_is_bf16x2 = vec_t.is_bfloat16() && vec_t.lanes() == 2;
bool vec_is_fp16x2 = vec_t.is_float16() && vec_t.lanes() == 2;
if ((vec_is_bf16x2 || vec_is_fp16x2) && op->vectors.size() == 1 &&
op->indices.size() == 1) {
int lane = Downcast<IntImm>(op->indices[0])->value;
std::string vec = PrintExpr(op->vectors[0]);
if (vec_is_bf16x2) {
enable_bf16_ = true;
os << "bfloat16_t(((nv_bfloat162*)(&(" << vec << ")))->"
<< (lane == 0 ? "x" : "y") << ")";
} else {
enable_fp16_ = true;
os << "half_t(((half2*)(&(" << vec << ")))->" << (lane == 0 ? "x" : "y")
<< ")";
}
return;
}
Expand Down
Loading