tile-ai · kurisu6912 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/src/op/reduce.cc b/src/op/reduce.cc
diff --git a/src/op/reduce.h b/src/op/reduce.h
@@ -130,7 +130,9 @@ class ReduceOpNode : public TileOperatorNode {
   /// Generate initial value for reduction
   PrimExpr MakeInitValue() const;
   /// Generate reduction expression
-  PrimExpr MakeReduce(const PrimExpr &acc, const PrimExpr &b) const;
+  /// `pack_lanes` is 1 for scalar, 2 for add2/max2/min2, etc. (default 1).
+  PrimExpr MakeReduce(const PrimExpr &acc, const PrimExpr &b,
+                      int pack_lanes = 1) const;
   /// Generate codegen reducer string
   std::string MakeCodegenReducer() const;
 };

diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
@@ -3,7 +3,9 @@
 #include <ctime>
 #include <iomanip>
 #include <iostream>
+#include <sstream>
 #include <string>
+#include <unordered_map>
 
 namespace tvm {
 namespace runtime {
@@ -17,6 +19,24 @@ const char *level_strings[] = {
     ": Error: ",   // TVM_LOG_LEVEL_ERROR = 3
     ": Fatal: ",   // TVM_LOG_LEVEL_FATAL = 4
 };
+
+constexpr const char *kSrcPrefix = "/src/";  // marker FileToVLogMapKey strips
+constexpr const size_t kSrcPrefixLength = 5; // character count of kSrcPrefix
+constexpr const char *kDefaultKeyword = "DEFAULT"; // fallback VLOG map key
+
+// Key = text after the last "/src/" (or a leading "src/"); else whole path.
+std::string FileToVLogMapKey(const std::string &filename) {
+  const size_t marker =
+      filename.rfind(kSrcPrefix, std::string::npos, kSrcPrefixLength);
+  if (marker != std::string::npos) {
+    return filename.substr(marker + kSrcPrefixLength);
+  }
+  const std::string bare_prefix{kSrcPrefix + 1}; // prefix minus leading '/'
+  if (filename.compare(0, bare_prefix.size(), bare_prefix) == 0) {
+    return filename.substr(bare_prefix.size());
+  }
+  return filename;
+}
 } // namespace
 
 void LogMessageImpl(const std::string &file, int lineno, int level,
@@ -39,6 +59,75 @@ void LogMessageImpl(const std::string &file, int lineno, int level,
   throw InternalError(file, lineno, message);
 }
 
+// Builds settings from a TVM_LOG_DEBUG spec. nullptr, "" and "0" give default
+// settings; "1" only sets dlog_enabled_; anything else also parses
+// comma-separated <name>=<level> pairs into vlog_level_map_, reporting
+// malformed input via LOG(FATAL).
+TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char *opt_spec) {
+  TvmLogDebugSettings settings;
+  const std::string spec(opt_spec == nullptr ? "" : opt_spec);
+  if (spec.empty() || spec == "0") {
+    return settings;
+  }
+  settings.dlog_enabled_ = true;
+  if (spec == "1") {
+    return settings;
+  }
+  std::istringstream spec_stream(spec);
+  // Position used in diagnostics; when tellg() reports -1, fall back to the
+  // spec length minus the length of the last token read.
+  auto tell_pos = [&](const std::string &last_read) {
+    const int pos = spec_stream.tellg();
+    return pos == -1 ? static_cast<int>(spec.size() - last_read.size()) : pos;
+  };
+  std::string name;
+  while (std::getline(spec_stream, name, '=')) {
+    if (name.empty()) {
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(name)
+                 << ": empty filename";
+      return settings;
+    }
+    // Store under the same normalized key that VerboseEnabledImpl looks up.
+    name = FileToVLogMapKey(name);
+    std::string level;
+    if (!std::getline(spec_stream, level, ',')) {
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": expecting \"=<level>\" after \"" << name << "\"";
+      return settings;
+    }
+    if (level.empty()) {
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": empty level after \"" << name << "\"";
+      return settings;
+    }
+    // The level is valid only if strtol consumed the whole token.
+    char *end_of_level = nullptr;
+    int level_val = static_cast<int>(strtol(level.c_str(), &end_of_level, 10));
+    if (end_of_level != level.c_str() + level.size()) {
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": invalid level: \"" << level << "\"";
+      return settings;
+    }
+    LOG(INFO) << "TVM_LOG_DEBUG enables VLOG statements in '" << name
+              << "' up to level " << level;
+    settings.vlog_level_map_.emplace(name, level_val);
+  }
+  return settings;
+}
+
+// Reports whether a VLOG at `level` is enabled for `filename`: the entry for
+// the file's map key decides if present, otherwise the DEFAULT entry does;
+// with neither entry the result is false.
+bool TvmLogDebugSettings::VerboseEnabledImpl(const std::string &filename,
+                                             int level) const {
+  const auto missing = vlog_level_map_.end();
+  auto entry = vlog_level_map_.find(FileToVLogMapKey(filename));
+  if (entry == missing) {
+    entry = vlog_level_map_.find(kDefaultKeyword);
+  }
+  return entry != missing && level <= entry->second;
+}
+
 } // namespace detail
 } // namespace runtime
 } // namespace tvm
diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -4359,10 +4359,29 @@ void CodeGenTileLangCUDA::VisitExpr_(const ShuffleNode *op,
       os << "uint1{__pack_nv_bfloat162(" << e0 << ", " << e1 << ")}";
     } else {
       enable_fp16_ = true;
-      // __pack_half2 returns __half2 which is 32-bit.
-      // Reinterpret via aggregate initialisation.
-      os << "uint1{*(unsigned*)&(__pack_half2((__half)(" << e0 << "), (__half)("
-         << e1 << ")))}";
+      os << "uint1{tl::pack_half2(" << e0 << ", " << e1 << ")}";
+    }
+    return;
+  }
+  // Handle ExtractElement: extract a scalar lane from a bfloat16x2 / float16x2
+  // vector (produced by packed reduction, etc.). The vector is stored as an
+  // opaque uint1 in the lowered code, but semantically it is a packed pair.
+  DataType vec_t =
+      op->vectors.size() == 1 ? op->vectors[0].dtype() : DataType();
+  bool vec_is_bf16x2 = vec_t.is_bfloat16() && vec_t.lanes() == 2;
+  bool vec_is_fp16x2 = vec_t.is_float16() && vec_t.lanes() == 2;
+  if ((vec_is_bf16x2 || vec_is_fp16x2) && op->vectors.size() == 1 &&
+      op->indices.size() == 1) {
+    int lane = Downcast<IntImm>(op->indices[0])->value;
+    std::string vec = PrintExpr(op->vectors[0]);
+    if (vec_is_bf16x2) {
+      enable_bf16_ = true;
+      os << "bfloat16_t(((nv_bfloat162*)(&(" << vec << ")))->"
+         << (lane == 0 ? "x" : "y") << ")";
+    } else {
+      enable_fp16_ = true;
+      os << "half_t(((half2*)(&(" << vec << ")))->" << (lane == 0 ? "x" : "y")
+         << ")";
     }
     return;
   }

diff --git a/src/tl_templates/cuda/common.h b/src/tl_templates/cuda/common.h
@@ -676,6 +676,13 @@ template <typename T> TL_DEVICE uint1 to_uint1(T v) {
   return r;
 }
 
+// Packs two half_t values into a single uint1 via __pack_half2.
+TL_DEVICE uint1 pack_half2(half_t a, half_t b) {
+  const __half first = static_cast<__half>(a);
+  const __half second = static_cast<__half>(b);
+  return uint1{static_cast<unsigned>(__pack_half2(first, second))};
+}
+
 // --- add2 ----------------------------------------------------------------
 
 TL_DEVICE float2 add2(float2 a, float2 b) {
@@ -959,4 +966,26 @@ TL_DEVICE bfloat16_t shfl_sync(unsigned mask, bfloat16_t val, int srcLane) {
   return reinterpret_cast<bfloat16_t &>(ret16);
 }
 
+// Explicit specializations of the shuffle helpers for uint1 (packed
+// bfloat16x2 / float16x2). uint1 is a 32-bit struct { unsigned x; }, so each
+// one forwards val.x to the matching __shfl_*_sync intrinsic and rewraps it.
+
+template <>
+TL_DEVICE uint1 shfl_xor_sync(unsigned mask, uint1 val, int laneMask) {
+  return uint1{__shfl_xor_sync(mask, val.x, laneMask)};
+}
+
+template <>
+TL_DEVICE uint1 shfl_down_sync(unsigned mask, uint1 val, int delta) {
+  return uint1{__shfl_down_sync(mask, val.x, delta)};
+}
+
+template <> TL_DEVICE uint1 shfl_up_sync(unsigned mask, uint1 val, int delta) {
+  return uint1{__shfl_up_sync(mask, val.x, delta)};
+}
+
+template <> TL_DEVICE uint1 shfl_sync(unsigned mask, uint1 val, int srcLane) {
+  return uint1{__shfl_sync(mask, val.x, srcLane)};
+}
+
 } // namespace tl
diff --git a/src/tl_templates/cuda/reduce.h b/src/tl_templates/cuda/reduce.h
@@ -88,6 +88,48 @@ struct MinOpNan {
   }
 };
 
+struct SumOp_bf16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __nv_bfloat162; // vector type fed to tl::add2
+    return tl::to_uint1(tl::add2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
+struct MaxOp_bf16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __nv_bfloat162; // vector type fed to tl::max2
+    return tl::to_uint1(tl::max2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
+struct MinOp_bf16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __nv_bfloat162; // vector type fed to tl::min2
+    return tl::to_uint1(tl::min2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
+struct SumOp_fp16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __half2; // vector type fed to tl::add2
+    return tl::to_uint1(tl::add2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
+struct MaxOp_fp16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __half2; // vector type fed to tl::max2
+    return tl::to_uint1(tl::max2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
+struct MinOp_fp16x2 {
+  template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
+    using V = __half2; // vector type fed to tl::min2
+    return tl::to_uint1(tl::min2(tl::from_uint1<V>(x), tl::from_uint1<V>(y)));
+  }
+};
+
 struct BitAndOp {
   template <typename T> TL_DEVICE T operator()(T const &x, T const &y) {
     return x & y;

diff --git a/testing/python/language/test_tilelang_language_reduce.py b/testing/python/language/test_tilelang_language_reduce.py
@@ -7,7 +7,6 @@
 
 tilelang.testing.set_random_seed()
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -76,6 +75,8 @@ def _reduce_op(T, op, src, dst, dim, batch=1):
     ("sum", T.float32, 128, 64, "shared", "fragment", 256, 2),
     ("sum", T.float32, 128, 64, "shared", "fragment", 256, 4),
     ("sum", T.float16, 64, 128, "fragment", "fragment", 256, 4),
+    ("sum", T.bfloat16, 128, 128, "fragment", "fragment", 32, 1),
+    ("sum", T.bfloat16, 64, 128, "fragment", "fragment", 256, 4),
     ("max", T.bfloat16, 128, 64, "shared", "fragment", 256, 2),
     ("max", T.float32, 128, 128, "fragment", "fragment", 256, 4),
     ("min", T.float32, 64, 128, "shared", "fragment", 128, 2),