Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{% set version = "2.19.1" %}
{% set estimator_version = "2.15.0" %}
{% set build = 2 %}
{% set build = 3 %}

{% if cuda_compiler_version != "None" %}
{% set build = build + 200 %}
Expand Down Expand Up @@ -75,8 +75,10 @@ source:
- patches/0041-Disable-profiler.patch
- patches/0042-bump-h5py-req.patch # [aarch64]
- patches/0043-cross-arch-config.patch # [aarch64 and target_platform != build_platform]
      # backport https://github.com/tensorflow/tensorflow/pull/99046
- patches/99046-Fix-shared-memory-alignment-in-concat-split-GPU-kernels.patch
# backport https://github.com/tensorflow/tensorflow/pull/99364
- patches/0044-Support-for-TFv2.20-to-compile-with-CUDA-v12.9.1.patch
- patches/99364-Support-for-TFv2.20-to-compile-with-CUDA-v12.9.1.diff
- url: https://github.com/tensorflow/estimator/archive/refs/tags/v{{ estimator_version.replace(".rc", "-rc") }}.tar.gz
sha256: 2d7e100b1878084da34b5e23b49a0cbb5ee8a7add74b7dd189a82ada1cf85530
folder: tensorflow-estimator
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
From 6014ea08c6171c62c2a12956a3cf2162be0b6836 Mon Sep 17 00:00:00 2001
From: stevemcgregory <stevemcgregory@gmail.com>
Date: Sat, 16 Aug 2025 09:16:35 -0500
Subject: [PATCH] Fix shared memory alignment in concat/split GPU kernels

Align the dynamic shared memory buffer to the larger of 16 bytes and the
alignment of T/IntType to avoid misaligned-access faults on newer GPUs.
---
tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | 5 ++++-
tensorflow/core/kernels/split_lib_gpu.cu.cc | 5 ++++-
2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 031464dfdd9ca2..a6cece16d20ddf 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -70,7 +70,10 @@ __global__ void concat_variable_kernel(
IntType num_inputs = input_ptr_data.size;

// verbose declaration needed due to template
- GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
+ constexpr size_t kAlignTI =
+ (alignof(T) > alignof(IntType)) ? alignof(T) : alignof(IntType);
+ constexpr size_t kAlign = (kAlignTI < 16) ? 16 : kAlignTI;
+ GPU_DYNAMIC_SHARED_MEM_DECL(kAlign, unsigned char, smem);
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

if (useSmem) {
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 90b28292ac0748..b55845bb4e9f6f 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -120,7 +120,10 @@ __global__ void split_v_kernel(const T* __restrict__ input_ptr,
int num_outputs = output_ptr_data.size;

// verbose declaration needed due to template
- GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
+ constexpr size_t kAlignTI =
+ (alignof(T) > alignof(IntType)) ? alignof(T) : alignof(IntType);
+ constexpr size_t kAlign = (kAlignTI < 16) ? 16 : kAlignTI;
+ GPU_DYNAMIC_SHARED_MEM_DECL(kAlign, unsigned char, smem);
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

if (useSmem) {
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index ae3fe6d1..737e22e5 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -410,7 +410,11 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
num_partitions_);

#if GOOGLE_CUDA
- cub::ConstantInputIterator<int32> values_in(1);
+ #if THRUST_VERSION >= 200802
+ thrust::constant_iterator<int32> values_in(1);
+ #else
+ cub::ConstantInputIterator<int32> values_in(1);
+ #endif
#elif TENSORFLOW_USE_ROCM
using ConstantInputIterator =
::rocprim::constant_iterator<int32, ptrdiff_t>;
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 5eb03ec6..3de25b7a 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -233,6 +233,17 @@ class WhereOutputIterator {
return *(ptr_ + (valid ? (NDIM * n) : 0));
}

+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE reference operator*() const {
+ // Dereference the current pointer
+ return *ptr_;
+ }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_type operator+(std::ptrdiff_t n) const {
+ return self_type(ptr_ + NDIM * n, max_row_);
+ }
+
private:
int64* ptr_;
const Eigen::DenseIndex max_row_;
Loading