Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{% set version = "2.19.1" %}
{% set estimator_version = "2.15.0" %}
{% set build = 2 %}
{% set build = 3 %}

{% if cuda_compiler_version != "None" %}
{% set build = build + 200 %}
Expand Down Expand Up @@ -75,8 +75,10 @@ source:
- patches/0041-Disable-profiler.patch
- patches/0042-bump-h5py-req.patch # [aarch64]
- patches/0043-cross-arch-config.patch # [aarch64 and target_platform != build_platform]
      # backport https://github.com/tensorflow/tensorflow/pull/99046
- patches/99046-Fix-shared-memory-alignment-in-concat-split-GPU-kernels.patch
# backport https://github.com/tensorflow/tensorflow/pull/99364
- patches/0044-Support-for-TFv2.20-to-compile-with-CUDA-v12.9.1.patch
- patches/99364-Support-for-TFv2.20-to-compile-with-CUDA-v12.9.1.diff
- url: https://github.com/tensorflow/estimator/archive/refs/tags/v{{ estimator_version.replace(".rc", "-rc") }}.tar.gz
sha256: 2d7e100b1878084da34b5e23b49a0cbb5ee8a7add74b7dd189a82ada1cf85530
folder: tensorflow-estimator
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
From 6014ea08c6171c62c2a12956a3cf2162be0b6836 Mon Sep 17 00:00:00 2001
From: stevemcgregory <stevemcgregory@gmail.com>
Date: Sat, 16 Aug 2025 09:16:35 -0500
Subject: [PATCH] Fix shared memory alignment in concat/split GPU kernels

Align the dynamic shared memory buffer to the larger of 16 bytes and the
alignment of T/IntType to avoid misaligned-access faults on newer GPUs.
---
tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | 5 ++++-
tensorflow/core/kernels/split_lib_gpu.cu.cc | 5 ++++-
2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 031464dfdd9ca2..a6cece16d20ddf 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -70,7 +70,10 @@ __global__ void concat_variable_kernel(
IntType num_inputs = input_ptr_data.size;

// verbose declaration needed due to template
- GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
+ constexpr size_t kAlignTI =
+ (alignof(T) > alignof(IntType)) ? alignof(T) : alignof(IntType);
+ constexpr size_t kAlign = (kAlignTI < 16) ? 16 : kAlignTI;
+ GPU_DYNAMIC_SHARED_MEM_DECL(kAlign, unsigned char, smem);
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

if (useSmem) {
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 90b28292ac0748..b55845bb4e9f6f 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -120,7 +120,10 @@ __global__ void split_v_kernel(const T* __restrict__ input_ptr,
int num_outputs = output_ptr_data.size;

// verbose declaration needed due to template
- GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(T), unsigned char, smem);
+ constexpr size_t kAlignTI =
+ (alignof(T) > alignof(IntType)) ? alignof(T) : alignof(IntType);
+ constexpr size_t kAlign = (kAlignTI < 16) ? 16 : kAlignTI;
+ GPU_DYNAMIC_SHARED_MEM_DECL(kAlign, unsigned char, smem);
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);

if (useSmem) {
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index ae3fe6d1..737e22e5 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -410,7 +410,11 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
num_partitions_);

#if GOOGLE_CUDA
- cub::ConstantInputIterator<int32> values_in(1);
+ #if THRUST_VERSION >= 200802
+ thrust::constant_iterator<int32> values_in(1);
+ #else
+ cub::ConstantInputIterator<int32> values_in(1);
+ #endif
#elif TENSORFLOW_USE_ROCM
using ConstantInputIterator =
::rocprim::constant_iterator<int32, ptrdiff_t>;
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 5eb03ec6..3de25b7a 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -233,6 +233,17 @@ class WhereOutputIterator {
return *(ptr_ + (valid ? (NDIM * n) : 0));
}

+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE reference operator*() const {
+ // Dereference the current pointer
+ return *ptr_;
+ }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_type operator+(std::ptrdiff_t n) const {
+ return self_type(ptr_ + NDIM * n, max_row_);
+ }
+
private:
int64* ptr_;
const Eigen::DenseIndex max_row_;
Loading