Skip to content

Commit 2fcc3f7

Browse files
Luke Iwanski authored and benoitsteiner committed
Improvements to the SYCL device support
* Registers FloorDiv, FloorMod and SoftMax Ops for SYCL device
1 parent 426eec9 commit 2fcc3f7

File tree

9 files changed

+179
-5
lines changed

9 files changed

+179
-5
lines changed

tensorflow/core/kernels/concat_lib.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ void ConcatGPU(
3838
Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
3939

4040
#endif // GOOGLE_CUDA
41+
42+
#ifdef TENSORFLOW_USE_SYCL
43+
// SYCL variant of the concat routines declared in this header. Takes the SYCL
// device `d`, a list of 2-D input matrices, and the 2-D matrix `output` that
// the concatenated data is written to. Only declared when
// TENSORFLOW_USE_SYCL is defined.
template <typename T>
44+
void ConcatSYCL(const Eigen::SyclDevice& d,
45+
const std::vector<
46+
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
47+
typename TTypes<T, 2>::Matrix* output);
48+
#endif // TENSORFLOW_USE_SYCL
4149
} // namespace tensorflow
4250

4351
#endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_

tensorflow/core/kernels/concat_lib_cpu.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,23 @@ REGISTER(qint16)
7474
REGISTER(qint32)
7575
REGISTER(bfloat16)
7676

77+
#ifdef TENSORFLOW_USE_SYCL
78+
template <typename T>
79+
void ConcatSYCL(const Eigen::SyclDevice& d,
80+
const std::vector<
81+
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
82+
typename TTypes<T, 2>::Matrix* output) {
83+
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
84+
output);
85+
}
86+
// Explicitly instantiates the ConcatSYCL<T> template defined in this file for
// every type that TF_CALL_GPU_NUMBER_TYPES applies REGISTER_SYCL to. The macro
// is removed again with #undef once the instantiations have been emitted.
#define REGISTER_SYCL(T) \
87+
template void ConcatSYCL<T>( \
88+
const Eigen::SyclDevice&, \
89+
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
90+
typename TTypes<T, 2>::Matrix* output);
91+
92+
TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL)
93+
94+
#undef REGISTER_SYCL
95+
#endif // TENSORFLOW_USE_SYCL
7796
} // namespace tensorflow

tensorflow/core/kernels/concat_lib_cpu.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,4 +126,39 @@ void ConcatCPUImpl(
126126
cost_per_unit, work);
127127
}
128128

129+
#ifdef TENSORFLOW_USE_SYCL
130+
template <typename T, typename ElementCopier>
131+
void ConcatSYCLImpl(
132+
const Eigen::SyclDevice& d,
133+
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
134+
inputs,
135+
int64 cost_per_unit, ElementCopier copier,
136+
typename TTypes<T, 2>::Matrix* output) {
137+
size_t num_inputs = inputs.size();
138+
139+
std::vector<ptrdiff_t> sizes;
140+
sizes.reserve(num_inputs);
141+
int64 row_size = 0;
142+
for (const auto& input : inputs) {
143+
sizes.push_back(input->dimension(1));
144+
row_size += sizes.back();
145+
}
146+
147+
T* out = &(*output)(0, 0);
148+
std::vector<const T*> inp;
149+
inp.reserve(num_inputs);
150+
for (const auto& input : inputs) {
151+
inp.push_back(&(*input)(0, 0));
152+
}
153+
const int64 dim0 = output->dimension(0);
154+
for (int64 i = 0; i < dim0; ++i) {
155+
for (int64 j = 0; j < num_inputs; ++j) {
156+
auto size = sizes[j];
157+
d.memcpy(out, inp[j], size * sizeof(T));
158+
out += size;
159+
inp[j] += size;
160+
}
161+
}
162+
}
163+
#endif // TENSORFLOW_USE_SYCL
129164
} // namespace tensorflow

tensorflow/core/kernels/concat_op.cc

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,12 @@ class ConcatBaseOp : public OpKernel {
137137
return;
138138
}
139139
#endif // GOOGLE_CUDA
140+
#ifdef TENSORFLOW_USE_SYCL
141+
if (std::is_same<Device, SYCLDevice>::value) {
142+
ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
143+
return;
144+
}
145+
#endif // TENSORFLOW_USE_SYCL
140146
ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
141147
}
142148
}
@@ -210,6 +216,39 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
210216

211217
#endif // GOOGLE_CUDA
212218

219+
#ifdef TENSORFLOW_USE_SYCL
220+
// Registers the "Concat" and "ConcatV2" kernels for DEVICE_SYCL for a given
// element `type`, using the SYCLDevice specializations of ConcatOp/ConcatV2Op.
// The axis input ("concat_dim" for Concat, "axis" for ConcatV2) is kept in
// host memory. Applied below to every type in TF_CALL_GPU_NUMBER_TYPES.
#define REGISTER_SYCL(type) \
221+
REGISTER_KERNEL_BUILDER(Name("Concat") \
222+
.Device(DEVICE_SYCL) \
223+
.TypeConstraint<type>("T") \
224+
.HostMemory("concat_dim"), \
225+
ConcatOp<SYCLDevice, type>) \
226+
REGISTER_KERNEL_BUILDER(Name("ConcatV2") \
227+
.Device(DEVICE_SYCL) \
228+
.TypeConstraint<type>("T") \
229+
.TypeConstraint<int32>("Tidx") \
230+
.HostMemory("axis"), \
231+
ConcatV2Op<SYCLDevice, type>)
232+
233+
TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL);
234+
// int32 is registered separately: the values, the axis input and the output
// are all placed in host memory, and the CPUDevice kernel does the work even
// though the kernel is registered under DEVICE_SYCL.
REGISTER_KERNEL_BUILDER(Name("Concat")
235+
.Device(DEVICE_SYCL)
236+
.TypeConstraint<int32>("T")
237+
.HostMemory("concat_dim")
238+
.HostMemory("values")
239+
.HostMemory("output"),
240+
ConcatOp<CPUDevice, int32>);
241+
REGISTER_KERNEL_BUILDER(Name("ConcatV2")
242+
.Device(DEVICE_SYCL)
243+
.TypeConstraint<int32>("T")
244+
.TypeConstraint<int32>("Tidx")
245+
.HostMemory("values")
246+
.HostMemory("axis")
247+
.HostMemory("output"),
248+
ConcatV2Op<CPUDevice, int32>);
249+
#undef REGISTER_SYCL
250+
#endif // TENSORFLOW_USE_SYCL
251+
213252
class ConcatOffsetOp : public OpKernel {
214253
public:
215254
explicit ConcatOffsetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

tensorflow/core/kernels/cwise_op_floor_div.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
4040
.TypeConstraint<int32>("T"),
4141
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
4242
#endif
43+
44+
#ifdef TENSORFLOW_USE_SYCL
45+
// Registers the int32 "FloorDiv" kernel under DEVICE_SYCL. Both inputs (x, y)
// and the output (z) are placed in host memory and the CPUDevice BinaryOp with
// safe_floor_div<int32> performs the computation.
REGISTER_KERNEL_BUILDER(Name("FloorDiv")
46+
.Device(DEVICE_SYCL)
47+
.HostMemory("x")
48+
.HostMemory("y")
49+
.HostMemory("z")
50+
.TypeConstraint<int32>("T"),
51+
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
52+
#endif // TENSORFLOW_USE_SYCL
4353
} // namespace tensorflow

tensorflow/core/kernels/cwise_op_floor_mod.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
3131
.TypeConstraint<int32>("T"),
3232
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
3333
#endif
34+
35+
#ifdef TENSORFLOW_USE_SYCL
36+
// Registers the int32 "FloorMod" kernel under DEVICE_SYCL. Both inputs (x, y)
// and the output (z) are placed in host memory and the CPUDevice BinaryOp with
// safe_floor_mod<int32> performs the computation.
REGISTER_KERNEL_BUILDER(Name("FloorMod")
37+
.Device(DEVICE_SYCL)
38+
.HostMemory("x")
39+
.HostMemory("y")
40+
.HostMemory("z")
41+
.TypeConstraint<int32>("T"),
42+
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
43+
#endif // TENSORFLOW_USE_SYCL
3444
} // namespace tensorflow

tensorflow/core/kernels/reverse_op.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ namespace tensorflow {
3333

3434
typedef Eigen::ThreadPoolDevice CPUDevice;
3535
typedef Eigen::GpuDevice GPUDevice;
36+
#ifdef TENSORFLOW_USE_SYCL
37+
// Short alias for the Eigen SYCL device; used as the Device template argument
// of the SYCL kernel registrations in this file.
typedef Eigen::SyclDevice SYCLDevice;
38+
#endif // TENSORFLOW_USE_SYCL
3639

3740
namespace {
3841

@@ -351,4 +354,36 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
351354
ReverseV2Op<CPUDevice, int32>);
352355
#endif // GOOGLE_CUDA
353356

357+
#ifdef TENSORFLOW_USE_SYCL
358+
// Registers the "Reverse" and "ReverseV2" kernels for DEVICE_SYCL for element
// type T, using the SYCLDevice specializations. The input naming the
// dimensions to reverse ("dims" for Reverse, "axis" for ReverseV2) is kept in
// host memory. Only float is registered this way (see TF_CALL_float below).
// NOTE(review): unlike REGISTER_SYCL in concat_op.cc, this macro is not
// #undef'd within the lines shown here -- confirm whether that is intended.
#define REGISTER_SYCL_KERNELS(T) \
359+
REGISTER_KERNEL_BUILDER(Name("Reverse") \
360+
.Device(DEVICE_SYCL) \
361+
.TypeConstraint<T>("T") \
362+
.HostMemory("dims"), \
363+
ReverseOp<SYCLDevice, T>) \
364+
REGISTER_KERNEL_BUILDER(Name("ReverseV2") \
365+
.Device(DEVICE_SYCL) \
366+
.TypeConstraint<T>("T") \
367+
.TypeConstraint<int32>("Tidx") \
368+
.HostMemory("axis"), \
369+
ReverseV2Op<SYCLDevice, T>)
370+
TF_CALL_float(REGISTER_SYCL_KERNELS);
371+
372+
// int32 is registered separately: the input tensor, the dims/axis input and
// the output are all placed in host memory, and the CPUDevice kernel does the
// work even though the kernel is registered under DEVICE_SYCL.
REGISTER_KERNEL_BUILDER(Name("Reverse")
373+
.Device(DEVICE_SYCL)
374+
.TypeConstraint<int32>("T")
375+
.HostMemory("tensor")
376+
.HostMemory("dims")
377+
.HostMemory("output"),
378+
ReverseOp<CPUDevice, int32>);
379+
REGISTER_KERNEL_BUILDER(Name("ReverseV2")
380+
.Device(DEVICE_SYCL)
381+
.TypeConstraint<int32>("T")
382+
.TypeConstraint<int32>("Tidx")
383+
.HostMemory("tensor")
384+
.HostMemory("axis")
385+
.HostMemory("output"),
386+
ReverseV2Op<CPUDevice, int32>);
387+
#endif // TENSORFLOW_USE_SYCL
388+
354389
} // namespace tensorflow

tensorflow/core/kernels/softmax_op.cc

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,27 @@ namespace tensorflow {
2828

2929
typedef Eigen::ThreadPoolDevice CPUDevice;
3030
typedef Eigen::GpuDevice GPUDevice;
31+
#ifdef TENSORFLOW_USE_SYCL
32+
// Short alias for the Eigen SYCL device; used below for the SoftmaxFunctor
// specialization and the SYCL Softmax kernel registration.
typedef Eigen::SyclDevice SYCLDevice;
33+
#endif // TENSORFLOW_USE_SYCL
3134

3235
// Partial specialization for a CPUDevice, that uses the Eigen implementation
3336
// from SoftmaxEigenImpl.
3437
namespace functor {
35-
template <typename T>
36-
struct SoftmaxFunctor<CPUDevice, T> {
37-
void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
38+
template <typename Device, typename T>
39+
struct SoftmaxFunctorBase {
40+
void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
3841
typename TTypes<T>::Matrix softmax, const bool log) {
39-
SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax, log);
42+
SoftmaxEigenImpl<Device, T>::Compute(d, logits, softmax, log);
4043
}
4144
};
45+
// CPU specialization: inherits operator() from SoftmaxFunctorBase, which
// forwards to SoftmaxEigenImpl<CPUDevice, T>::Compute.
template <typename T>
46+
struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
47+
48+
#ifdef TENSORFLOW_USE_SYCL
49+
// SYCL specialization: inherits operator() from SoftmaxFunctorBase, which
// forwards to SoftmaxEigenImpl<SYCLDevice, T>::Compute.
template <typename T>
50+
struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
51+
#endif // TENSORFLOW_USE_SYCL
4252
} // namespace functor
4353

4454
#define REGISTER_CPU(T) \
@@ -76,4 +86,10 @@ REGISTER_KERNEL_BUILDER(
7686
SoftmaxOp<GPUDevice, float>);
7787
#endif // GOOGLE_CUDA
7888

89+
#ifdef TENSORFLOW_USE_SYCL
90+
// Registers the float "Softmax" kernel for DEVICE_SYCL, backed by the
// SYCLDevice instantiation of SoftmaxOp. No other element type is registered
// for SYCL in the lines shown here.
REGISTER_KERNEL_BUILDER(
91+
Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
92+
SoftmaxOp<SYCLDevice, float>);
93+
#endif // TENSORFLOW_USE_SYCL
94+
7995
} // namespace tensorflow

third_party/sycl/crosstool/computecpp.tpl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,13 @@ def main():
7070
x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_device_compiler_flags )
7171
if(x == 0):
7272
# dont want that in case of compiling with computecpp first
73-
host_compiler_flags = [flag.replace('-c', '--include') for flag in compiler_flags
73+
host_compiler_flags = [flag for flag in compiler_flags
7474
if not flag.startswith(('-MF', '-MD',))
7575
if not '.d' in flag
7676
]
7777

78+
host_compiler_flags[host_compiler_flags.index('-c')] = "--include"
79+
7880
host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags
7981
x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
8082
return x

0 commit comments

Comments
 (0)