@@ -57,31 +57,30 @@ Notice how easy it would be to change the precision (for example, `double` to `h
57
57
#include "kernel_float.h"
58
58
namespace kf = kernel_float;
59
59
60
- __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<float, 2>* output) {
60
+ __global__ void kernel(kf::vec_ptr<const half, 2> input, int constant, kf::vec_ptr<float, 2> output) {
61
61
int i = blockIdx.x * blockDim.x + threadIdx.x;
62
- output[i] = input[i] + kf::cast<half>(constant);
62
+ output[i] += input[i] * constant;
63
63
}
64
64
65
65
```
66
66
67
67
Here is how the same kernel would look for CUDA without Kernel Float.
68
68
69
69
```cpp
70
- __global__ void kernel(const __half * input, float constant, float* output) {
70
+ __global__ void kernel(const half * input, double constant, float* output) {
71
71
int i = blockIdx.x * blockDim.x + threadIdx.x;
72
72
__half in0 = input[2 * i + 0];
73
73
__half in1 = input[2 * i + 1];
74
74
__half2 a = __halves2half2(in0, in1);
75
- float b = float(constant);
76
- __half c = __float2half(b);
77
- __half2 d = __half2half2(c);
78
- __half2 e = __hadd2(a, d);
79
- __half f = __low2half(e);
80
- __half g = __high2half(e);
81
- float out0 = __half2float(f);
82
- float out1 = __half2float(g);
83
- output[2 * i + 0] = out0;
84
- output[2 * i + 1] = out1;
75
+ __half b = __int2half_rn(constant);
76
+ __half2 c = __half2half2(b);
77
+ __half2 d = __hmul2(a, c);
78
+ __half e = __low2half(d);
79
+ __half f = __high2half(d);
80
+ float out0 = __half2float(e);
81
+ float out1 = __half2float(f);
82
+ output[2 * i + 0] += out0;
83
+ output[2 * i + 1] += out1;
85
84
}
86
85
87
86
```
0 commit comments