diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 382fd9fe84e..1be33f5ecc1 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -453,7 +453,7 @@ def fp16_div_impl(context, builder, sig, args): def fp16_div(x, y): return cuda.fp16.hdiv(x, y) - return context.compile_internal(builder, fp16_div, sig, args, ) + return context.compile_internal(builder, fp16_div, sig, args) _fp16_cmp = """{{ diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py index 3cfcb2bc2e2..12d9715b62e 100644 --- a/numba/cuda/cudamath.py +++ b/numba/cuda/cudamath.py @@ -13,29 +13,18 @@ @infer_global(math.asinh) @infer_global(math.atan) @infer_global(math.atanh) -@infer_global(math.ceil) -@infer_global(math.cos) @infer_global(math.cosh) @infer_global(math.degrees) @infer_global(math.erf) @infer_global(math.erfc) -@infer_global(math.exp) @infer_global(math.expm1) -@infer_global(math.fabs) -@infer_global(math.floor) @infer_global(math.gamma) @infer_global(math.lgamma) -@infer_global(math.log) -@infer_global(math.log2) -@infer_global(math.log10) @infer_global(math.log1p) @infer_global(math.radians) -@infer_global(math.sin) @infer_global(math.sinh) -@infer_global(math.sqrt) -@infer_global(math.tan) @infer_global(math.tanh) -@infer_global(math.trunc) +@infer_global(math.tan) class Math_unary(ConcreteTemplate): cases = [ signature(types.float64, types.int64), @@ -45,6 +34,27 @@ class Math_unary(ConcreteTemplate): ] +@infer_global(math.sin) +@infer_global(math.cos) +@infer_global(math.ceil) +@infer_global(math.floor) +@infer_global(math.sqrt) +@infer_global(math.log) +@infer_global(math.log2) +@infer_global(math.log10) +@infer_global(math.exp) +@infer_global(math.fabs) +@infer_global(math.trunc) +class Math_unary_with_fp16(ConcreteTemplate): + cases = [ + signature(types.float64, types.int64), + signature(types.float64, types.uint64), + signature(types.float32, types.float32), + signature(types.float64, types.float64), + signature(types.float16, types.float16), + ] + + @infer_global(math.atan2) class Math_atan2(ConcreteTemplate): key = math.atan2 diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py index dea30a0deb8..9dcd6dbefa0 100644 --- a/numba/cuda/mathimpl.py +++ b/numba/cuda/mathimpl.py @@ -5,6 +5,7 @@ from numba.core.imputils import Registry from numba.types import float32, float64, int64, uint64 from numba.cuda import libdevice +from numba import cuda registry = Registry() lower = registry.lower @@ -42,6 +43,7 @@ unarys += [('atanh', 'atanhf', math.atanh)] unarys += [('tan', 'tanf', math.tan)] unarys += [('tanh', 'tanhf', math.tanh)] +unarys += [('trunc', 'truncf', math.trunc)] unarys_fastmath = {} unarys_fastmath['cosf'] = 'fast_cosf' @@ -88,6 +90,94 @@ def math_isfinite_int(context, builder, sig, args): return context.get_constant(types.boolean, 1) +@lower(math.sin, types.float16) +def fp16_sin_impl(context, builder, sig, args): + def fp16_sin(x): + return cuda.fp16.hsin(x) + + return context.compile_internal(builder, fp16_sin, sig, args) + + +@lower(math.cos, types.float16) +def fp16_cos_impl(context, builder, sig, args): + def fp16_cos(x): + return cuda.fp16.hcos(x) + + return context.compile_internal(builder, fp16_cos, sig, args) + + +@lower(math.log, types.float16) +def fp16_log_impl(context, builder, sig, args): + def fp16_log(x): + return cuda.fp16.hlog(x) + + return context.compile_internal(builder, fp16_log, sig, args) + + +@lower(math.log10, types.float16) +def fp16_log10_impl(context, builder, sig, args): + def fp16_log10(x): + return cuda.fp16.hlog10(x) + + return context.compile_internal(builder, fp16_log10, sig, args) + + +@lower(math.log2, types.float16) +def fp16_log2_impl(context, builder, sig, args): + def fp16_log2(x): + return cuda.fp16.hlog2(x) + + return context.compile_internal(builder, fp16_log2, sig, args) + + +@lower(math.exp, types.float16) +def fp16_exp_impl(context, builder, sig, args): + def fp16_exp(x): + return cuda.fp16.hexp(x) + + return context.compile_internal(builder, fp16_exp, sig, args) + + +@lower(math.floor, types.float16) +def fp16_floor_impl(context, builder, sig, args): + def fp16_floor(x): + return cuda.fp16.hfloor(x) + + return context.compile_internal(builder, fp16_floor, sig, args) + + +@lower(math.ceil, types.float16) +def fp16_ceil_impl(context, builder, sig, args): + def fp16_ceil(x): + return cuda.fp16.hceil(x) + + return context.compile_internal(builder, fp16_ceil, sig, args) + + +@lower(math.sqrt, types.float16) +def fp16_sqrt_impl(context, builder, sig, args): + def fp16_sqrt(x): + return cuda.fp16.hsqrt(x) + + return context.compile_internal(builder, fp16_sqrt, sig, args) + + +@lower(math.fabs, types.float16) +def fp16_fabs_impl(context, builder, sig, args): + def fp16_fabs(x): + return cuda.fp16.habs(x) + + return context.compile_internal(builder, fp16_fabs, sig, args) + + +@lower(math.trunc, types.float16) +def fp16_trunc_impl(context, builder, sig, args): + def fp16_trunc(x): + return cuda.fp16.htrunc(x) + + return context.compile_internal(builder, fp16_trunc, sig, args) + + def impl_boolean(key, ty, libfunc): def lower_boolean_impl(context, builder, sig, args): libfunc_impl = context.get_function(libfunc, diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index 043d9d44034..2ee39ab8e2c 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -1,5 +1,9 @@ import numpy as np -from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim +from numba.cuda.testing import (skip_unless_cc_53, + skip_unless_cuda_python, + unittest, + CUDATestCase, + skip_on_cudasim) from numba.np import numpy_support from numba import cuda, float32, float64, int32, vectorize, void, int64 import math @@ -195,6 +199,11 @@ def math_radians(A, B): B[i] = math.radians(A[i]) +def math_trunc(A, B): + i = cuda.grid(1) + B[i] = math.trunc(A[i]) + + def math_pow_binop(A, B, C): i = cuda.grid(1) C[i] = A[i] ** B[i] @@ -206,6 +215,9 @@ def math_mod_binop(A, B, C): class TestCudaMath(CUDATestCase): + def unary_template_float16(self, func, npfunc, start=0, stop=1): + self.unary_template(func, npfunc, np.float16, np.float16, start, stop) + def unary_template_float32(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float32, np.float32, start, stop) @@ -233,8 +245,10 @@ def unary_template(self, func, npfunc, npdtype, nprestype, start, stop): # the tightest under which the tests will pass. if npdtype == np.float64: rtol = 1e-13 - else: + elif npdtype == np.float32: rtol = 1e-6 + else: + rtol = 1e-3 np.testing.assert_allclose(npfunc(A), B, rtol=rtol) def unary_bool_special_values(self, func, npfunc, npdtype, npmtype): @@ -369,6 +383,21 @@ def test_math_cos(self): self.unary_template_int64(math_cos, np.cos) self.unary_template_uint64(math_cos, np.cos) + @skip_unless_cc_53 + @skip_unless_cuda_python('NVIDIA Binding needed for NVRTC') + def test_math_fp16(self): + self.unary_template_float16(math_sin, np.sin) + self.unary_template_float16(math_cos, np.cos) + self.unary_template_float16(math_exp, np.exp) + self.unary_template_float16(math_log, np.log, start=1) + self.unary_template_float16(math_log2, np.log2, start=1) + self.unary_template_float16(math_log10, np.log10, start=1) + self.unary_template_float16(math_fabs, np.fabs, start=-1) + self.unary_template_float16(math_sqrt, np.sqrt) + self.unary_template_float16(math_ceil, np.ceil) + self.unary_template_float16(math_floor, np.floor) + self.unary_template_float16(math_trunc, np.trunc) + #--------------------------------------------------------------------------- # test_math_sin @@ -622,6 +651,15 @@ def test_math_floor(self): self.unary_template_int64(math_floor, np.floor) self.unary_template_uint64(math_floor, np.floor) + #--------------------------------------------------------------------------- + # test_math_trunc + + def test_math_trunc(self): + self.unary_template_float32(math_trunc, np.trunc) + self.unary_template_float64(math_trunc, np.trunc) + self.unary_template_int64(math_trunc, np.trunc) + self.unary_template_uint64(math_trunc, np.trunc) + #--------------------------------------------------------------------------- # test_math_copysign