From 3130c30a1242edad8ff3dfaf5b52612d20f2fd21 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Wed, 16 Nov 2022 22:50:25 -0500 Subject: [PATCH 1/8] Add lowering of initial set of standard math functions that map to fp16 intrinsics --- numba/cuda/cudaimpl.py | 88 ++++++++++++++++++++++++++++ numba/cuda/tests/cudapy/test_math.py | 14 +++++ 2 files changed, 102 insertions(+) diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 382fd9fe84e..aa8a5b77c04 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -456,6 +456,94 @@ def fp16_div(x, y): return context.compile_internal(builder, fp16_div, sig, args, ) +@lower(math.sin, types.float16) +def fp16_sin_impl(context, builder, sig, args): + def fp16_sin(x): + return cuda.fp16.hsin(x) + + return context.compile_internal(builder, fp16_sin, sig, args, ) + + +@lower(math.cos, types.float16) +def fp16_cos_impl(context, builder, sig, args): + def fp16_cos(x): + return cuda.fp16.hcos(x) + + return context.compile_internal(builder, fp16_cos, sig, args, ) + + +@lower(math.tan, types.float16) +def fp16_tan_impl(context, builder, sig, args): + def fp16_tan(x): + return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x)) + + return context.compile_internal(builder, fp16_tan, sig, args, ) + + +@lower(math.log, types.float16) +def fp16_log_impl(context, builder, sig, args): + def fp16_log(x): + return cuda.fp16.hlog(x) + + return context.compile_internal(builder, fp16_log, sig, args, ) + + +@lower(math.log10, types.float16) +def fp16_log10_impl(context, builder, sig, args): + def fp16_log10(x): + return cuda.fp16.hlog10(x) + + return context.compile_internal(builder, fp16_log10, sig, args, ) + + +@lower(math.log2, types.float16) +def fp16_log2_impl(context, builder, sig, args): + def fp16_log2(x): + return cuda.fp16.hlog2(x) + + return context.compile_internal(builder, fp16_log2, sig, args, ) + + +@lower(math.exp, types.float16) +def fp16_exp_impl(context, builder, sig, args): + def fp16_exp(x): + 
return cuda.fp16.hexp(x) + + return context.compile_internal(builder, fp16_exp, sig, args, ) + + +@lower(math.floor, types.float16) +def fp16_floor_impl(context, builder, sig, args): + def fp16_floor(x): + return cuda.fp16.hfloor(x) + + return context.compile_internal(builder, fp16_floor, sig, args, ) + + +@lower(math.ceil, types.float16) +def fp16_ceil_impl(context, builder, sig, args): + def fp16_ceil(x): + return cuda.fp16.hceil(x) + + return context.compile_internal(builder, fp16_ceil, sig, args, ) + + +@lower(math.sqrt, types.float16) +def fp16_sqrt_impl(context, builder, sig, args): + def fp16_sqrt(x): + return cuda.fp16.hsqrt(x) + + return context.compile_internal(builder, fp16_sqrt, sig, args, ) + + +@lower(math.fabs, types.float16) +def fp16_fabs_impl(context, builder, sig, args): + def fp16_fabs(x): + return cuda.fp16.habs(x) + + return context.compile_internal(builder, fp16_fabs, sig, args, ) + + _fp16_cmp = """{{ .reg .pred __$$f16_cmp_tmp; setp.{op}.f16 __$$f16_cmp_tmp, $1, $2; diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index 043d9d44034..477f5d2464a 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -206,6 +206,9 @@ def math_mod_binop(A, B, C): class TestCudaMath(CUDATestCase): + def unary_template_float16(self, func, npfunc, start=0, stop=1): + self.unary_template(func, npfunc, np.float16, np.float16, start, stop) + def unary_template_float32(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float32, np.float32, start, stop) @@ -364,6 +367,7 @@ def test_math_atanh(self): # test_math_cos def test_math_cos(self): + self.unary_template_float16(math_cos, np.cos) self.unary_template_float32(math_cos, np.cos) self.unary_template_float64(math_cos, np.cos) self.unary_template_int64(math_cos, np.cos) @@ -373,6 +377,7 @@ def test_math_cos(self): # test_math_sin def test_math_sin(self): + self.unary_template_float16(math_sin, np.sin) 
self.unary_template_float32(math_sin, np.sin) self.unary_template_float64(math_sin, np.sin) self.unary_template_int64(math_sin, np.sin) @@ -382,6 +387,7 @@ def test_math_sin(self): # test_math_tan def test_math_tan(self): + self.unary_template_float16(math_tan, np.tan) self.unary_template_float32(math_tan, np.tan) self.unary_template_float64(math_tan, np.tan) self.unary_template_int64(math_tan, np.tan) @@ -451,6 +457,7 @@ def ufunc(x): # test_math_exp def test_math_exp(self): + self.unary_template_float16(math_exp, np.exp) self.unary_template_float32(math_exp, np.exp) self.unary_template_float64(math_exp, np.exp) self.unary_template_int64(math_exp, np.exp) @@ -469,6 +476,7 @@ def test_math_expm1(self): # test_math_fabs def test_math_fabs(self): + self.unary_template_float16(math_fabs, np.fabs, start=-1) self.unary_template_float32(math_fabs, np.fabs, start=-1) self.unary_template_float64(math_fabs, np.fabs, start=-1) self.unary_template_int64(math_fabs, np.fabs, start=-1) @@ -502,6 +510,7 @@ def ufunc(x): # test_math_log def test_math_log(self): + self.unary_template_float16(math_log, np.log, start=1) self.unary_template_float32(math_log, np.log, start=1) self.unary_template_float64(math_log, np.log, start=1) self.unary_template_int64(math_log, np.log, start=1) @@ -511,6 +520,7 @@ def test_math_log(self): # test_math_log2 def test_math_log2(self): + self.unary_template_float16(math_log2, np.log2, start=1) self.unary_template_float32(math_log2, np.log2, start=1) self.unary_template_float64(math_log2, np.log2, start=1) self.unary_template_int64(math_log2, np.log2, start=1) @@ -520,6 +530,7 @@ def test_math_log2(self): # test_math_log10 def test_math_log10(self): + self.unary_template_float16(math_log10, np.log10, start=1) self.unary_template_float32(math_log10, np.log10, start=1) self.unary_template_float64(math_log10, np.log10, start=1) self.unary_template_int64(math_log10, np.log10, start=1) @@ -556,6 +567,7 @@ def test_0_0(r, x, y): # test_math_sqrt def 
test_math_sqrt(self): + self.unary_template_float16(math_sqrt, np.sqrt) self.unary_template_float32(math_sqrt, np.sqrt) self.unary_template_float64(math_sqrt, np.sqrt) self.unary_template_int64(math_sqrt, np.sqrt) @@ -608,6 +620,7 @@ def test_math_pow_binop(self): # test_math_ceil def test_math_ceil(self): + self.unary_template_float16(math_ceil, np.ceil) self.unary_template_float32(math_ceil, np.ceil) self.unary_template_float64(math_ceil, np.ceil) self.unary_template_int64(math_ceil, np.ceil) @@ -617,6 +630,7 @@ def test_math_ceil(self): # test_math_floor def test_math_floor(self): + self.unary_template_float16(math_floor, np.floor) self.unary_template_float32(math_floor, np.floor) self.unary_template_float64(math_floor, np.floor) self.unary_template_int64(math_floor, np.floor) From 25a87f182c7b68c1aeecc3973dae9afccc6b51bc Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Wed, 16 Nov 2022 23:41:26 -0500 Subject: [PATCH 2/8] Refactor fp16 tests to skip when necessary --- numba/cuda/tests/cudapy/test_math.py | 32 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index 477f5d2464a..f9189576e5e 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -1,5 +1,9 @@ import numpy as np -from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim +from numba.cuda.testing import (skip_unless_cc_53, + skip_unless_cuda_python, + unittest, + CUDATestCase, + skip_on_cudasim) from numba.np import numpy_support from numba import cuda, float32, float64, int32, vectorize, void, int64 import math @@ -367,17 +371,30 @@ def test_math_atanh(self): # test_math_cos def test_math_cos(self): - self.unary_template_float16(math_cos, np.cos) self.unary_template_float32(math_cos, np.cos) self.unary_template_float64(math_cos, np.cos) self.unary_template_int64(math_cos, np.cos) self.unary_template_uint64(math_cos, np.cos) + 
@skip_unless_cc_53 + @skip_unless_cuda_python('NVIDIA Binding needed for NVRTC') + def test_math_fp16(self): + self.unary_template_float16(math_sin, np.sin) + self.unary_template_float16(math_cos, np.cos) + self.unary_template_float16(math_tan, np.tan) + self.unary_template_float16(math_exp, np.exp) + self.unary_template_float16(math_log, np.log, start=1) + self.unary_template_float16(math_log2, np.log2, start=1) + self.unary_template_float16(math_log10, np.log10, start=1) + self.unary_template_float16(math_fabs, np.fabs, start=-1) + self.unary_template_float16(math_sqrt, np.sqrt) + self.unary_template_float16(math_ceil, np.ceil) + self.unary_template_float16(math_floor, np.floor) + #--------------------------------------------------------------------------- # test_math_sin def test_math_sin(self): - self.unary_template_float16(math_sin, np.sin) self.unary_template_float32(math_sin, np.sin) self.unary_template_float64(math_sin, np.sin) self.unary_template_int64(math_sin, np.sin) @@ -387,7 +404,6 @@ def test_math_sin(self): # test_math_tan def test_math_tan(self): - self.unary_template_float16(math_tan, np.tan) self.unary_template_float32(math_tan, np.tan) self.unary_template_float64(math_tan, np.tan) self.unary_template_int64(math_tan, np.tan) @@ -457,7 +473,6 @@ def ufunc(x): # test_math_exp def test_math_exp(self): - self.unary_template_float16(math_exp, np.exp) self.unary_template_float32(math_exp, np.exp) self.unary_template_float64(math_exp, np.exp) self.unary_template_int64(math_exp, np.exp) @@ -476,7 +491,6 @@ def test_math_expm1(self): # test_math_fabs def test_math_fabs(self): - self.unary_template_float16(math_fabs, np.fabs, start=-1) self.unary_template_float32(math_fabs, np.fabs, start=-1) self.unary_template_float64(math_fabs, np.fabs, start=-1) self.unary_template_int64(math_fabs, np.fabs, start=-1) @@ -510,7 +524,6 @@ def ufunc(x): # test_math_log def test_math_log(self): - self.unary_template_float16(math_log, np.log, start=1) 
self.unary_template_float32(math_log, np.log, start=1) self.unary_template_float64(math_log, np.log, start=1) self.unary_template_int64(math_log, np.log, start=1) @@ -520,7 +533,6 @@ def test_math_log(self): # test_math_log2 def test_math_log2(self): - self.unary_template_float16(math_log2, np.log2, start=1) self.unary_template_float32(math_log2, np.log2, start=1) self.unary_template_float64(math_log2, np.log2, start=1) self.unary_template_int64(math_log2, np.log2, start=1) @@ -530,7 +542,6 @@ def test_math_log2(self): # test_math_log10 def test_math_log10(self): - self.unary_template_float16(math_log10, np.log10, start=1) self.unary_template_float32(math_log10, np.log10, start=1) self.unary_template_float64(math_log10, np.log10, start=1) self.unary_template_int64(math_log10, np.log10, start=1) @@ -567,7 +578,6 @@ def test_0_0(r, x, y): # test_math_sqrt def test_math_sqrt(self): - self.unary_template_float16(math_sqrt, np.sqrt) self.unary_template_float32(math_sqrt, np.sqrt) self.unary_template_float64(math_sqrt, np.sqrt) self.unary_template_int64(math_sqrt, np.sqrt) @@ -620,7 +630,6 @@ def test_math_pow_binop(self): # test_math_ceil def test_math_ceil(self): - self.unary_template_float16(math_ceil, np.ceil) self.unary_template_float32(math_ceil, np.ceil) self.unary_template_float64(math_ceil, np.ceil) self.unary_template_int64(math_ceil, np.ceil) @@ -630,7 +639,6 @@ def test_math_ceil(self): # test_math_floor def test_math_floor(self): - self.unary_template_float16(math_floor, np.floor) self.unary_template_float32(math_floor, np.floor) self.unary_template_float64(math_floor, np.floor) self.unary_template_int64(math_floor, np.floor) From f06a04aee6a235f4a8b108c3337ea0511bf327a8 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Thu, 17 Nov 2022 21:14:23 -0500 Subject: [PATCH 3/8] Add support for fp16 math.trunc --- numba/cuda/cudaimpl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 
aa8a5b77c04..13fb342d2b0 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -544,6 +544,14 @@ def fp16_fabs(x): return context.compile_internal(builder, fp16_fabs, sig, args, ) +@lower(math.trunc, types.float16) +def fp16_trunc_impl(context, builder, sig, args): + def fp16_trunc(x): + return cuda.fp16.htrunc(x) + + return context.compile_internal(builder, fp16_trunc, sig, args, ) + + _fp16_cmp = """{{ .reg .pred __$$f16_cmp_tmp; setp.{op}.f16 __$$f16_cmp_tmp, $1, $2; From f98a863c0c917a570ae9eb9efa8309cdbc85b4e2 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Thu, 17 Nov 2022 21:15:13 -0500 Subject: [PATCH 4/8] Add typing information for fp16 math functions --- numba/cuda/cudamath.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py index 3cfcb2bc2e2..6176706a3c8 100644 --- a/numba/cuda/cudamath.py +++ b/numba/cuda/cudamath.py @@ -13,29 +13,17 @@ @infer_global(math.asinh) @infer_global(math.atan) @infer_global(math.atanh) -@infer_global(math.ceil) -@infer_global(math.cos) @infer_global(math.cosh) @infer_global(math.degrees) @infer_global(math.erf) @infer_global(math.erfc) -@infer_global(math.exp) @infer_global(math.expm1) -@infer_global(math.fabs) -@infer_global(math.floor) @infer_global(math.gamma) @infer_global(math.lgamma) -@infer_global(math.log) -@infer_global(math.log2) -@infer_global(math.log10) @infer_global(math.log1p) @infer_global(math.radians) -@infer_global(math.sin) @infer_global(math.sinh) -@infer_global(math.sqrt) -@infer_global(math.tan) @infer_global(math.tanh) -@infer_global(math.trunc) class Math_unary(ConcreteTemplate): cases = [ signature(types.float64, types.int64), @@ -45,6 +33,28 @@ class Math_unary(ConcreteTemplate): ] +@infer_global(math.sin) +@infer_global(math.cos) +@infer_global(math.tan) +@infer_global(math.ceil) +@infer_global(math.floor) +@infer_global(math.sqrt) +@infer_global(math.log) 
+@infer_global(math.log2) +@infer_global(math.log10) +@infer_global(math.exp) +@infer_global(math.fabs) +@infer_global(math.trunc) +class Math_sin(ConcreteTemplate): + cases = [ + signature(types.float64, types.int64), + signature(types.float64, types.uint64), + signature(types.float32, types.float32), + signature(types.float64, types.float64), + signature(types.float16, types.float16), + ] + + @infer_global(math.atan2) class Math_atan2(ConcreteTemplate): key = math.atan2 From 06366d685ef7f3a83a439f4282ace908ff7ff873 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Thu, 17 Nov 2022 21:16:11 -0500 Subject: [PATCH 5/8] Add testcase for math.trunc and add support for greater tolerance with fp16 --- numba/cuda/tests/cudapy/test_math.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index f9189576e5e..d2e105e6aa0 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -199,6 +199,11 @@ def math_radians(A, B): B[i] = math.radians(A[i]) +def math_trunc(A, B): + i = cuda.grid(1) + B[i] = math.trunc(A[i]) + + def math_pow_binop(A, B, C): i = cuda.grid(1) C[i] = A[i] ** B[i] @@ -240,8 +245,10 @@ def unary_template(self, func, npfunc, npdtype, nprestype, start, stop): # the tightest under which the tests will pass. 
if npdtype == np.float64: rtol = 1e-13 - else: + elif npdtype == np.float32: rtol = 1e-6 + else: + rtol = 1e-3 np.testing.assert_allclose(npfunc(A), B, rtol=rtol) def unary_bool_special_values(self, func, npfunc, npdtype, npmtype): @@ -390,6 +397,7 @@ def test_math_fp16(self): self.unary_template_float16(math_sqrt, np.sqrt) self.unary_template_float16(math_ceil, np.ceil) self.unary_template_float16(math_floor, np.floor) + self.unary_template_float16(math_trunc, np.trunc) #--------------------------------------------------------------------------- # test_math_sin From 2f01c9a2c473c89212a2e6acf6bc7196de7f1846 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Mon, 21 Nov 2022 11:18:09 -0500 Subject: [PATCH 6/8] Move lowering to mathimpl and address formatting issues --- numba/cuda/cudaimpl.py | 98 +----------------------------------------- numba/cuda/cudamath.py | 2 +- numba/cuda/mathimpl.py | 97 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 98 deletions(-) diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 13fb342d2b0..1be33f5ecc1 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -453,103 +453,7 @@ def fp16_div_impl(context, builder, sig, args): def fp16_div(x, y): return cuda.fp16.hdiv(x, y) - return context.compile_internal(builder, fp16_div, sig, args, ) - - -@lower(math.sin, types.float16) -def fp16_sin_impl(context, builder, sig, args): - def fp16_sin(x): - return cuda.fp16.hsin(x) - - return context.compile_internal(builder, fp16_sin, sig, args, ) - - -@lower(math.cos, types.float16) -def fp16_cos_impl(context, builder, sig, args): - def fp16_cos(x): - return cuda.fp16.hcos(x) - - return context.compile_internal(builder, fp16_cos, sig, args, ) - - -@lower(math.tan, types.float16) -def fp16_tan_impl(context, builder, sig, args): - def fp16_tan(x): - return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x)) - - return context.compile_internal(builder, fp16_tan, sig, args, ) - - 
-@lower(math.log, types.float16) -def fp16_log_impl(context, builder, sig, args): - def fp16_log(x): - return cuda.fp16.hlog(x) - - return context.compile_internal(builder, fp16_log, sig, args, ) - - -@lower(math.log10, types.float16) -def fp16_log10_impl(context, builder, sig, args): - def fp16_log10(x): - return cuda.fp16.hlog10(x) - - return context.compile_internal(builder, fp16_log10, sig, args, ) - - -@lower(math.log2, types.float16) -def fp16_log2_impl(context, builder, sig, args): - def fp16_log2(x): - return cuda.fp16.hlog2(x) - - return context.compile_internal(builder, fp16_log2, sig, args, ) - - -@lower(math.exp, types.float16) -def fp16_exp_impl(context, builder, sig, args): - def fp16_exp(x): - return cuda.fp16.hexp(x) - - return context.compile_internal(builder, fp16_exp, sig, args, ) - - -@lower(math.floor, types.float16) -def fp16_floor_impl(context, builder, sig, args): - def fp16_floor(x): - return cuda.fp16.hfloor(x) - - return context.compile_internal(builder, fp16_floor, sig, args, ) - - -@lower(math.ceil, types.float16) -def fp16_ceil_impl(context, builder, sig, args): - def fp16_ceil(x): - return cuda.fp16.hceil(x) - - return context.compile_internal(builder, fp16_ceil, sig, args, ) - - -@lower(math.sqrt, types.float16) -def fp16_sqrt_impl(context, builder, sig, args): - def fp16_sqrt(x): - return cuda.fp16.hsqrt(x) - - return context.compile_internal(builder, fp16_sqrt, sig, args, ) - - -@lower(math.fabs, types.float16) -def fp16_fabs_impl(context, builder, sig, args): - def fp16_fabs(x): - return cuda.fp16.habs(x) - - return context.compile_internal(builder, fp16_fabs, sig, args, ) - - -@lower(math.trunc, types.float16) -def fp16_trunc_impl(context, builder, sig, args): - def fp16_trunc(x): - return cuda.fp16.htrunc(x) - - return context.compile_internal(builder, fp16_trunc, sig, args, ) + return context.compile_internal(builder, fp16_div, sig, args) _fp16_cmp = """{{ diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py index 
6176706a3c8..2ae56df3058 100644 --- a/numba/cuda/cudamath.py +++ b/numba/cuda/cudamath.py @@ -45,7 +45,7 @@ class Math_unary(ConcreteTemplate): @infer_global(math.exp) @infer_global(math.fabs) @infer_global(math.trunc) -class Math_sin(ConcreteTemplate): +class Math_unary_with_fp16(ConcreteTemplate): cases = [ signature(types.float64, types.int64), signature(types.float64, types.uint64), diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py index dea30a0deb8..677e1b51814 100644 --- a/numba/cuda/mathimpl.py +++ b/numba/cuda/mathimpl.py @@ -5,6 +5,7 @@ from numba.core.imputils import Registry from numba.types import float32, float64, int64, uint64 from numba.cuda import libdevice +from numba import cuda registry = Registry() lower = registry.lower @@ -88,6 +89,102 @@ def math_isfinite_int(context, builder, sig, args): return context.get_constant(types.boolean, 1) +@lower(math.sin, types.float16) +def fp16_sin_impl(context, builder, sig, args): + def fp16_sin(x): + return cuda.fp16.hsin(x) + + return context.compile_internal(builder, fp16_sin, sig, args) + + +@lower(math.cos, types.float16) +def fp16_cos_impl(context, builder, sig, args): + def fp16_cos(x): + return cuda.fp16.hcos(x) + + return context.compile_internal(builder, fp16_cos, sig, args) + + +@lower(math.tan, types.float16) +def fp16_tan_impl(context, builder, sig, args): + def fp16_tan(x): + return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x)) + + return context.compile_internal(builder, fp16_tan, sig, args) + + +@lower(math.log, types.float16) +def fp16_log_impl(context, builder, sig, args): + def fp16_log(x): + return cuda.fp16.hlog(x) + + return context.compile_internal(builder, fp16_log, sig, args) + + +@lower(math.log10, types.float16) +def fp16_log10_impl(context, builder, sig, args): + def fp16_log10(x): + return cuda.fp16.hlog10(x) + + return context.compile_internal(builder, fp16_log10, sig, args) + + +@lower(math.log2, types.float16) +def fp16_log2_impl(context, builder, sig, args): 
+ def fp16_log2(x): + return cuda.fp16.hlog2(x) + + return context.compile_internal(builder, fp16_log2, sig, args) + + +@lower(math.exp, types.float16) +def fp16_exp_impl(context, builder, sig, args): + def fp16_exp(x): + return cuda.fp16.hexp(x) + + return context.compile_internal(builder, fp16_exp, sig, args) + + +@lower(math.floor, types.float16) +def fp16_floor_impl(context, builder, sig, args): + def fp16_floor(x): + return cuda.fp16.hfloor(x) + + return context.compile_internal(builder, fp16_floor, sig, args) + + +@lower(math.ceil, types.float16) +def fp16_ceil_impl(context, builder, sig, args): + def fp16_ceil(x): + return cuda.fp16.hceil(x) + + return context.compile_internal(builder, fp16_ceil, sig, args) + + +@lower(math.sqrt, types.float16) +def fp16_sqrt_impl(context, builder, sig, args): + def fp16_sqrt(x): + return cuda.fp16.hsqrt(x) + + return context.compile_internal(builder, fp16_sqrt, sig, args) + + +@lower(math.fabs, types.float16) +def fp16_fabs_impl(context, builder, sig, args): + def fp16_fabs(x): + return cuda.fp16.habs(x) + + return context.compile_internal(builder, fp16_fabs, sig, args) + + +@lower(math.trunc, types.float16) +def fp16_trunc_impl(context, builder, sig, args): + def fp16_trunc(x): + return cuda.fp16.htrunc(x) + + return context.compile_internal(builder, fp16_trunc, sig, args) + + def impl_boolean(key, ty, libfunc): def lower_boolean_impl(context, builder, sig, args): libfunc_impl = context.get_function(libfunc, From 25de1a32de11178acc114d117adb3d3b66c9ac0a Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Mon, 21 Nov 2022 13:04:44 -0500 Subject: [PATCH 7/8] Add implementations for math.trunc for float32/float64 for cuda target --- numba/cuda/mathimpl.py | 1 + numba/cuda/tests/cudapy/test_math.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py index 677e1b51814..eec7cfec1c4 100644 --- a/numba/cuda/mathimpl.py +++ b/numba/cuda/mathimpl.py @@ -43,6 +43,7 @@ unarys 
+= [('atanh', 'atanhf', math.atanh)] unarys += [('tan', 'tanf', math.tan)] unarys += [('tanh', 'tanhf', math.tanh)] +unarys += [('trunc', 'truncf', math.trunc)] unarys_fastmath = {} unarys_fastmath['cosf'] = 'fast_cosf' diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index d2e105e6aa0..00de78066bb 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -652,6 +652,15 @@ def test_math_floor(self): self.unary_template_int64(math_floor, np.floor) self.unary_template_uint64(math_floor, np.floor) + #--------------------------------------------------------------------------- + # test_math_trunc + + def test_math_trunc(self): + self.unary_template_float32(math_trunc, np.trunc) + self.unary_template_float64(math_trunc, np.trunc) + self.unary_template_int64(math_trunc, np.trunc) + self.unary_template_uint64(math_trunc, np.trunc) + #--------------------------------------------------------------------------- # test_math_copysign From b50026822604ec85bd0c54f14c2e297f60a60ce1 Mon Sep 17 00:00:00 2001 From: Michael Collison Date: Tue, 22 Nov 2022 09:41:22 -0500 Subject: [PATCH 8/8] Remove fp16 tan implementation --- numba/cuda/cudamath.py | 2 +- numba/cuda/mathimpl.py | 8 -------- numba/cuda/tests/cudapy/test_math.py | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py index 2ae56df3058..12d9715b62e 100644 --- a/numba/cuda/cudamath.py +++ b/numba/cuda/cudamath.py @@ -24,6 +24,7 @@ @infer_global(math.radians) @infer_global(math.sinh) @infer_global(math.tanh) +@infer_global(math.tan) class Math_unary(ConcreteTemplate): cases = [ signature(types.float64, types.int64), @@ -35,7 +36,6 @@ class Math_unary(ConcreteTemplate): @infer_global(math.sin) @infer_global(math.cos) -@infer_global(math.tan) @infer_global(math.ceil) @infer_global(math.floor) @infer_global(math.sqrt) diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py index 
eec7cfec1c4..9dcd6dbefa0 100644 --- a/numba/cuda/mathimpl.py +++ b/numba/cuda/mathimpl.py @@ -106,14 +106,6 @@ def fp16_cos(x): return context.compile_internal(builder, fp16_cos, sig, args) -@lower(math.tan, types.float16) -def fp16_tan_impl(context, builder, sig, args): - def fp16_tan(x): - return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x)) - - return context.compile_internal(builder, fp16_tan, sig, args) - - @lower(math.log, types.float16) def fp16_log_impl(context, builder, sig, args): def fp16_log(x): diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index 00de78066bb..2ee39ab8e2c 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -388,7 +388,6 @@ def test_math_cos(self): def test_math_fp16(self): self.unary_template_float16(math_sin, np.sin) self.unary_template_float16(math_cos, np.cos) - self.unary_template_float16(math_tan, np.tan) self.unary_template_float16(math_exp, np.exp) self.unary_template_float16(math_log, np.log, start=1) self.unary_template_float16(math_log2, np.log2, start=1)