From 3130c30a1242edad8ff3dfaf5b52612d20f2fd21 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Wed, 16 Nov 2022 22:50:25 -0500
Subject: [PATCH 1/8] Add lowering of initial set of standard math
 functions that map to fp16 intrinsics

---
 numba/cuda/cudaimpl.py               | 88 ++++++++++++++++++++++++++++
 numba/cuda/tests/cudapy/test_math.py | 14 +++++
 2 files changed, 102 insertions(+)

diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py
index 382fd9fe84e..aa8a5b77c04 100644
--- a/numba/cuda/cudaimpl.py
+++ b/numba/cuda/cudaimpl.py
@@ -456,6 +456,94 @@ def fp16_div(x, y):
     return context.compile_internal(builder, fp16_div, sig, args, )
 
 
+@lower(math.sin, types.float16)
+def fp16_sin_impl(context, builder, sig, args):
+    def fp16_sin(x):
+        return cuda.fp16.hsin(x)
+
+    return context.compile_internal(builder, fp16_sin, sig, args, )
+
+
+@lower(math.cos, types.float16)
+def fp16_cos_impl(context, builder, sig, args):
+    def fp16_cos(x):
+        return cuda.fp16.hcos(x)
+
+    return context.compile_internal(builder, fp16_cos, sig, args, )
+
+
+@lower(math.tan, types.float16)
+def fp16_tan_impl(context, builder, sig, args):
+    def fp16_tan(x):
+        return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x))
+
+    return context.compile_internal(builder, fp16_tan, sig, args, )
+
+
+@lower(math.log, types.float16)
+def fp16_log_impl(context, builder, sig, args):
+    def fp16_log(x):
+        return cuda.fp16.hlog(x)
+
+    return context.compile_internal(builder, fp16_log, sig, args, )
+
+
+@lower(math.log10, types.float16)
+def fp16_log10_impl(context, builder, sig, args):
+    def fp16_log10(x):
+        return cuda.fp16.hlog10(x)
+
+    return context.compile_internal(builder, fp16_log10, sig, args, )
+
+
+@lower(math.log2, types.float16)
+def fp16_log2_impl(context, builder, sig, args):
+    def fp16_log2(x):
+        return cuda.fp16.hlog2(x)
+
+    return context.compile_internal(builder, fp16_log2, sig, args, )
+
+
+@lower(math.exp, types.float16)
+def fp16_exp_impl(context, builder, sig, args):
+    def fp16_exp(x):
+        return cuda.fp16.hexp(x)
+
+    return context.compile_internal(builder, fp16_exp, sig, args, )
+
+
+@lower(math.floor, types.float16)
+def fp16_floor_impl(context, builder, sig, args):
+    def fp16_floor(x):
+        return cuda.fp16.hfloor(x)
+
+    return context.compile_internal(builder, fp16_floor, sig, args, )
+
+
+@lower(math.ceil, types.float16)
+def fp16_ceil_impl(context, builder, sig, args):
+    def fp16_ceil(x):
+        return cuda.fp16.hceil(x)
+
+    return context.compile_internal(builder, fp16_ceil, sig, args, )
+
+
+@lower(math.sqrt, types.float16)
+def fp16_sqrt_impl(context, builder, sig, args):
+    def fp16_sqrt(x):
+        return cuda.fp16.hsqrt(x)
+
+    return context.compile_internal(builder, fp16_sqrt, sig, args, )
+
+
+@lower(math.fabs, types.float16)
+def fp16_fabs_impl(context, builder, sig, args):
+    def fp16_fabs(x):
+        return cuda.fp16.habs(x)
+
+    return context.compile_internal(builder, fp16_fabs, sig, args, )
+
+
 _fp16_cmp = """{{
           .reg .pred __$$f16_cmp_tmp;
           setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py
index 043d9d44034..477f5d2464a 100644
--- a/numba/cuda/tests/cudapy/test_math.py
+++ b/numba/cuda/tests/cudapy/test_math.py
@@ -206,6 +206,9 @@ def math_mod_binop(A, B, C):
 
 
 class TestCudaMath(CUDATestCase):
+    def unary_template_float16(self, func, npfunc, start=0, stop=1):
+        self.unary_template(func, npfunc, np.float16, np.float16, start, stop)
+
     def unary_template_float32(self, func, npfunc, start=0, stop=1):
         self.unary_template(func, npfunc, np.float32, np.float32, start, stop)
 
@@ -364,6 +367,7 @@ def test_math_atanh(self):
     # test_math_cos
 
     def test_math_cos(self):
+        self.unary_template_float16(math_cos, np.cos)
         self.unary_template_float32(math_cos, np.cos)
         self.unary_template_float64(math_cos, np.cos)
         self.unary_template_int64(math_cos, np.cos)
@@ -373,6 +377,7 @@ def test_math_cos(self):
     # test_math_sin
 
     def test_math_sin(self):
+        self.unary_template_float16(math_sin, np.sin)
         self.unary_template_float32(math_sin, np.sin)
         self.unary_template_float64(math_sin, np.sin)
         self.unary_template_int64(math_sin, np.sin)
@@ -382,6 +387,7 @@ def test_math_sin(self):
     # test_math_tan
 
     def test_math_tan(self):
+        self.unary_template_float16(math_tan, np.tan)
         self.unary_template_float32(math_tan, np.tan)
         self.unary_template_float64(math_tan, np.tan)
         self.unary_template_int64(math_tan, np.tan)
@@ -451,6 +457,7 @@ def ufunc(x):
     # test_math_exp
 
     def test_math_exp(self):
+        self.unary_template_float16(math_exp, np.exp)
         self.unary_template_float32(math_exp, np.exp)
         self.unary_template_float64(math_exp, np.exp)
         self.unary_template_int64(math_exp, np.exp)
@@ -469,6 +476,7 @@ def test_math_expm1(self):
     # test_math_fabs
 
     def test_math_fabs(self):
+        self.unary_template_float16(math_fabs, np.fabs, start=-1)
         self.unary_template_float32(math_fabs, np.fabs, start=-1)
         self.unary_template_float64(math_fabs, np.fabs, start=-1)
         self.unary_template_int64(math_fabs, np.fabs, start=-1)
@@ -502,6 +510,7 @@ def ufunc(x):
     # test_math_log
 
     def test_math_log(self):
+        self.unary_template_float16(math_log, np.log, start=1)
         self.unary_template_float32(math_log, np.log, start=1)
         self.unary_template_float64(math_log, np.log, start=1)
         self.unary_template_int64(math_log, np.log, start=1)
@@ -511,6 +520,7 @@ def test_math_log(self):
     # test_math_log2
 
     def test_math_log2(self):
+        self.unary_template_float16(math_log2, np.log2, start=1)
         self.unary_template_float32(math_log2, np.log2, start=1)
         self.unary_template_float64(math_log2, np.log2, start=1)
         self.unary_template_int64(math_log2, np.log2, start=1)
@@ -520,6 +530,7 @@ def test_math_log2(self):
     # test_math_log10
 
     def test_math_log10(self):
+        self.unary_template_float16(math_log10, np.log10, start=1)
         self.unary_template_float32(math_log10, np.log10, start=1)
         self.unary_template_float64(math_log10, np.log10, start=1)
         self.unary_template_int64(math_log10, np.log10, start=1)
@@ -556,6 +567,7 @@ def test_0_0(r, x, y):
     # test_math_sqrt
 
     def test_math_sqrt(self):
+        self.unary_template_float16(math_sqrt, np.sqrt)
         self.unary_template_float32(math_sqrt, np.sqrt)
         self.unary_template_float64(math_sqrt, np.sqrt)
         self.unary_template_int64(math_sqrt, np.sqrt)
@@ -608,6 +620,7 @@ def test_math_pow_binop(self):
     # test_math_ceil
 
     def test_math_ceil(self):
+        self.unary_template_float16(math_ceil, np.ceil)
         self.unary_template_float32(math_ceil, np.ceil)
         self.unary_template_float64(math_ceil, np.ceil)
         self.unary_template_int64(math_ceil, np.ceil)
@@ -617,6 +630,7 @@ def test_math_ceil(self):
     # test_math_floor
 
     def test_math_floor(self):
+        self.unary_template_float16(math_floor, np.floor)
         self.unary_template_float32(math_floor, np.floor)
         self.unary_template_float64(math_floor, np.floor)
         self.unary_template_int64(math_floor, np.floor)

From 25a87f182c7b68c1aeecc3973dae9afccc6b51bc Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Wed, 16 Nov 2022 23:41:26 -0500
Subject: [PATCH 2/8] Refactor fp16 tests to skip when necessary

---
 numba/cuda/tests/cudapy/test_math.py | 32 +++++++++++++++++-----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py
index 477f5d2464a..f9189576e5e 100644
--- a/numba/cuda/tests/cudapy/test_math.py
+++ b/numba/cuda/tests/cudapy/test_math.py
@@ -1,5 +1,9 @@
 import numpy as np
-from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+from numba.cuda.testing import (skip_unless_cc_53,
+                                skip_unless_cuda_python,
+                                unittest,
+                                CUDATestCase,
+                                skip_on_cudasim)
 from numba.np import numpy_support
 from numba import cuda, float32, float64, int32, vectorize, void, int64
 import math
@@ -367,17 +371,30 @@ def test_math_atanh(self):
     # test_math_cos
 
     def test_math_cos(self):
-        self.unary_template_float16(math_cos, np.cos)
         self.unary_template_float32(math_cos, np.cos)
         self.unary_template_float64(math_cos, np.cos)
         self.unary_template_int64(math_cos, np.cos)
         self.unary_template_uint64(math_cos, np.cos)
 
+    @skip_unless_cc_53
+    @skip_unless_cuda_python('NVIDIA Binding needed for NVRTC')
+    def test_math_fp16(self):
+        self.unary_template_float16(math_sin, np.sin)
+        self.unary_template_float16(math_cos, np.cos)
+        self.unary_template_float16(math_tan, np.tan)
+        self.unary_template_float16(math_exp, np.exp)
+        self.unary_template_float16(math_log, np.log, start=1)
+        self.unary_template_float16(math_log2, np.log2, start=1)
+        self.unary_template_float16(math_log10, np.log10, start=1)
+        self.unary_template_float16(math_fabs, np.fabs, start=-1)
+        self.unary_template_float16(math_sqrt, np.sqrt)
+        self.unary_template_float16(math_ceil, np.ceil)
+        self.unary_template_float16(math_floor, np.floor)
+
     #---------------------------------------------------------------------------
     # test_math_sin
 
     def test_math_sin(self):
-        self.unary_template_float16(math_sin, np.sin)
         self.unary_template_float32(math_sin, np.sin)
         self.unary_template_float64(math_sin, np.sin)
         self.unary_template_int64(math_sin, np.sin)
@@ -387,7 +404,6 @@ def test_math_sin(self):
     # test_math_tan
 
     def test_math_tan(self):
-        self.unary_template_float16(math_tan, np.tan)
         self.unary_template_float32(math_tan, np.tan)
         self.unary_template_float64(math_tan, np.tan)
         self.unary_template_int64(math_tan, np.tan)
@@ -457,7 +473,6 @@ def ufunc(x):
     # test_math_exp
 
     def test_math_exp(self):
-        self.unary_template_float16(math_exp, np.exp)
         self.unary_template_float32(math_exp, np.exp)
         self.unary_template_float64(math_exp, np.exp)
         self.unary_template_int64(math_exp, np.exp)
@@ -476,7 +491,6 @@ def test_math_expm1(self):
     # test_math_fabs
 
     def test_math_fabs(self):
-        self.unary_template_float16(math_fabs, np.fabs, start=-1)
         self.unary_template_float32(math_fabs, np.fabs, start=-1)
         self.unary_template_float64(math_fabs, np.fabs, start=-1)
         self.unary_template_int64(math_fabs, np.fabs, start=-1)
@@ -510,7 +524,6 @@ def ufunc(x):
     # test_math_log
 
     def test_math_log(self):
-        self.unary_template_float16(math_log, np.log, start=1)
         self.unary_template_float32(math_log, np.log, start=1)
         self.unary_template_float64(math_log, np.log, start=1)
         self.unary_template_int64(math_log, np.log, start=1)
@@ -520,7 +533,6 @@ def test_math_log(self):
     # test_math_log2
 
     def test_math_log2(self):
-        self.unary_template_float16(math_log2, np.log2, start=1)
         self.unary_template_float32(math_log2, np.log2, start=1)
         self.unary_template_float64(math_log2, np.log2, start=1)
         self.unary_template_int64(math_log2, np.log2, start=1)
@@ -530,7 +542,6 @@ def test_math_log2(self):
     # test_math_log10
 
     def test_math_log10(self):
-        self.unary_template_float16(math_log10, np.log10, start=1)
         self.unary_template_float32(math_log10, np.log10, start=1)
         self.unary_template_float64(math_log10, np.log10, start=1)
         self.unary_template_int64(math_log10, np.log10, start=1)
@@ -567,7 +578,6 @@ def test_0_0(r, x, y):
     # test_math_sqrt
 
     def test_math_sqrt(self):
-        self.unary_template_float16(math_sqrt, np.sqrt)
         self.unary_template_float32(math_sqrt, np.sqrt)
         self.unary_template_float64(math_sqrt, np.sqrt)
         self.unary_template_int64(math_sqrt, np.sqrt)
@@ -620,7 +630,6 @@ def test_math_pow_binop(self):
     # test_math_ceil
 
     def test_math_ceil(self):
-        self.unary_template_float16(math_ceil, np.ceil)
         self.unary_template_float32(math_ceil, np.ceil)
         self.unary_template_float64(math_ceil, np.ceil)
         self.unary_template_int64(math_ceil, np.ceil)
@@ -630,7 +639,6 @@ def test_math_ceil(self):
     # test_math_floor
 
     def test_math_floor(self):
-        self.unary_template_float16(math_floor, np.floor)
         self.unary_template_float32(math_floor, np.floor)
         self.unary_template_float64(math_floor, np.floor)
         self.unary_template_int64(math_floor, np.floor)

From f06a04aee6a235f4a8b108c3337ea0511bf327a8 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Thu, 17 Nov 2022 21:14:23 -0500
Subject: [PATCH 3/8] Add support for fp16 math.trunc

---
 numba/cuda/cudaimpl.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py
index aa8a5b77c04..13fb342d2b0 100644
--- a/numba/cuda/cudaimpl.py
+++ b/numba/cuda/cudaimpl.py
@@ -544,6 +544,14 @@ def fp16_fabs(x):
     return context.compile_internal(builder, fp16_fabs, sig, args, )
 
 
+@lower(math.trunc, types.float16)
+def fp16_trunc_impl(context, builder, sig, args):
+    def fp16_trunc(x):
+        return cuda.fp16.htrunc(x)
+
+    return context.compile_internal(builder, fp16_trunc, sig, args, )
+
+
 _fp16_cmp = """{{
           .reg .pred __$$f16_cmp_tmp;
           setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;

From f98a863c0c917a570ae9eb9efa8309cdbc85b4e2 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Thu, 17 Nov 2022 21:15:13 -0500
Subject: [PATCH 4/8] Add typing information for fp16 math functions

---
 numba/cuda/cudamath.py | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py
index 3cfcb2bc2e2..6176706a3c8 100644
--- a/numba/cuda/cudamath.py
+++ b/numba/cuda/cudamath.py
@@ -13,29 +13,17 @@
 @infer_global(math.asinh)
 @infer_global(math.atan)
 @infer_global(math.atanh)
-@infer_global(math.ceil)
-@infer_global(math.cos)
 @infer_global(math.cosh)
 @infer_global(math.degrees)
 @infer_global(math.erf)
 @infer_global(math.erfc)
-@infer_global(math.exp)
 @infer_global(math.expm1)
-@infer_global(math.fabs)
-@infer_global(math.floor)
 @infer_global(math.gamma)
 @infer_global(math.lgamma)
-@infer_global(math.log)
-@infer_global(math.log2)
-@infer_global(math.log10)
 @infer_global(math.log1p)
 @infer_global(math.radians)
-@infer_global(math.sin)
 @infer_global(math.sinh)
-@infer_global(math.sqrt)
-@infer_global(math.tan)
 @infer_global(math.tanh)
-@infer_global(math.trunc)
 class Math_unary(ConcreteTemplate):
     cases = [
         signature(types.float64, types.int64),
@@ -45,6 +33,28 @@ class Math_unary(ConcreteTemplate):
     ]
 
 
+@infer_global(math.sin)
+@infer_global(math.cos)
+@infer_global(math.tan)
+@infer_global(math.ceil)
+@infer_global(math.floor)
+@infer_global(math.sqrt)
+@infer_global(math.log)
+@infer_global(math.log2)
+@infer_global(math.log10)
+@infer_global(math.exp)
+@infer_global(math.fabs)
+@infer_global(math.trunc)
+class Math_sin(ConcreteTemplate):
+    cases = [
+        signature(types.float64, types.int64),
+        signature(types.float64, types.uint64),
+        signature(types.float32, types.float32),
+        signature(types.float64, types.float64),
+        signature(types.float16, types.float16),
+    ]
+
+
 @infer_global(math.atan2)
 class Math_atan2(ConcreteTemplate):
     key = math.atan2

From 06366d685ef7f3a83a439f4282ace908ff7ff873 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Thu, 17 Nov 2022 21:16:11 -0500
Subject: [PATCH 5/8] Add testcase for math.trunc and add support for greater
 tolerance with fp16

---
 numba/cuda/tests/cudapy/test_math.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py
index f9189576e5e..d2e105e6aa0 100644
--- a/numba/cuda/tests/cudapy/test_math.py
+++ b/numba/cuda/tests/cudapy/test_math.py
@@ -199,6 +199,11 @@ def math_radians(A, B):
     B[i] = math.radians(A[i])
 
 
+def math_trunc(A, B):
+    i = cuda.grid(1)
+    B[i] = math.trunc(A[i])
+
+
 def math_pow_binop(A, B, C):
     i = cuda.grid(1)
     C[i] = A[i] ** B[i]
@@ -240,8 +245,10 @@ def unary_template(self, func, npfunc, npdtype, nprestype, start, stop):
         # the tightest under which the tests will pass.
         if npdtype == np.float64:
             rtol = 1e-13
-        else:
+        elif npdtype == np.float32:
             rtol = 1e-6
+        else:
+            rtol = 1e-3
         np.testing.assert_allclose(npfunc(A), B, rtol=rtol)
 
     def unary_bool_special_values(self, func, npfunc, npdtype, npmtype):
@@ -390,6 +397,7 @@ def test_math_fp16(self):
         self.unary_template_float16(math_sqrt, np.sqrt)
         self.unary_template_float16(math_ceil, np.ceil)
         self.unary_template_float16(math_floor, np.floor)
+        self.unary_template_float16(math_trunc, np.trunc)
 
     #---------------------------------------------------------------------------
     # test_math_sin

From 2f01c9a2c473c89212a2e6acf6bc7196de7f1846 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Mon, 21 Nov 2022 11:18:09 -0500
Subject: [PATCH 6/8] Move lowering to mathimpl and address formatting issues

---
 numba/cuda/cudaimpl.py | 98 +-----------------------------------------
 numba/cuda/cudamath.py |  2 +-
 numba/cuda/mathimpl.py | 97 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 98 deletions(-)

diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py
index 13fb342d2b0..1be33f5ecc1 100644
--- a/numba/cuda/cudaimpl.py
+++ b/numba/cuda/cudaimpl.py
@@ -453,103 +453,7 @@ def fp16_div_impl(context, builder, sig, args):
     def fp16_div(x, y):
         return cuda.fp16.hdiv(x, y)
 
-    return context.compile_internal(builder, fp16_div, sig, args, )
-
-
-@lower(math.sin, types.float16)
-def fp16_sin_impl(context, builder, sig, args):
-    def fp16_sin(x):
-        return cuda.fp16.hsin(x)
-
-    return context.compile_internal(builder, fp16_sin, sig, args, )
-
-
-@lower(math.cos, types.float16)
-def fp16_cos_impl(context, builder, sig, args):
-    def fp16_cos(x):
-        return cuda.fp16.hcos(x)
-
-    return context.compile_internal(builder, fp16_cos, sig, args, )
-
-
-@lower(math.tan, types.float16)
-def fp16_tan_impl(context, builder, sig, args):
-    def fp16_tan(x):
-        return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x))
-
-    return context.compile_internal(builder, fp16_tan, sig, args, )
-
-
-@lower(math.log, types.float16)
-def fp16_log_impl(context, builder, sig, args):
-    def fp16_log(x):
-        return cuda.fp16.hlog(x)
-
-    return context.compile_internal(builder, fp16_log, sig, args, )
-
-
-@lower(math.log10, types.float16)
-def fp16_log10_impl(context, builder, sig, args):
-    def fp16_log10(x):
-        return cuda.fp16.hlog10(x)
-
-    return context.compile_internal(builder, fp16_log10, sig, args, )
-
-
-@lower(math.log2, types.float16)
-def fp16_log2_impl(context, builder, sig, args):
-    def fp16_log2(x):
-        return cuda.fp16.hlog2(x)
-
-    return context.compile_internal(builder, fp16_log2, sig, args, )
-
-
-@lower(math.exp, types.float16)
-def fp16_exp_impl(context, builder, sig, args):
-    def fp16_exp(x):
-        return cuda.fp16.hexp(x)
-
-    return context.compile_internal(builder, fp16_exp, sig, args, )
-
-
-@lower(math.floor, types.float16)
-def fp16_floor_impl(context, builder, sig, args):
-    def fp16_floor(x):
-        return cuda.fp16.hfloor(x)
-
-    return context.compile_internal(builder, fp16_floor, sig, args, )
-
-
-@lower(math.ceil, types.float16)
-def fp16_ceil_impl(context, builder, sig, args):
-    def fp16_ceil(x):
-        return cuda.fp16.hceil(x)
-
-    return context.compile_internal(builder, fp16_ceil, sig, args, )
-
-
-@lower(math.sqrt, types.float16)
-def fp16_sqrt_impl(context, builder, sig, args):
-    def fp16_sqrt(x):
-        return cuda.fp16.hsqrt(x)
-
-    return context.compile_internal(builder, fp16_sqrt, sig, args, )
-
-
-@lower(math.fabs, types.float16)
-def fp16_fabs_impl(context, builder, sig, args):
-    def fp16_fabs(x):
-        return cuda.fp16.habs(x)
-
-    return context.compile_internal(builder, fp16_fabs, sig, args, )
-
-
-@lower(math.trunc, types.float16)
-def fp16_trunc_impl(context, builder, sig, args):
-    def fp16_trunc(x):
-        return cuda.fp16.htrunc(x)
-
-    return context.compile_internal(builder, fp16_trunc, sig, args, )
+    return context.compile_internal(builder, fp16_div, sig, args)
 
 
 _fp16_cmp = """{{
diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py
index 6176706a3c8..2ae56df3058 100644
--- a/numba/cuda/cudamath.py
+++ b/numba/cuda/cudamath.py
@@ -45,7 +45,7 @@ class Math_unary(ConcreteTemplate):
 @infer_global(math.exp)
 @infer_global(math.fabs)
 @infer_global(math.trunc)
-class Math_sin(ConcreteTemplate):
+class Math_unary_with_fp16(ConcreteTemplate):
     cases = [
         signature(types.float64, types.int64),
         signature(types.float64, types.uint64),
diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py
index dea30a0deb8..677e1b51814 100644
--- a/numba/cuda/mathimpl.py
+++ b/numba/cuda/mathimpl.py
@@ -5,6 +5,7 @@
 from numba.core.imputils import Registry
 from numba.types import float32, float64, int64, uint64
 from numba.cuda import libdevice
+from numba import cuda
 
 registry = Registry()
 lower = registry.lower
@@ -88,6 +89,102 @@ def math_isfinite_int(context, builder, sig, args):
     return context.get_constant(types.boolean, 1)
 
 
+@lower(math.sin, types.float16)
+def fp16_sin_impl(context, builder, sig, args):
+    def fp16_sin(x):
+        return cuda.fp16.hsin(x)
+
+    return context.compile_internal(builder, fp16_sin, sig, args)
+
+
+@lower(math.cos, types.float16)
+def fp16_cos_impl(context, builder, sig, args):
+    def fp16_cos(x):
+        return cuda.fp16.hcos(x)
+
+    return context.compile_internal(builder, fp16_cos, sig, args)
+
+
+@lower(math.tan, types.float16)
+def fp16_tan_impl(context, builder, sig, args):
+    def fp16_tan(x):
+        return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x))
+
+    return context.compile_internal(builder, fp16_tan, sig, args)
+
+
+@lower(math.log, types.float16)
+def fp16_log_impl(context, builder, sig, args):
+    def fp16_log(x):
+        return cuda.fp16.hlog(x)
+
+    return context.compile_internal(builder, fp16_log, sig, args)
+
+
+@lower(math.log10, types.float16)
+def fp16_log10_impl(context, builder, sig, args):
+    def fp16_log10(x):
+        return cuda.fp16.hlog10(x)
+
+    return context.compile_internal(builder, fp16_log10, sig, args)
+
+
+@lower(math.log2, types.float16)
+def fp16_log2_impl(context, builder, sig, args):
+    def fp16_log2(x):
+        return cuda.fp16.hlog2(x)
+
+    return context.compile_internal(builder, fp16_log2, sig, args)
+
+
+@lower(math.exp, types.float16)
+def fp16_exp_impl(context, builder, sig, args):
+    def fp16_exp(x):
+        return cuda.fp16.hexp(x)
+
+    return context.compile_internal(builder, fp16_exp, sig, args)
+
+
+@lower(math.floor, types.float16)
+def fp16_floor_impl(context, builder, sig, args):
+    def fp16_floor(x):
+        return cuda.fp16.hfloor(x)
+
+    return context.compile_internal(builder, fp16_floor, sig, args)
+
+
+@lower(math.ceil, types.float16)
+def fp16_ceil_impl(context, builder, sig, args):
+    def fp16_ceil(x):
+        return cuda.fp16.hceil(x)
+
+    return context.compile_internal(builder, fp16_ceil, sig, args)
+
+
+@lower(math.sqrt, types.float16)
+def fp16_sqrt_impl(context, builder, sig, args):
+    def fp16_sqrt(x):
+        return cuda.fp16.hsqrt(x)
+
+    return context.compile_internal(builder, fp16_sqrt, sig, args)
+
+
+@lower(math.fabs, types.float16)
+def fp16_fabs_impl(context, builder, sig, args):
+    def fp16_fabs(x):
+        return cuda.fp16.habs(x)
+
+    return context.compile_internal(builder, fp16_fabs, sig, args)
+
+
+@lower(math.trunc, types.float16)
+def fp16_trunc_impl(context, builder, sig, args):
+    def fp16_trunc(x):
+        return cuda.fp16.htrunc(x)
+
+    return context.compile_internal(builder, fp16_trunc, sig, args)
+
+
 def impl_boolean(key, ty, libfunc):
     def lower_boolean_impl(context, builder, sig, args):
         libfunc_impl = context.get_function(libfunc,

From 25de1a32de11178acc114d117adb3d3b66c9ac0a Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Mon, 21 Nov 2022 13:04:44 -0500
Subject: [PATCH 7/8] Add implementations for math.trunc for float32/float64
 for cuda target

---
 numba/cuda/mathimpl.py               | 1 +
 numba/cuda/tests/cudapy/test_math.py | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py
index 677e1b51814..eec7cfec1c4 100644
--- a/numba/cuda/mathimpl.py
+++ b/numba/cuda/mathimpl.py
@@ -43,6 +43,7 @@
 unarys += [('atanh', 'atanhf', math.atanh)]
 unarys += [('tan', 'tanf', math.tan)]
 unarys += [('tanh', 'tanhf', math.tanh)]
+unarys += [('trunc', 'truncf', math.trunc)]
 
 unarys_fastmath = {}
 unarys_fastmath['cosf'] = 'fast_cosf'
diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py
index d2e105e6aa0..00de78066bb 100644
--- a/numba/cuda/tests/cudapy/test_math.py
+++ b/numba/cuda/tests/cudapy/test_math.py
@@ -652,6 +652,15 @@ def test_math_floor(self):
         self.unary_template_int64(math_floor, np.floor)
         self.unary_template_uint64(math_floor, np.floor)
 
+    #---------------------------------------------------------------------------
+    # test_math_trunc
+
+    def test_math_trunc(self):
+        self.unary_template_float32(math_trunc, np.trunc)
+        self.unary_template_float64(math_trunc, np.trunc)
+        self.unary_template_int64(math_trunc, np.trunc)
+        self.unary_template_uint64(math_trunc, np.trunc)
+
     #---------------------------------------------------------------------------
     # test_math_copysign
 

From b50026822604ec85bd0c54f14c2e297f60a60ce1 Mon Sep 17 00:00:00 2001
From: Michael Collison <collison956@gmail.com>
Date: Tue, 22 Nov 2022 09:41:22 -0500
Subject: [PATCH 8/8] Remove fp16 tan implementation

---
 numba/cuda/cudamath.py               | 2 +-
 numba/cuda/mathimpl.py               | 8 --------
 numba/cuda/tests/cudapy/test_math.py | 1 -
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/numba/cuda/cudamath.py b/numba/cuda/cudamath.py
index 2ae56df3058..12d9715b62e 100644
--- a/numba/cuda/cudamath.py
+++ b/numba/cuda/cudamath.py
@@ -24,6 +24,7 @@
 @infer_global(math.radians)
 @infer_global(math.sinh)
 @infer_global(math.tanh)
+@infer_global(math.tan)
 class Math_unary(ConcreteTemplate):
     cases = [
         signature(types.float64, types.int64),
@@ -35,7 +36,6 @@ class Math_unary(ConcreteTemplate):
 
 @infer_global(math.sin)
 @infer_global(math.cos)
-@infer_global(math.tan)
 @infer_global(math.ceil)
 @infer_global(math.floor)
 @infer_global(math.sqrt)
diff --git a/numba/cuda/mathimpl.py b/numba/cuda/mathimpl.py
index eec7cfec1c4..9dcd6dbefa0 100644
--- a/numba/cuda/mathimpl.py
+++ b/numba/cuda/mathimpl.py
@@ -106,14 +106,6 @@ def fp16_cos(x):
     return context.compile_internal(builder, fp16_cos, sig, args)
 
 
-@lower(math.tan, types.float16)
-def fp16_tan_impl(context, builder, sig, args):
-    def fp16_tan(x):
-        return cuda.fp16.hdiv(cuda.fp16.hsin(x), cuda.fp16.hcos(x))
-
-    return context.compile_internal(builder, fp16_tan, sig, args)
-
-
 @lower(math.log, types.float16)
 def fp16_log_impl(context, builder, sig, args):
     def fp16_log(x):
diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py
index 00de78066bb..2ee39ab8e2c 100644
--- a/numba/cuda/tests/cudapy/test_math.py
+++ b/numba/cuda/tests/cudapy/test_math.py
@@ -388,7 +388,6 @@ def test_math_cos(self):
     def test_math_fp16(self):
         self.unary_template_float16(math_sin, np.sin)
         self.unary_template_float16(math_cos, np.cos)
-        self.unary_template_float16(math_tan, np.tan)
         self.unary_template_float16(math_exp, np.exp)
         self.unary_template_float16(math_log, np.log, start=1)
         self.unary_template_float16(math_log2, np.log2, start=1)