From d7f58eddf8c78951b85bb036a441a3986df8042c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 12:18:04 +0000 Subject: [PATCH 01/21] CUDA Dispatcher: inherit from _DispatcherBase Current test status: ``` Ran 1266 tests in 102.919s FAILED (failures=1, errors=5, skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 64 +++++++++++++++------------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 94cc817f7fb..b76d6bae01b 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,4 +1,3 @@ -import collections import inspect import numpy as np import os @@ -8,9 +7,8 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import CompilingCounter, OmittedArg +from numba.core.dispatcher import _DispatcherBase from numba.core.errors import NumbaPerformanceWarning -from numba.core.typeconv.rules import default_type_manager from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof @@ -446,7 +444,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): +class Dispatcher(_DispatcherBase, serialize.ReduceMixin): ''' CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -465,54 +463,33 @@ class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, sigs, targetoptions): - self.py_func = py_func + self.typingctx = self.targetdescr.typing_context + self.targetctx = self.targetdescr.target_context + + pysig = utils.pysignature(py_func) + arg_count = len(pysig.parameters) + can_fallback = False # CUDA cannot fallback to object mode + + _DispatcherBase.__init__(self, arg_count, py_func, pysig, can_fallback, + exact_match_required=False) + + # TODO: Check if this fixes the cuda docstring jit issue + functools.update_wrapper(self, py_func) + + self.targetoptions = targetoptions + self.sigs = [] self.link = targetoptions.pop('link', (),) self._can_compile = True self._type = self._numba_type_ - # The compiling counter is only used when compiling device functions as - # it is used to detect recursion - recursion is not possible when - # compiling a kernel. 
- self._compiling_counter = CompilingCounter() - # Specializations for given sets of argument types self.specializations = {} - # A mapping of signatures to compile results - self.overloads = collections.OrderedDict() - - self.targetoptions = targetoptions - # defensive copy self.targetoptions['extensions'] = \ list(self.targetoptions.get('extensions', [])) - self.typingctx = self.targetdescr.typing_context - - self._tm = default_type_manager - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - argnames = tuple(pysig.parameters) - default_values = self.py_func.__defaults__ or () - defargs = tuple(OmittedArg(val) for val in default_values) - can_fallback = False # CUDA cannot fallback to object mode - - try: - lastarg = list(pysig.parameters.values())[-1] - except IndexError: - has_stararg = False - else: - has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL - - exact_match_required = False - - _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(), - arg_count, self._fold_args, argnames, - defargs, can_fallback, has_stararg, - exact_match_required) - if sigs: if len(sigs) > 1: raise TypeError("Only one signature supported at present") @@ -527,6 +504,13 @@ def __init__(self, py_func, sigs, targetoptions): if targetoptions.get('device'): self._register_device_function() + def _make_finalizer(self): + # Dummy finalizer whilst _DispatcherBase assumes the existence of a + # finalizer + def finalizer(): + pass + return finalizer + def _register_device_function(self): dispatcher = self pyfunc = self.py_func From 31c1a6436a0a0c00507c9481143e63cb463aa7ba Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 13:39:20 +0000 Subject: [PATCH 02/21] [WIP] Fix CUDA target tests --- numba/cuda/compiler.py | 25 ++++++++++++++++++++++++- numba/cuda/dispatcher.py | 20 ++++++++++++-------- numba/cuda/target.py | 2 +- numba/cuda/tests/cudapy/test_errors.py | 2 +- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 0002c6bea8b..deb154efb92 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,7 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - compile_result, Flags, Option) + Flags, Option, CompileResult, CR_FIELDS) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -29,6 +29,28 @@ class CUDAFlags(Flags): ) +class CUDACompileResult(CompileResult): + @property + def entry_point(self): + return id(self) + + +def compile_result(**kws): + keys = set(kws.keys()) + fieldset = set(CR_FIELDS) + badnames = keys - fieldset + if badnames: + raise NameError(*badnames) + missing = fieldset - keys + for k in missing: + kws[k] = None + # Avoid keeping alive traceback variables + err = kws['typing_error'] + if err is not None: + kws['typing_error'] = err.with_traceback(None) + return CUDACompileResult(**kws) + + @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): @@ -54,6 +76,7 @@ def run_pass(self, state): signature=signature, fndesc=lowered.fndesc, ) + return True diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index b76d6bae01b..0c03a2fc05e 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -731,13 +731,16 @@ def get_call_template(self, args, kws): return call_template, 
pysig, args, kws
 
-    def get_overload(self, sig):
-        # We give the id of the overload (a CompileResult) because this is used
-        # as a key into a dict of overloads, and this is the only small and
-        # unique property of a CompileResult on CUDA (c.f. the CPU target,
-        # which uses its entry_point, which is a pointer value).
-        args, return_type = sigutils.normalize_signature(sig)
-        return id(self.overloads[args])
+#    def get_overload(self, sig):
+#        # We give the id of the overload (a CompileResult) because this is used
+#        # as a key into a dict of overloads, and this is the only small and
+#        # unique property of a CompileResult on CUDA (c.f. the CPU target,
+#        # which uses its entry_point, which is a pointer value).
+#        args, return_type = sigutils.normalize_signature(sig)
+#        return id(self.overloads[args])
+
+    def __repr__(self):
+        return f"numba.cuda.dispatcher.Dispatcher({self.py_func})"
 
     def compile_device(self, args):
         """Compile the device function for the given argument types.
@@ -763,7 +766,8 @@ def compile_device(self, args):
 
             # The inserted function uses the id of the CompileResult as a key,
             # consistent with get_overload() above.
-            cres.target_context.insert_user_function(id(cres), cres.fndesc,
+            cres.target_context.insert_user_function(cres.entry_point,
+                                                     cres.fndesc,
                                                      [cres.library])
         else:
             cres = self.overloads[args]
diff --git a/numba/cuda/target.py b/numba/cuda/target.py
index 3d63dde49f0..59cf27eb4b8 100644
--- a/numba/cuda/target.py
+++ b/numba/cuda/target.py
@@ -42,7 +42,7 @@ def resolve_value_type(self, val):
         targetoptions['opt'] = targetoptions.get('opt', True)
         sigs = None
         from numba.cuda.dispatcher import Dispatcher
-        disp = Dispatcher(val, sigs, targetoptions)
+        disp = Dispatcher(val.py_func, sigs, targetoptions)
         # cache the device function for future use and to avoid
         # duplicated copy of the same function.
         val.__dispatcher = disp
diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py
index bbd8f5fd518..e13197f7d8f 100644
--- a/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba/cuda/tests/cudapy/test_errors.py
@@ -72,7 +72,7 @@ def kernel_func():
         kernel_func[1, 1]()
         excstr = str(raises.exception)
         self.assertIn("resolving callee type: "
-                      "type(<numba.cuda.dispatcher.Dispatcher object",
+                      "type(numba.cuda.dispatcher.Dispatcher",
                       excstr)
         self.assertIn("NameError: name 'floor' is not defined", excstr)
 

From: Graham Markall
Date: Fri, 4 Feb 2022 15:31:32 +0000
Subject: [PATCH 03/21] Some tidy-up

---
 numba/cuda/compiler.py   |  6 ++++++
 numba/cuda/dispatcher.py | 11 +----------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py
index deb154efb92..31dd4fb02c2 100644
--- a/numba/cuda/compiler.py
+++ b/numba/cuda/compiler.py
@@ -29,6 +29,12 @@ class CUDAFlags(Flags):
     )
 
 
+# FIXME: Update this comment
+# We give the id of the overload (a CompileResult) because this is used
+# as a key into a dict of overloads, and this is the only small and
+# unique property of a CompileResult on CUDA (c.f. 
the CPU target, -# # which uses its entry_point, which is a pointer value). -# args, return_type = sigutils.normalize_signature(sig) -# return id(self.overloads[args]) - + # XXX: Delete this and call the class CUDADispatcher def __repr__(self): return f"numba.cuda.dispatcher.Dispatcher({self.py_func})" @@ -764,8 +757,6 @@ def compile_device(self, args): inline=inline, nvvm_options=nvvm_options) self.overloads[args] = cres - # The inserted function uses the id of the CompileResult as a key, - # consistent with get_overload() above. cres.target_context.insert_user_function(cres.entry_point, cres.fndesc, [cres.library]) From 70dbacd9dbffebaa35bc7285388a2f9c4c3cd72c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:07:40 +0000 Subject: [PATCH 04/21] [WIP] CUDA Dispatcher inherits from uber_Dispatcher Test results: ``` Ran 1266 tests in 127.002s FAILED (failures=66, errors=17, skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 28 ++++++++++------------------ numba/cuda/target.py | 13 +++++++------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 4c3d18435c8..8b7d435d5cc 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -7,14 +7,14 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import _DispatcherBase +from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.errors import NumbaPerformanceWarning from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import compile_cuda +from numba.cuda.compiler import compile_cuda, CUDACompiler from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.cudadrv.libs import get_cudalib @@ -444,7 +444,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_DispatcherBase, serialize.ReduceMixin): +class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. 
When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -462,21 +462,13 @@ class Dispatcher(_DispatcherBase, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions): - self.typingctx = self.targetdescr.typing_context - self.targetctx = self.targetdescr.target_context - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - can_fallback = False # CUDA cannot fallback to object mode - - _DispatcherBase.__init__(self, arg_count, py_func, pysig, can_fallback, - exact_match_required=False) - + def __init__(self, py_func, sigs, targetoptions, + pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue - functools.update_wrapper(self, py_func) - self.targetoptions = targetoptions + super().__init__(py_func, pipeline_class=pipeline_class) + + # CUDA-specific stuff - hopefully some of it can be removed ASAP self.sigs = [] self.link = targetoptions.pop('link', (),) @@ -501,8 +493,8 @@ def __init__(self, py_func, sigs, targetoptions): self._can_compile = False - if targetoptions.get('device'): - self._register_device_function() + #if targetoptions.get('device'): + # self._register_device_function() def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 59cf27eb4b8..17b990c6816 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -2,8 +2,8 @@ import llvmlite.binding as ll from llvmlite import ir -from numba.core import (typing, types, dispatcher, debuginfo, itanium_mangler, - cgutils) +from numba.core import typing, types, debuginfo, itanium_mangler, cgutils +from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -27,8 +27,10 @@ def load_additional_registries(self): self.install_registry(libdevicedecl.registry) def resolve_value_type(self, val): - # treat dispatcher object as another device function - if isinstance(val, dispatcher.Dispatcher): + # treat other dispatcher object as another device function + from numba.cuda.dispatcher import Dispatcher as CUDADispatcher + if (isinstance(val, uber_Dispatcher) and not + isinstance(val, CUDADispatcher)): try: # use cached device function val = val.__dispatcher @@ -41,8 +43,7 @@ def resolve_value_type(self, val): targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) sigs = None - from numba.cuda.dispatcher import Dispatcher - disp = Dispatcher(val.py_func, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, sigs, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__dispatcher = disp From 922624fc2fc7c39a285fee34bbfef879c435ad04 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:15:16 +0000 Subject: [PATCH 05/21] Wire in target options in CUDA dispatcher Test results now: ``` Ran 1266 tests in 105.203s OK (skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 8b7d435d5cc..5f99680b7a5 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -466,7 +466,8 @@ def __init__(self, py_func, sigs, targetoptions, pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue - super().__init__(py_func, pipeline_class=pipeline_class) + super().__init__(py_func, targetoptions=targetoptions, + pipeline_class=pipeline_class) # CUDA-specific stuff - hopefully some of it can be removed ASAP From ed2e17a67660b2eca60de83ad05be60828be911f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:33:51 +0000 Subject: [PATCH 06/21] Delete some dead code --- numba/cuda/dispatcher.py | 43 ---------------------------------------- 1 file changed, 43 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 5f99680b7a5..aba0a34b21f 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,4 +1,3 @@ -import inspect import numpy as np import os import sys @@ -9,7 +8,6 @@ from numba.core.compiler_lock import global_compiler_lock from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.errors import NumbaPerformanceWarning -from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device @@ -494,9 +492,6 @@ def __init__(self, py_func, sigs, targetoptions, self._can_compile = False - #if targetoptions.get('device'): - # self._register_device_function() - def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a # finalizer @@ -504,48 +499,10 @@ def finalizer(): pass return finalizer - def _register_device_function(self): - dispatcher = self - pyfunc = self.py_func - - class device_function_template(AbstractTemplate): - key = dispatcher - - def generic(self, args, kws): - assert not kws - return dispatcher.compile(args).signature - - def get_template_info(cls): - basepath = os.path.dirname( - os.path.dirname(os.path.dirname(cuda.__file__))) - code, firstlineno = inspect.getsourcelines(pyfunc) - path = inspect.getsourcefile(pyfunc) - sig = str(utils.pysignature(pyfunc)) - info = { - 'kind': "overload", - 'name': getattr(cls.key, '__name__', "unknown"), - 'sig': sig, - 'filename': utils.safe_relpath(path, start=basepath), - 'lines': (firstlineno, firstlineno + len(code) - 1), - 'docstring': pyfunc.__doc__ - } - return info - - from .descriptor import cuda_target - typingctx = cuda_target.typing_context - typingctx.insert_user_function(dispatcher, device_function_template) - @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) - @property - def is_compiling(self): - """ - Whether a specialization is currently being compiled. 
- """ - return self._compiling_counter - def configure(self, griddim, blockdim, stream=0, sharedmem=0): griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim) return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem) From 8238610e1edd886ed2e57d0df438cb03479991f1 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:07:10 +0000 Subject: [PATCH 07/21] Remove sigs from cuda dispatcher --- numba/cuda/decorators.py | 15 +++++++++++---- numba/cuda/dispatcher.py | 40 ++++++++++++++++++++-------------------- numba/cuda/target.py | 3 +-- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 4a4bed69f83..d49826542e0 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -97,7 +97,16 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device - return Dispatcher(func, [func_or_sig], targetoptions=targetoptions) + disp = Dispatcher(func, targetoptions=targetoptions) + if device: + disp.compile_device(argtypes) + disp._specialized = True + else: + disp.compile(argtypes) + disp._specialized = True + disp.disable_compile() + + return disp return _jit else: @@ -124,9 +133,7 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device - sigs = None - return Dispatcher(func_or_sig, sigs, - targetoptions=targetoptions) + return Dispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index aba0a34b21f..3211d020fff 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -460,8 +460,7 @@ class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions, - pipeline_class=CUDACompiler): + def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue super().__init__(py_func, targetoptions=targetoptions, @@ -469,9 +468,9 @@ def __init__(self, py_func, sigs, targetoptions, # CUDA-specific stuff - hopefully some of it can be removed ASAP - self.sigs = [] + self._specialized = False self.link = targetoptions.pop('link', (),) - self._can_compile = True + #self._can_compile = True self._type = self._numba_type_ # Specializations for given sets of argument types @@ -481,16 +480,16 @@ def __init__(self, py_func, sigs, targetoptions, self.targetoptions['extensions'] = \ list(self.targetoptions.get('extensions', [])) - if sigs: - if len(sigs) > 1: - raise TypeError("Only one signature supported at present") - if targetoptions.get('device'): - argtypes, restype = sigutils.normalize_signature(sigs[0]) - self.compile_device(argtypes) - else: - self.compile(sigs[0]) + #if sigs: + # if len(sigs) > 1: + # raise TypeError("Only one signature supported at present") + # if targetoptions.get('device'): + # argtypes, restype = sigutils.normalize_signature(sigs[0]) + # self.compile_device(argtypes) + # else: + # self.compile(sigs[0]) - self._can_compile = False + # self._can_compile = False def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a @@ -622,8 +621,10 @@ def specialize(self, *args): targetoptions = self.targetoptions targetoptions['link'] = self.link - specialization = Dispatcher(self.py_func, [types.void(*argtypes)], - targetoptions) + specialization = Dispatcher(self.py_func, 
targetoptions=targetoptions) + specialization.compile(argtypes) + specialization.disable_compile() + specialization._specialized = True self.specializations[cc, argtypes] = specialization return specialization @@ -635,7 +636,7 @@ def specialized(self): """ True if the Dispatcher has been specialized. """ - return len(self.sigs) == 1 and not self._can_compile + return self._specialized def get_regs_per_thread(self, signature=None): ''' @@ -739,7 +740,6 @@ def compile(self, sig): self.overloads[argtypes] = kernel kernel.bind() - self.sigs.append(sig) return kernel def inspect_llvm(self, signature=None): @@ -832,11 +832,11 @@ def bind(self): defn.bind() @classmethod - def _rebuild(cls, py_func, sigs, targetoptions): + def _rebuild(cls, py_func, targetoptions): """ Rebuild an instance. """ - instance = cls(py_func, sigs, targetoptions) + instance = cls(py_func, targetoptions) return instance def _reduce_states(self): @@ -844,5 +844,5 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, sigs=self.sigs, + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 17b990c6816..c7a4e4baf11 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -42,8 +42,7 @@ def resolve_value_type(self, val): targetoptions['device'] = True targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) - sigs = None - disp = CUDADispatcher(val.py_func, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. val.__dispatcher = disp From 53e98fb828355eb66460932546dfda3bbf59a637 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:30:45 +0000 Subject: [PATCH 08/21] Some refactoring --- numba/cuda/decorators.py | 5 +++++ numba/cuda/dispatcher.py | 20 +------------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index d49826542e0..43290c89fea 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -69,6 +69,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug fastmath = kws.get('fastmath', False) + extensions = kws.get('extensions', []) if debug and opt: msg = ("debug=True with opt=True (the default) " @@ -97,7 +98,10 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device + targetoptions['extensions'] = extensions disp = Dispatcher(func, targetoptions=targetoptions) + # TODO: Support multiple signatures by compiling in a loop over + # signatures. 
if device: disp.compile_device(argtypes) disp._specialized = True @@ -133,6 +137,7 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device + targetoptions['extensions'] = extensions return Dispatcher(func_or_sig, targetoptions=targetoptions) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 3211d020fff..bc7d68119bc 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -469,28 +469,11 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # CUDA-specific stuff - hopefully some of it can be removed ASAP self._specialized = False - self.link = targetoptions.pop('link', (),) - #self._can_compile = True self._type = self._numba_type_ # Specializations for given sets of argument types self.specializations = {} - # defensive copy - self.targetoptions['extensions'] = \ - list(self.targetoptions.get('extensions', [])) - - #if sigs: - # if len(sigs) > 1: - # raise TypeError("Only one signature supported at present") - # if targetoptions.get('device'): - # argtypes, restype = sigutils.normalize_signature(sigs[0]) - # self.compile_device(argtypes) - # else: - # self.compile(sigs[0]) - - # self._can_compile = False - def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a # finalizer @@ -620,7 +603,6 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - targetoptions['link'] = self.link specialization = Dispatcher(self.py_func, targetoptions=targetoptions) specialization.compile(argtypes) specialization.disable_compile() @@ -730,7 +712,7 @@ def compile(self, sig): if kernel is None: if not self._can_compile: raise RuntimeError("Compilation disabled") - kernel = _Kernel(self.py_func, argtypes, link=self.link, + kernel = _Kernel(self.py_func, argtypes, **self.targetoptions) # Inspired by _DispatcherBase.add_overload, but differs slightly # because we're inserting a _Kernel object instead of a compiled From e307e9a732013161b91dddeed8cbadd744619d2b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:41:16 +0000 Subject: [PATCH 09/21] Some renaming to closer align ForAll with reality --- numba/cuda/dispatcher.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index bc7d68119bc..dd0cf9d94ed 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -374,11 +374,11 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): - def __init__(self, kernel, ntasks, tpb, stream, sharedmem): + def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: raise ValueError("Can't create ForAll with negative task count: %s" % ntasks) - self.kernel = kernel + self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb self.stream = stream @@ -388,16 +388,17 @@ def __call__(self, *args): if self.ntasks == 0: return - if self.kernel.specialized: - kernel = self.kernel + if self.dispatcher.specialized: + specialized = self.dispatcher else: - kernel = self.kernel.specialize(*args) - blockdim = self._compute_thread_per_block(kernel) + specialized = self.dispatcher.specialize(*args) + blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return kernel[griddim, blockdim, self.stream, self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, + 
self.sharedmem](*args) - def _compute_thread_per_block(self, kernel): + def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block # Prefer user-specified config if tpb != 0: @@ -405,11 +406,11 @@ def _compute_thread_per_block(self, kernel): # Else, ask the driver to give a good config else: ctx = get_context() - # Kernel is specialized, so there's only one definition - get it so - # we can get the cufunc from the code library - defn = next(iter(kernel.overloads.values())) + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( - func=defn._codelibrary.get_cufunc(), + func=kernel._codelibrary.get_cufunc(), b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, @@ -495,7 +496,7 @@ def __getitem__(self, args): return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): - """Returns a 1D-configured kernel for a given number of tasks. + """Returns a 1D-configured dispatcher for a given number of tasks. This assumes that: @@ -507,11 +508,12 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): :param ntasks: The number of tasks. :param tpb: The size of a block. An appropriate value is chosen if this parameter is not supplied. - :param stream: The stream on which the configured kernel will be + :param stream: The stream on which the configured dispatcher will be launched. :param sharedmem: The number of bytes of dynamic shared memory required by the kernel. - :return: A configured kernel, ready to launch on a set of arguments.""" + :return: A configured dispatcher, ready to launch on a set of + arguments.""" return ForAll(self, ntasks, tpb=tpb, stream=stream, sharedmem=sharedmem) From 703d22255ebc9a15ba5e79b9819bd66da67b6481 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:47:39 +0000 Subject: [PATCH 10/21] Delete _search_new_conversions and nopython_signatures from CUDA dispatcher --- numba/cuda/dispatcher.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index dd0cf9d94ed..b74a430d560 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -43,6 +43,10 @@ def __init__(self, py_func, argtypes, link=None, debug=False, super().__init__() + # Emulate a CompileResult so that _DispatcherBase.nopython_signatures + # can be used as-is + self.objectmode = False + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -559,17 +563,6 @@ def _compile_for_args(self, *args, **kws): argtypes = [self.typeof_pyval(a) for a in args] return self.compile(tuple(argtypes)) - def _search_new_conversions(self, *args, **kws): - # Based on _DispatcherBase._search_new_conversions - assert not kws - args = [self.typeof_pyval(a) for a in args] - found = False - for sig in self.nopython_signatures: - conv = self.typingctx.install_possible_conversions(args, sig.args) - if conv: - found = True - return found - def typeof_pyval(self, val): # Based on _DispatcherBase.typeof_pyval, but differs from it to support # the CUDA Array Interface. 
@@ -584,11 +577,6 @@ def typeof_pyval(self, val): else: raise - @property - def nopython_signatures(self): - # Based on _DispatcherBase.nopython_signatures - return [kernel.signature for kernel in self.overloads.values()] - def specialize(self, *args): ''' Create a new instance of this dispatcher specialized for the given From 13aa50bdb3d7c56d1c41213028bb14dbd03ab553 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:53:31 +0000 Subject: [PATCH 11/21] Remove CUDA Dispatchers disable_compile and add a note to get_call_template --- numba/cuda/dispatcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index b74a430d560..fbac4426f02 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -600,9 +600,6 @@ def specialize(self, *args): self.specializations[cc, argtypes] = specialization return specialization - def disable_compile(self, val=True): - self._can_compile = not val - @property def specialized(self): """ @@ -631,6 +628,9 @@ def get_regs_per_thread(self, signature=None): def get_call_template(self, args, kws): # Copied and simplified from _DispatcherBase.get_call_template. + # + # This seems to have some necessary differences to the _DispatcherBase + # version to force casts where necessay? XXX """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. From a11269ea19598edfb364ad4f0b6c2a98858d3380 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 21:57:05 +0000 Subject: [PATCH 12/21] Clarify comment on CUDA Compile Result --- numba/cuda/compiler.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 31dd4fb02c2..dc24d2a8919 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -29,11 +29,21 @@ class CUDAFlags(Flags): ) -# FIXME: Update this comment -# We give the id of the overload (a CompileResult) because this is used -# as a key into a dict of overloads, and this is the only small and -# unique property of a CompileResult on CUDA (c.f. the CPU target, -# which uses its entry_point, which is a pointer value). +# The CUDACompileResult has a specially-defined entry point equal to its id. +# This is because the entry point is used as a key into a dict of overloads by +# the base dispatcher. The id of the CCR is the only small and unique property +# of a CompileResult in the CUDA target (c.f. the CPU target, which uses its +# entry_point, which is a pointer value). +# +# This does feel a little hackish, and there are two ways in which this could +# be improved: +# +# 1. We could change the core of Numba so that each CompileResult has its own +# unique ID that can be used as a key - e.g. a count, similar to the way in +# which types have unique counts. +# 2. At some future time when kernel launch uses a compiled function, the entry +# point will no longer need to be a synthetic value, but will instead be a +# pointer to the compiled function as in the CPU target. 
class CUDACompileResult(CompileResult): @property From 18c553f3b1aa6e9f05a8efd68ca7505cdfc96639 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:08:08 +0000 Subject: [PATCH 13/21] Remove duplication in compile_result functions --- numba/core/compiler.py | 17 +++++++++++------ numba/cuda/compiler.py | 25 +++++++------------------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/numba/core/compiler.py b/numba/core/compiler.py index 0bf12b0dbf1..85a5fdc8265 100644 --- a/numba/core/compiler.py +++ b/numba/core/compiler.py @@ -269,20 +269,25 @@ def dump(self, tab=''): ]) -def compile_result(**kws): - keys = set(kws.keys()) +def sanitize_compile_result_entries(entries): + keys = set(entries.keys()) fieldset = set(CR_FIELDS) badnames = keys - fieldset if badnames: raise NameError(*badnames) missing = fieldset - keys for k in missing: - kws[k] = None + entries[k] = None # Avoid keeping alive traceback variables - err = kws['typing_error'] + err = entries['typing_error'] if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CompileResult(**kws) + entries['typing_error'] = err.with_traceback(None) + return entries + + +def compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CompileResult(**entries) def compile_isolated(func, args, return_type=None, flags=DEFAULT_FLAGS, diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index dc24d2a8919..cb2cfcb4f21 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,8 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler -from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - Flags, Option, CompileResult, CR_FIELDS) +from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, + DefaultPassBuilder, Flags, Option, + CompileResult) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -51,20 +52,9 @@ def entry_point(self): return id(self) -def compile_result(**kws): - keys = set(kws.keys()) - fieldset = set(CR_FIELDS) - badnames = keys - fieldset - if badnames: - raise NameError(*badnames) - missing = fieldset - keys - for k in missing: - kws[k] = None - # Avoid keeping alive traceback variables - err = kws['typing_error'] - if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CUDACompileResult(**kws) +def cuda_compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CUDACompileResult(**entries) @register_pass(mutates_CFG=True, analysis_only=False) @@ -82,7 +72,7 @@ def run_pass(self, state): lowered = state['cr'] signature = typing.signature(state.return_type, *state.args) - state.cr = compile_result( + state.cr = cuda_compile_result( typing_context=state.typingctx, target_context=state.targetctx, typing_error=state.status.fail_reason, @@ -92,7 +82,6 @@ def run_pass(self, state): signature=signature, fndesc=lowered.fndesc, ) - return True From 3ebcf2637744f2ef8a6a7c1bc7273ebb2a051b77 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:12:32 +0000 Subject: [PATCH 14/21] Refactor / tidy up decorators.py --- numba/cuda/decorators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 43290c89fea..ca25767230e 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ 
-99,15 +99,15 @@ def _jit(func): targetoptions['fastmath'] = fastmath targetoptions['device'] = device targetoptions['extensions'] = extensions + disp = Dispatcher(func, targetoptions=targetoptions) - # TODO: Support multiple signatures by compiling in a loop over - # signatures. + if device: disp.compile_device(argtypes) - disp._specialized = True else: disp.compile(argtypes) - disp._specialized = True + + disp._specialized = True disp.disable_compile() return disp From 4f5b36152deeb27853346bbe3019f0fe0d274442 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:29:56 +0000 Subject: [PATCH 15/21] Rename CUDA dispatcher to CUDADispatcher Also update some comments. --- numba/cuda/decorators.py | 6 +++--- numba/cuda/dispatcher.py | 19 ++++++++----------- numba/cuda/initialize.py | 4 ++-- numba/cuda/target.py | 6 +++--- numba/cuda/tests/cudapy/test_errors.py | 4 +--- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index ca25767230e..ba901a7c344 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -2,7 +2,7 @@ from numba.core import types, config, sigutils from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning from numba.cuda.compiler import declare_device_function -from numba.cuda.dispatcher import Dispatcher +from numba.cuda.dispatcher import CUDADispatcher from numba.cuda.simulator.kernel import FakeCUDAKernel @@ -100,7 +100,7 @@ def _jit(func): targetoptions['device'] = device targetoptions['extensions'] = extensions - disp = Dispatcher(func, targetoptions=targetoptions) + disp = CUDADispatcher(func, targetoptions=targetoptions) if device: disp.compile_device(argtypes) @@ -138,7 +138,7 @@ def autojitwrapper(func): targetoptions['fastmath'] = fastmath targetoptions['device'] = device targetoptions['extensions'] = extensions - return Dispatcher(func_or_sig, targetoptions=targetoptions) + return CUDADispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index fbac4426f02..0f041ec5529 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -6,7 +6,7 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import Dispatcher as uber_Dispatcher +from numba.core.dispatcher import Dispatcher from numba.core.errors import NumbaPerformanceWarning from numba.core.typing.typeof import Purpose, typeof @@ -447,7 +447,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): +class CUDADispatcher(Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -593,7 +593,8 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - specialization = Dispatcher(self.py_func, targetoptions=targetoptions) + specialization = CUDADispatcher(self.py_func, + targetoptions=targetoptions) specialization.compile(argtypes) specialization.disable_compile() specialization._specialized = True @@ -627,10 +628,10 @@ def get_regs_per_thread(self, signature=None): for sig, overload in self.overloads.items()} def get_call_template(self, args, kws): - # Copied and simplified from _DispatcherBase.get_call_template. 
- # - # This seems to have some necessary differences to the _DispatcherBase - # version to force casts where necessay? XXX + # Originally copied from _DispatcherBase.get_call_template. This + # version deviates slightly from the _DispatcherBase version in order + # to force casts when calling device functions. See e.g. + # TestDeviceFunc.test_device_casting, added in PR #7496. """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. @@ -654,10 +655,6 @@ def get_call_template(self, args, kws): return call_template, pysig, args, kws - # XXX: Delete this and call the class CUDADispatcher - def __repr__(self): - return f"numba.cuda.dispatcher.Dispatcher({self.py_func})" - def compile_device(self, args): """Compile the device function for the given argument types. diff --git a/numba/cuda/initialize.py b/numba/cuda/initialize.py index c9d1bc13d6e..0c9343aaa29 100644 --- a/numba/cuda/initialize.py +++ b/numba/cuda/initialize.py @@ -3,7 +3,7 @@ def initialize_all(): import numba.cuda.models # noqa: F401 from numba import cuda - from numba.cuda.dispatcher import Dispatcher + from numba.cuda.dispatcher import CUDADispatcher from numba.core.target_extension import (target_registry, dispatcher_registry, jit_registry) @@ -14,4 +14,4 @@ def cuda_jit_device(*args, **kwargs): cuda_target = target_registry["cuda"] jit_registry[cuda_target] = cuda_jit_device - dispatcher_registry[cuda_target] = Dispatcher + dispatcher_registry[cuda_target] = CUDADispatcher diff --git a/numba/cuda/target.py b/numba/cuda/target.py index c7a4e4baf11..c1800c558cb 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -3,7 +3,7 @@ from llvmlite import ir from numba.core import typing, types, debuginfo, itanium_mangler, cgutils -from numba.core.dispatcher import Dispatcher as uber_Dispatcher +from numba.core.dispatcher import Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -28,8 +28,8 @@ def load_additional_registries(self): def resolve_value_type(self, val): # treat other dispatcher object as another device function - from numba.cuda.dispatcher import Dispatcher as CUDADispatcher - if (isinstance(val, uber_Dispatcher) and not + from numba.cuda.dispatcher import CUDADispatcher + if (isinstance(val, Dispatcher) and not isinstance(val, CUDADispatcher)): try: # use cached device function diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index e13197f7d8f..c20fb8dccdf 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -71,9 +71,7 @@ def kernel_func(): with self.assertRaises(TypingError) as raises: kernel_func[1, 1]() excstr = str(raises.exception) - self.assertIn("resolving callee type: " - "type(numba.cuda.dispatcher.Dispatcher", - excstr) + self.assertIn("resolving callee type: type(CUDADispatcher", excstr) self.assertIn("NameError: name 'floor' is not defined", excstr) From 078f2d958a0b8a0a70e4c349a053c17362175ed0 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:56:35 +0000 Subject: [PATCH 16/21] Clarify some comments --- numba/cuda/dispatcher.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 0f041ec5529..bf966b26efb 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -43,8 +43,11 @@ def __init__(self, py_func, 
argtypes, link=None, debug=False, super().__init__() - # Emulate a CompileResult so that _DispatcherBase.nopython_signatures - # can be used as-is + # _DispatcherBase.nopython_signatures() expects this attribute to be + # present, because it assumes an overload is a CompileResult. In the + # CUDA target, _Kernel instances are stored instead, so we provide this + # attribute here to avoid duplicating nopython_signatures() in the CUDA + # target with slight modifications. self.objectmode = False self.py_func = py_func @@ -466,17 +469,20 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): - # TODO: Check if this fixes the cuda docstring jit issue - super().__init__(py_func, targetoptions=targetoptions, pipeline_class=pipeline_class) + self._type = self._numba_type_ - # CUDA-specific stuff - hopefully some of it can be removed ASAP + # The following properties are for specialization of CUDADisptachers. A + # specialized CUDADispatcher is one that is compiled for exactly one + # set of argument types, and bypasses some argument type checking for + # faster kernel launches. + # Is this a specialized dispatcher? self._specialized = False - self._type = self._numba_type_ - # Specializations for given sets of argument types + # If we produced specialized dispatchers, we cache them for each set of + # argument types self.specializations = {} def _make_finalizer(self): From 778f7e23846c8b0a92491e4ed84b6f312b1ebf50 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:11:23 +0000 Subject: [PATCH 17/21] Fix ref to CUDADispatcher in docs --- docs/source/cuda-reference/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 71476ab991f..4095b5b1767 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -56,7 +56,7 @@ This is similar to launch configuration in CUDA C/C++: Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: -.. autoclass:: numba.cuda.dispatcher.Dispatcher +.. autoclass:: numba.cuda.dispatcher.CUDADispatcher :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, get_regs_per_thread, specialize, specialized, extensions, forall From 5d6384c58e531509fca7a190fc17357397ec2558 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:12:27 +0000 Subject: [PATCH 18/21] CUDA Dispatcher: use default finalizer - For device functions, this works exactly like on the CPU target because device functions are CompileResult objects that are inserted into the target context (and therefore need removing by the finalizer). - For kernels, we give them a dummy entry point because they were never inserted and don't need removing (similar to with object mode functions). --- numba/cuda/dispatcher.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index bf966b26efb..c4ea9fbb95e 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -50,6 +50,14 @@ def __init__(self, py_func, argtypes, link=None, debug=False, # target with slight modifications. self.objectmode = False + # The finalizer constructed by _DispatcherBase._make_finalizer also + # expects overloads to be a CompileResult. It uses the entry_point to + # remove a CompileResult from a target context. 
However, since we never + # insert kernels into a target context (there is no need because they + # cannot be called by other functions, only through the dispatcher) it + # suffices to pretend we have an entry point of None. + self.entry_point = None + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -485,13 +493,6 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # argument types self.specializations = {} - def _make_finalizer(self): - # Dummy finalizer whilst _DispatcherBase assumes the existence of a - # finalizer - def finalizer(): - pass - return finalizer - @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) From 4d9cff78dc97e291cd5fa599e864ed80d91595f7 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:20:20 +0000 Subject: [PATCH 19/21] Add test for Issue #5902 --- numba/cuda/tests/cudapy/test_dispatcher.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/numba/cuda/tests/cudapy/test_dispatcher.py b/numba/cuda/tests/cudapy/test_dispatcher.py index a08cd4f474b..7090db221ac 100644 --- a/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba/cuda/tests/cudapy/test_dispatcher.py @@ -272,6 +272,21 @@ def pi_sin_array(x, n): self.assertIsInstance(regs_per_thread, int) self.assertGreater(regs_per_thread, 0) + def test_dispatcher_docstring(self): + # Ensure that CUDA-jitting a function preserves its docstring. See + # Issue #5902: https://github.com/numba/numba/issues/5902 + + @cuda.jit + def add_kernel(a, b): + """Add two integers, kernel version""" + + @cuda.jit(device=True) + def add_device(a, b): + """Add two integers, device version""" + + self.assertEqual("Add two integers, kernel version", add_kernel.__doc__) + self.assertEqual("Add two integers, device version", add_device.__doc__) + if __name__ == '__main__': unittest.main() From ef5b4dce41cfbf6dfe4269626b9afb7b775f7fc1 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Fri, 11 Feb 2022 09:47:13 +0000 Subject: [PATCH 20/21] Fix typos (PR #7815 review) Co-authored-by: stuartarchibald --- numba/cuda/compiler.py | 4 ++-- numba/cuda/dispatcher.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index cb2cfcb4f21..12d6c696a3e 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -30,10 +30,10 @@ class CUDAFlags(Flags): ) -# The CUDACompileResult has a specially-defined entry point equal to its id. +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its id. # This is because the entry point is used as a key into a dict of overloads by # the base dispatcher. The id of the CCR is the only small and unique property -# of a CompileResult in the CUDA target (c.f. the CPU target, which uses its +# of a CompileResult in the CUDA target (cf. the CPU target, which uses its # entry_point, which is a pointer value). # # This does feel a little hackish, and there are two ways in which this could diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index c4ea9fbb95e..0eaa5c124ae 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -481,7 +481,7 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): pipeline_class=pipeline_class) self._type = self._numba_type_ - # The following properties are for specialization of CUDADisptachers. A + # The following properties are for specialization of CUDADispatchers. 
A # specialized CUDADispatcher is one that is compiled for exactly one # set of argument types, and bypasses some argument type checking for # faster kernel launches. From 499623fdb62663fb8c4fa4f96cb52901e0f4517e Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 11 Feb 2022 09:51:39 +0000 Subject: [PATCH 21/21] Fix line length in cuda/compiler.py --- numba/cuda/compiler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 12d6c696a3e..bb7ea3e4f6b 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -30,11 +30,11 @@ class CUDAFlags(Flags): ) -# The CUDACompileResult (CCR) has a specially-defined entry point equal to its id. -# This is because the entry point is used as a key into a dict of overloads by -# the base dispatcher. The id of the CCR is the only small and unique property -# of a CompileResult in the CUDA target (cf. the CPU target, which uses its -# entry_point, which is a pointer value). +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its +# id. This is because the entry point is used as a key into a dict of +# overloads by the base dispatcher. The id of the CCR is the only small and +# unique property of a CompileResult in the CUDA target (cf. the CPU target, +# which uses its entry_point, which is a pointer value). # # This does feel a little hackish, and there are two ways in which this could # be improved:
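
Two ideas that recur throughout this series can be illustrated outside the
patches themselves.

First, the id()-based entry point introduced in PATCH 02/21 and documented in
PATCH 12/21 and PATCH 21/21. The sketch below is not part of the series:
`FakeCompileResult` and the `overloads` dict are simplified stand-ins for
`numba.core.compiler.CompileResult` and the base dispatcher's internal
bookkeeping, kept only to show why each overload needs a small, unique,
hashable key.

```python
class FakeCompileResult:
    """Stand-in for a CUDA CompileResult, which has no host code address."""

    def __init__(self, signature):
        self.signature = signature

    @property
    def entry_point(self):
        # There is no compiled host function to point at for a CUDA
        # overload, so the object's id() serves as the small, unique,
        # hashable key the base dispatcher expects (the CPU target returns
        # a real function pointer here instead).
        return id(self)


# The base dispatcher keys maps of overloads by entry_point; id() guarantees
# distinct keys for distinct live CompileResult objects.
overloads = {}
for sig in ["(int64,)", "(float64,)"]:
    cres = FakeCompileResult(sig)
    overloads[cres.entry_point] = cres

assert len(overloads) == 2
```

Second, the docstring preservation verified by the PATCH 19/21 test for Issue
#5902. The fix comes from routing construction through the base dispatcher's
`__init__()` (PATCH 04/21), which applies `functools.update_wrapper()` to the
dispatcher object. A minimal sketch of that mechanism, independent of Numba
(`fake_jit` and `FakeDispatcher` are hypothetical names, not Numba APIs):

```python
import functools


def fake_jit(py_func):
    class FakeDispatcher:
        def __init__(self, fn):
            self.py_func = fn
            # Copies __doc__, __name__, __module__, etc. from the wrapped
            # function onto this object, which is what makes kernel.__doc__
            # and help(kernel) work after jitting.
            functools.update_wrapper(self, fn)

    return FakeDispatcher(py_func)


@fake_jit
def add(a, b):
    """Add two integers"""


assert add.__doc__ == "Add two integers"
assert add.__name__ == "add"
```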