diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 71476ab991f..4095b5b1767 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -56,7 +56,7 @@ This is similar to launch configuration in CUDA C/C++: Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: -.. autoclass:: numba.cuda.dispatcher.Dispatcher +.. autoclass:: numba.cuda.dispatcher.CUDADispatcher :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, get_regs_per_thread, specialize, specialized, extensions, forall diff --git a/numba/core/compiler.py b/numba/core/compiler.py index 0bf12b0dbf1..85a5fdc8265 100644 --- a/numba/core/compiler.py +++ b/numba/core/compiler.py @@ -269,20 +269,25 @@ def dump(self, tab=''): ]) -def compile_result(**kws): - keys = set(kws.keys()) +def sanitize_compile_result_entries(entries): + keys = set(entries.keys()) fieldset = set(CR_FIELDS) badnames = keys - fieldset if badnames: raise NameError(*badnames) missing = fieldset - keys for k in missing: - kws[k] = None + entries[k] = None # Avoid keeping alive traceback variables - err = kws['typing_error'] + err = entries['typing_error'] if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CompileResult(**kws) + entries['typing_error'] = err.with_traceback(None) + return entries + + +def compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CompileResult(**entries) def compile_isolated(func, args, return_type=None, flags=DEFAULT_FLAGS, diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 0002c6bea8b..bb7ea3e4f6b 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,8 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler -from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - compile_result, Flags, Option) +from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, + DefaultPassBuilder, Flags, Option, + CompileResult) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -29,6 +30,33 @@ class CUDAFlags(Flags): ) +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its +# id. This is because the entry point is used as a key into a dict of +# overloads by the base dispatcher. The id of the CCR is the only small and +# unique property of a CompileResult in the CUDA target (cf. the CPU target, +# which uses its entry_point, which is a pointer value). +# +# This does feel a little hackish, and there are two ways in which this could +# be improved: +# +# 1. We could change the core of Numba so that each CompileResult has its own +# unique ID that can be used as a key - e.g. a count, similar to the way in +# which types have unique counts. +# 2. At some future time when kernel launch uses a compiled function, the entry +# point will no longer need to be a synthetic value, but will instead be a +# pointer to the compiled function as in the CPU target. 
+ +class CUDACompileResult(CompileResult): + @property + def entry_point(self): + return id(self) + + +def cuda_compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CUDACompileResult(**entries) + + @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): @@ -44,7 +72,7 @@ def run_pass(self, state): lowered = state['cr'] signature = typing.signature(state.return_type, *state.args) - state.cr = compile_result( + state.cr = cuda_compile_result( typing_context=state.typingctx, target_context=state.targetctx, typing_error=state.status.fail_reason, diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 4a4bed69f83..ba901a7c344 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -2,7 +2,7 @@ from numba.core import types, config, sigutils from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning from numba.cuda.compiler import declare_device_function -from numba.cuda.dispatcher import Dispatcher +from numba.cuda.dispatcher import CUDADispatcher from numba.cuda.simulator.kernel import FakeCUDAKernel @@ -69,6 +69,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug fastmath = kws.get('fastmath', False) + extensions = kws.get('extensions', []) if debug and opt: msg = ("debug=True with opt=True (the default) " @@ -97,7 +98,19 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device - return Dispatcher(func, [func_or_sig], targetoptions=targetoptions) + targetoptions['extensions'] = extensions + + disp = CUDADispatcher(func, targetoptions=targetoptions) + + if device: + disp.compile_device(argtypes) + else: + disp.compile(argtypes) + + disp._specialized = True + disp.disable_compile() + + return disp return _jit else: @@ -124,9 +137,8 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device - sigs = None - return Dispatcher(func_or_sig, sigs, - targetoptions=targetoptions) + targetoptions['extensions'] = extensions + return CUDADispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index ad9d0d25223..e55c4bf173f 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,5 +1,3 @@ -import collections -import inspect import numpy as np import os import sys @@ -8,15 +6,13 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import CompilingCounter, OmittedArg +from numba.core.dispatcher import Dispatcher from numba.core.errors import NumbaPerformanceWarning -from numba.core.typeconv.rules import default_type_manager -from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import compile_cuda +from numba.cuda.compiler import compile_cuda, CUDACompiler from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.cudadrv.libs import get_cudalib @@ -47,6 +43,21 @@ def __init__(self, py_func, argtypes, link=None, debug=False, super().__init__() + # _DispatcherBase.nopython_signatures() expects this attribute to be + # present, because it assumes an overload 
is a CompileResult. In the + # CUDA target, _Kernel instances are stored instead, so we provide this + # attribute here to avoid duplicating nopython_signatures() in the CUDA + # target with slight modifications. + self.objectmode = False + + # The finalizer constructed by _DispatcherBase._make_finalizer also + # expects overloads to be a CompileResult. It uses the entry_point to + # remove a CompileResult from a target context. However, since we never + # insert kernels into a target context (there is no need because they + # cannot be called by other functions, only through the dispatcher) it + # suffices to pretend we have an entry point of None. + self.entry_point = None + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -386,11 +397,11 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): - def __init__(self, kernel, ntasks, tpb, stream, sharedmem): + def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: raise ValueError("Can't create ForAll with negative task count: %s" % ntasks) - self.kernel = kernel + self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb self.stream = stream @@ -400,16 +411,17 @@ def __call__(self, *args): if self.ntasks == 0: return - if self.kernel.specialized: - kernel = self.kernel + if self.dispatcher.specialized: + specialized = self.dispatcher else: - kernel = self.kernel.specialize(*args) - blockdim = self._compute_thread_per_block(kernel) + specialized = self.dispatcher.specialize(*args) + blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return kernel[griddim, blockdim, self.stream, self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, + self.sharedmem](*args) - def _compute_thread_per_block(self, kernel): + def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block # Prefer user-specified config if tpb != 0: @@ -417,11 +429,11 @@ def _compute_thread_per_block(self, kernel): # Else, ask the driver to give a good config else: ctx = get_context() - # Kernel is specialized, so there's only one definition - get it so - # we can get the cufunc from the code library - defn = next(iter(kernel.overloads.values())) + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( - func=defn._codelibrary.get_cufunc(), + func=kernel._codelibrary.get_cufunc(), b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, @@ -454,7 +466,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): +class CUDADispatcher(Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. 
When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -472,111 +484,27 @@ class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions): - self.py_func = py_func - self.sigs = [] - self.link = targetoptions.pop('link', (),) - self._can_compile = True + def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): + super().__init__(py_func, targetoptions=targetoptions, + pipeline_class=pipeline_class) self._type = self._numba_type_ - # The compiling counter is only used when compiling device functions as - # it is used to detect recursion - recursion is not possible when - # compiling a kernel. - self._compiling_counter = CompilingCounter() - - # Specializations for given sets of argument types - self.specializations = {} - - # A mapping of signatures to compile results - self.overloads = collections.OrderedDict() - - self.targetoptions = targetoptions - - # defensive copy - self.targetoptions['extensions'] = \ - list(self.targetoptions.get('extensions', [])) - - self.typingctx = self.targetdescr.typing_context - - self._tm = default_type_manager - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - argnames = tuple(pysig.parameters) - default_values = self.py_func.__defaults__ or () - defargs = tuple(OmittedArg(val) for val in default_values) - can_fallback = False # CUDA cannot fallback to object mode - - try: - lastarg = list(pysig.parameters.values())[-1] - except IndexError: - has_stararg = False - else: - has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL + # The following properties are for specialization of CUDADispatchers. A + # specialized CUDADispatcher is one that is compiled for exactly one + # set of argument types, and bypasses some argument type checking for + # faster kernel launches. - exact_match_required = False + # Is this a specialized dispatcher? 
+ self._specialized = False - _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(), - arg_count, self._fold_args, argnames, - defargs, can_fallback, has_stararg, - exact_match_required) - - if sigs: - if len(sigs) > 1: - raise TypeError("Only one signature supported at present") - if targetoptions.get('device'): - argtypes, restype = sigutils.normalize_signature(sigs[0]) - self.compile_device(argtypes) - else: - self.compile(sigs[0]) - - self._can_compile = False - - if targetoptions.get('device'): - self._register_device_function() - - def _register_device_function(self): - dispatcher = self - pyfunc = self.py_func - - class device_function_template(AbstractTemplate): - key = dispatcher - - def generic(self, args, kws): - assert not kws - return dispatcher.compile(args).signature - - def get_template_info(cls): - basepath = os.path.dirname( - os.path.dirname(os.path.dirname(cuda.__file__))) - code, firstlineno = inspect.getsourcelines(pyfunc) - path = inspect.getsourcefile(pyfunc) - sig = str(utils.pysignature(pyfunc)) - info = { - 'kind': "overload", - 'name': getattr(cls.key, '__name__', "unknown"), - 'sig': sig, - 'filename': utils.safe_relpath(path, start=basepath), - 'lines': (firstlineno, firstlineno + len(code) - 1), - 'docstring': pyfunc.__doc__ - } - return info - - from .descriptor import cuda_target - typingctx = cuda_target.typing_context - typingctx.insert_user_function(dispatcher, device_function_template) + # If we produced specialized dispatchers, we cache them for each set of + # argument types + self.specializations = {} @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) - @property - def is_compiling(self): - """ - Whether a specialization is currently being compiled. - """ - return self._compiling_counter - def configure(self, griddim, blockdim, stream=0, sharedmem=0): griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim) return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem) @@ -587,7 +515,7 @@ def __getitem__(self, args): return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): - """Returns a 1D-configured kernel for a given number of tasks. + """Returns a 1D-configured dispatcher for a given number of tasks. This assumes that: @@ -599,11 +527,12 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): :param ntasks: The number of tasks. :param tpb: The size of a block. An appropriate value is chosen if this parameter is not supplied. - :param stream: The stream on which the configured kernel will be + :param stream: The stream on which the configured dispatcher will be launched. :param sharedmem: The number of bytes of dynamic shared memory required by the kernel. 
- :return: A configured kernel, ready to launch on a set of arguments.""" + :return: A configured dispatcher, ready to launch on a set of + arguments.""" return ForAll(self, ntasks, tpb=tpb, stream=stream, sharedmem=sharedmem) @@ -649,17 +578,6 @@ def _compile_for_args(self, *args, **kws): argtypes = [self.typeof_pyval(a) for a in args] return self.compile(tuple(argtypes)) - def _search_new_conversions(self, *args, **kws): - # Based on _DispatcherBase._search_new_conversions - assert not kws - args = [self.typeof_pyval(a) for a in args] - found = False - for sig in self.nopython_signatures: - conv = self.typingctx.install_possible_conversions(args, sig.args) - if conv: - found = True - return found - def typeof_pyval(self, val): # Based on _DispatcherBase.typeof_pyval, but differs from it to support # the CUDA Array Interface. @@ -674,11 +592,6 @@ def typeof_pyval(self, val): else: raise - @property - def nopython_signatures(self): - # Based on _DispatcherBase.nopython_signatures - return [kernel.signature for kernel in self.overloads.values()] - def specialize(self, *args): ''' Create a new instance of this dispatcher specialized for the given @@ -695,21 +608,20 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - targetoptions['link'] = self.link - specialization = Dispatcher(self.py_func, [types.void(*argtypes)], - targetoptions) + specialization = CUDADispatcher(self.py_func, + targetoptions=targetoptions) + specialization.compile(argtypes) + specialization.disable_compile() + specialization._specialized = True self.specializations[cc, argtypes] = specialization return specialization - def disable_compile(self, val=True): - self._can_compile = not val - @property def specialized(self): """ True if the Dispatcher has been specialized. """ - return len(self.sigs) == 1 and not self._can_compile + return self._specialized def get_regs_per_thread(self, signature=None): ''' @@ -731,7 +643,10 @@ def get_regs_per_thread(self, signature=None): for sig, overload in self.overloads.items()} def get_call_template(self, args, kws): - # Copied and simplified from _DispatcherBase.get_call_template. + # Originally copied from _DispatcherBase.get_call_template. This + # version deviates slightly from the _DispatcherBase version in order + # to force casts when calling device functions. See e.g. + # TestDeviceFunc.test_device_casting, added in PR #7496. """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. @@ -755,14 +670,6 @@ def get_call_template(self, args, kws): return call_template, pysig, args, kws - def get_overload(self, sig): - # We give the id of the overload (a CompileResult) because this is used - # as a key into a dict of overloads, and this is the only small and - # unique property of a CompileResult on CUDA (c.f. the CPU target, - # which uses its entry_point, which is a pointer value). - args, return_type = sigutils.normalize_signature(sig) - return id(self.overloads[args]) - def compile_device(self, args): """Compile the device function for the given argument types. @@ -785,9 +692,8 @@ def compile_device(self, args): inline=inline, nvvm_options=nvvm_options) self.overloads[args] = cres - # The inserted function uses the id of the CompileResult as a key, - # consistent with get_overload() above. 
- cres.target_context.insert_user_function(id(cres), cres.fndesc, + cres.target_context.insert_user_function(cres.entry_point, + cres.fndesc, [cres.library]) else: cres = self.overloads[args] @@ -808,7 +714,7 @@ def compile(self, sig): if kernel is None: if not self._can_compile: raise RuntimeError("Compilation disabled") - kernel = _Kernel(self.py_func, argtypes, link=self.link, + kernel = _Kernel(self.py_func, argtypes, **self.targetoptions) # Inspired by _DispatcherBase.add_overload, but differs slightly # because we're inserting a _Kernel object instead of a compiled @@ -818,7 +724,6 @@ def compile(self, sig): self.overloads[argtypes] = kernel kernel.bind() - self.sigs.append(sig) return kernel def inspect_llvm(self, signature=None): @@ -911,11 +816,11 @@ def bind(self): defn.bind() @classmethod - def _rebuild(cls, py_func, sigs, targetoptions): + def _rebuild(cls, py_func, targetoptions): """ Rebuild an instance. """ - instance = cls(py_func, sigs, targetoptions) + instance = cls(py_func, targetoptions) return instance def _reduce_states(self): @@ -923,5 +828,5 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, sigs=self.sigs, + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba/cuda/initialize.py b/numba/cuda/initialize.py index c9d1bc13d6e..0c9343aaa29 100644 --- a/numba/cuda/initialize.py +++ b/numba/cuda/initialize.py @@ -3,7 +3,7 @@ def initialize_all(): import numba.cuda.models # noqa: F401 from numba import cuda - from numba.cuda.dispatcher import Dispatcher + from numba.cuda.dispatcher import CUDADispatcher from numba.core.target_extension import (target_registry, dispatcher_registry, jit_registry) @@ -14,4 +14,4 @@ def cuda_jit_device(*args, **kwargs): cuda_target = target_registry["cuda"] jit_registry[cuda_target] = cuda_jit_device - dispatcher_registry[cuda_target] = Dispatcher + dispatcher_registry[cuda_target] = CUDADispatcher diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 09f29e6bf69..98773eba54a 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -2,8 +2,8 @@ import llvmlite.binding as ll from llvmlite import ir -from numba.core import (typing, types, dispatcher, debuginfo, itanium_mangler, - cgutils) +from numba.core import typing, types, debuginfo, itanium_mangler, cgutils +from numba.core.dispatcher import Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -29,8 +29,10 @@ def load_additional_registries(self): self.install_registry(enumdecl.registry) def resolve_value_type(self, val): - # treat dispatcher object as another device function - if isinstance(val, dispatcher.Dispatcher): + # treat other dispatcher object as another device function + from numba.cuda.dispatcher import CUDADispatcher + if (isinstance(val, Dispatcher) and not + isinstance(val, CUDADispatcher)): try: # use cached device function val = val.__dispatcher @@ -42,9 +44,7 @@ def resolve_value_type(self, val): targetoptions['device'] = True targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) - sigs = None - from numba.cuda.dispatcher import Dispatcher - disp = Dispatcher(val, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__dispatcher = disp diff --git a/numba/cuda/tests/cudapy/test_dispatcher.py b/numba/cuda/tests/cudapy/test_dispatcher.py index a08cd4f474b..7090db221ac 100644 --- a/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba/cuda/tests/cudapy/test_dispatcher.py @@ -272,6 +272,21 @@ def pi_sin_array(x, n): self.assertIsInstance(regs_per_thread, int) self.assertGreater(regs_per_thread, 0) + def test_dispatcher_docstring(self): + # Ensure that CUDA-jitting a function preserves its docstring. See + # Issue #5902: https://github.com/numba/numba/issues/5902 + + @cuda.jit + def add_kernel(a, b): + """Add two integers, kernel version""" + + @cuda.jit(device=True) + def add_device(a, b): + """Add two integers, device version""" + + self.assertEqual("Add two integers, kernel version", add_kernel.__doc__) + self.assertEqual("Add two integers, device version", add_device.__doc__) + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index bbd8f5fd518..c20fb8dccdf 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -71,9 +71,7 @@ def kernel_func(): with self.assertRaises(TypingError) as raises: kernel_func[1, 1]() excstr = str(raises.exception) - self.assertIn("resolving callee type: " - "type(
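
Usage sketch: a minimal illustration of how the reworked dispatcher behaves from user code, assuming a CUDA-capable device is available. The kernel bodies, names, and array sizes below are arbitrary examples and not part of the patch; the behaviour shown (eager compilation with a signature yielding a specialized CUDADispatcher with further compilation disabled, and a lazy dispatcher launchable via forall()) follows the decorators.py and dispatcher.py changes above.

    import numpy as np
    from numba import cuda

    # Eager compilation: the decorator compiles immediately and marks the
    # dispatcher as specialized, so no further overloads will be compiled.
    @cuda.jit("void(float32[::1], float32)")
    def scale(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    assert scale.specialized

    # Lazy compilation: overloads are compiled on first launch, and forall()
    # chooses a launch configuration for a 1D task count.
    @cuda.jit
    def scale_lazy(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    x = np.ones(1024, dtype=np.float32)
    scale_lazy.forall(x.size)(x, np.float32(2.0))
    assert not scale_lazy.specialized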