diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 71476ab991f..4095b5b1767 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -56,7 +56,7 @@ This is similar to launch configuration in CUDA C/C++: Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: -.. autoclass:: numba.cuda.dispatcher.Dispatcher +.. autoclass:: numba.cuda.dispatcher.CUDADispatcher :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, get_regs_per_thread, specialize, specialized, extensions, forall diff --git a/numba/core/compiler.py b/numba/core/compiler.py index 0bf12b0dbf1..85a5fdc8265 100644 --- a/numba/core/compiler.py +++ b/numba/core/compiler.py @@ -269,20 +269,25 @@ def dump(self, tab=''): ]) -def compile_result(**kws): - keys = set(kws.keys()) +def sanitize_compile_result_entries(entries): + keys = set(entries.keys()) fieldset = set(CR_FIELDS) badnames = keys - fieldset if badnames: raise NameError(*badnames) missing = fieldset - keys for k in missing: - kws[k] = None + entries[k] = None # Avoid keeping alive traceback variables - err = kws['typing_error'] + err = entries['typing_error'] if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CompileResult(**kws) + entries['typing_error'] = err.with_traceback(None) + return entries + + +def compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CompileResult(**entries) def compile_isolated(func, args, return_type=None, flags=DEFAULT_FLAGS, diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 0002c6bea8b..bb7ea3e4f6b 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,8 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler -from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - compile_result, Flags, Option) +from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, + DefaultPassBuilder, Flags, Option, + CompileResult) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -29,6 +30,33 @@ class CUDAFlags(Flags): ) +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its +# id. This is because the entry point is used as a key into a dict of +# overloads by the base dispatcher. The id of the CCR is the only small and +# unique property of a CompileResult in the CUDA target (cf. the CPU target, +# which uses its entry_point, which is a pointer value). +# +# This does feel a little hackish, and there are two ways in which this could +# be improved: +# +# 1. We could change the core of Numba so that each CompileResult has its own +# unique ID that can be used as a key - e.g. a count, similar to the way in +# which types have unique counts. +# 2. At some future time when kernel launch uses a compiled function, the entry +# point will no longer need to be a synthetic value, but will instead be a +# pointer to the compiled function as in the CPU target. 
+ +class CUDACompileResult(CompileResult): + @property + def entry_point(self): + return id(self) + + +def cuda_compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CUDACompileResult(**entries) + + @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): @@ -44,7 +72,7 @@ def run_pass(self, state): lowered = state['cr'] signature = typing.signature(state.return_type, *state.args) - state.cr = compile_result( + state.cr = cuda_compile_result( typing_context=state.typingctx, target_context=state.targetctx, typing_error=state.status.fail_reason, diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 4a4bed69f83..ba901a7c344 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -2,7 +2,7 @@ from numba.core import types, config, sigutils from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning from numba.cuda.compiler import declare_device_function -from numba.cuda.dispatcher import Dispatcher +from numba.cuda.dispatcher import CUDADispatcher from numba.cuda.simulator.kernel import FakeCUDAKernel @@ -69,6 +69,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug fastmath = kws.get('fastmath', False) + extensions = kws.get('extensions', []) if debug and opt: msg = ("debug=True with opt=True (the default) " @@ -97,7 +98,19 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device - return Dispatcher(func, [func_or_sig], targetoptions=targetoptions) + targetoptions['extensions'] = extensions + + disp = CUDADispatcher(func, targetoptions=targetoptions) + + if device: + disp.compile_device(argtypes) + else: + disp.compile(argtypes) + + disp._specialized = True + disp.disable_compile() + + return disp return _jit else: @@ -124,9 +137,8 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device - sigs = None - return Dispatcher(func_or_sig, sigs, - targetoptions=targetoptions) + targetoptions['extensions'] = extensions + return CUDADispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index ad9d0d25223..e55c4bf173f 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,5 +1,3 @@ -import collections -import inspect import numpy as np import os import sys @@ -8,15 +6,13 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import CompilingCounter, OmittedArg +from numba.core.dispatcher import Dispatcher from numba.core.errors import NumbaPerformanceWarning -from numba.core.typeconv.rules import default_type_manager -from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import compile_cuda +from numba.cuda.compiler import compile_cuda, CUDACompiler from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.cudadrv.libs import get_cudalib @@ -47,6 +43,21 @@ def __init__(self, py_func, argtypes, link=None, debug=False, super().__init__() + # _DispatcherBase.nopython_signatures() expects this attribute to be + # present, because it assumes an overload 
is a CompileResult. In the + # CUDA target, _Kernel instances are stored instead, so we provide this + # attribute here to avoid duplicating nopython_signatures() in the CUDA + # target with slight modifications. + self.objectmode = False + + # The finalizer constructed by _DispatcherBase._make_finalizer also + # expects overloads to be a CompileResult. It uses the entry_point to + # remove a CompileResult from a target context. However, since we never + # insert kernels into a target context (there is no need because they + # cannot be called by other functions, only through the dispatcher) it + # suffices to pretend we have an entry point of None. + self.entry_point = None + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -386,11 +397,11 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): - def __init__(self, kernel, ntasks, tpb, stream, sharedmem): + def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: raise ValueError("Can't create ForAll with negative task count: %s" % ntasks) - self.kernel = kernel + self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb self.stream = stream @@ -400,16 +411,17 @@ def __call__(self, *args): if self.ntasks == 0: return - if self.kernel.specialized: - kernel = self.kernel + if self.dispatcher.specialized: + specialized = self.dispatcher else: - kernel = self.kernel.specialize(*args) - blockdim = self._compute_thread_per_block(kernel) + specialized = self.dispatcher.specialize(*args) + blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return kernel[griddim, blockdim, self.stream, self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, + self.sharedmem](*args) - def _compute_thread_per_block(self, kernel): + def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block # Prefer user-specified config if tpb != 0: @@ -417,11 +429,11 @@ def _compute_thread_per_block(self, kernel): # Else, ask the driver to give a good config else: ctx = get_context() - # Kernel is specialized, so there's only one definition - get it so - # we can get the cufunc from the code library - defn = next(iter(kernel.overloads.values())) + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( - func=defn._codelibrary.get_cufunc(), + func=kernel._codelibrary.get_cufunc(), b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, @@ -454,7 +466,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): +class CUDADispatcher(Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. 
When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -472,111 +484,27 @@ class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions): - self.py_func = py_func - self.sigs = [] - self.link = targetoptions.pop('link', (),) - self._can_compile = True + def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): + super().__init__(py_func, targetoptions=targetoptions, + pipeline_class=pipeline_class) self._type = self._numba_type_ - # The compiling counter is only used when compiling device functions as - # it is used to detect recursion - recursion is not possible when - # compiling a kernel. - self._compiling_counter = CompilingCounter() - - # Specializations for given sets of argument types - self.specializations = {} - - # A mapping of signatures to compile results - self.overloads = collections.OrderedDict() - - self.targetoptions = targetoptions - - # defensive copy - self.targetoptions['extensions'] = \ - list(self.targetoptions.get('extensions', [])) - - self.typingctx = self.targetdescr.typing_context - - self._tm = default_type_manager - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - argnames = tuple(pysig.parameters) - default_values = self.py_func.__defaults__ or () - defargs = tuple(OmittedArg(val) for val in default_values) - can_fallback = False # CUDA cannot fallback to object mode - - try: - lastarg = list(pysig.parameters.values())[-1] - except IndexError: - has_stararg = False - else: - has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL + # The following properties are for specialization of CUDADispatchers. A + # specialized CUDADispatcher is one that is compiled for exactly one + # set of argument types, and bypasses some argument type checking for + # faster kernel launches. - exact_match_required = False + # Is this a specialized dispatcher? 
+ self._specialized = False - _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(), - arg_count, self._fold_args, argnames, - defargs, can_fallback, has_stararg, - exact_match_required) - - if sigs: - if len(sigs) > 1: - raise TypeError("Only one signature supported at present") - if targetoptions.get('device'): - argtypes, restype = sigutils.normalize_signature(sigs[0]) - self.compile_device(argtypes) - else: - self.compile(sigs[0]) - - self._can_compile = False - - if targetoptions.get('device'): - self._register_device_function() - - def _register_device_function(self): - dispatcher = self - pyfunc = self.py_func - - class device_function_template(AbstractTemplate): - key = dispatcher - - def generic(self, args, kws): - assert not kws - return dispatcher.compile(args).signature - - def get_template_info(cls): - basepath = os.path.dirname( - os.path.dirname(os.path.dirname(cuda.__file__))) - code, firstlineno = inspect.getsourcelines(pyfunc) - path = inspect.getsourcefile(pyfunc) - sig = str(utils.pysignature(pyfunc)) - info = { - 'kind': "overload", - 'name': getattr(cls.key, '__name__', "unknown"), - 'sig': sig, - 'filename': utils.safe_relpath(path, start=basepath), - 'lines': (firstlineno, firstlineno + len(code) - 1), - 'docstring': pyfunc.__doc__ - } - return info - - from .descriptor import cuda_target - typingctx = cuda_target.typing_context - typingctx.insert_user_function(dispatcher, device_function_template) + # If we produced specialized dispatchers, we cache them for each set of + # argument types + self.specializations = {} @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) - @property - def is_compiling(self): - """ - Whether a specialization is currently being compiled. - """ - return self._compiling_counter - def configure(self, griddim, blockdim, stream=0, sharedmem=0): griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim) return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem) @@ -587,7 +515,7 @@ def __getitem__(self, args): return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): - """Returns a 1D-configured kernel for a given number of tasks. + """Returns a 1D-configured dispatcher for a given number of tasks. This assumes that: @@ -599,11 +527,12 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): :param ntasks: The number of tasks. :param tpb: The size of a block. An appropriate value is chosen if this parameter is not supplied. - :param stream: The stream on which the configured kernel will be + :param stream: The stream on which the configured dispatcher will be launched. :param sharedmem: The number of bytes of dynamic shared memory required by the kernel. 
- :return: A configured kernel, ready to launch on a set of arguments.""" + :return: A configured dispatcher, ready to launch on a set of + arguments.""" return ForAll(self, ntasks, tpb=tpb, stream=stream, sharedmem=sharedmem) @@ -649,17 +578,6 @@ def _compile_for_args(self, *args, **kws): argtypes = [self.typeof_pyval(a) for a in args] return self.compile(tuple(argtypes)) - def _search_new_conversions(self, *args, **kws): - # Based on _DispatcherBase._search_new_conversions - assert not kws - args = [self.typeof_pyval(a) for a in args] - found = False - for sig in self.nopython_signatures: - conv = self.typingctx.install_possible_conversions(args, sig.args) - if conv: - found = True - return found - def typeof_pyval(self, val): # Based on _DispatcherBase.typeof_pyval, but differs from it to support # the CUDA Array Interface. @@ -674,11 +592,6 @@ def typeof_pyval(self, val): else: raise - @property - def nopython_signatures(self): - # Based on _DispatcherBase.nopython_signatures - return [kernel.signature for kernel in self.overloads.values()] - def specialize(self, *args): ''' Create a new instance of this dispatcher specialized for the given @@ -695,21 +608,20 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - targetoptions['link'] = self.link - specialization = Dispatcher(self.py_func, [types.void(*argtypes)], - targetoptions) + specialization = CUDADispatcher(self.py_func, + targetoptions=targetoptions) + specialization.compile(argtypes) + specialization.disable_compile() + specialization._specialized = True self.specializations[cc, argtypes] = specialization return specialization - def disable_compile(self, val=True): - self._can_compile = not val - @property def specialized(self): """ True if the Dispatcher has been specialized. """ - return len(self.sigs) == 1 and not self._can_compile + return self._specialized def get_regs_per_thread(self, signature=None): ''' @@ -731,7 +643,10 @@ def get_regs_per_thread(self, signature=None): for sig, overload in self.overloads.items()} def get_call_template(self, args, kws): - # Copied and simplified from _DispatcherBase.get_call_template. + # Originally copied from _DispatcherBase.get_call_template. This + # version deviates slightly from the _DispatcherBase version in order + # to force casts when calling device functions. See e.g. + # TestDeviceFunc.test_device_casting, added in PR #7496. """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. @@ -755,14 +670,6 @@ def get_call_template(self, args, kws): return call_template, pysig, args, kws - def get_overload(self, sig): - # We give the id of the overload (a CompileResult) because this is used - # as a key into a dict of overloads, and this is the only small and - # unique property of a CompileResult on CUDA (c.f. the CPU target, - # which uses its entry_point, which is a pointer value). - args, return_type = sigutils.normalize_signature(sig) - return id(self.overloads[args]) - def compile_device(self, args): """Compile the device function for the given argument types. @@ -785,9 +692,8 @@ def compile_device(self, args): inline=inline, nvvm_options=nvvm_options) self.overloads[args] = cres - # The inserted function uses the id of the CompileResult as a key, - # consistent with get_overload() above. 
- cres.target_context.insert_user_function(id(cres), cres.fndesc, + cres.target_context.insert_user_function(cres.entry_point, + cres.fndesc, [cres.library]) else: cres = self.overloads[args] @@ -808,7 +714,7 @@ def compile(self, sig): if kernel is None: if not self._can_compile: raise RuntimeError("Compilation disabled") - kernel = _Kernel(self.py_func, argtypes, link=self.link, + kernel = _Kernel(self.py_func, argtypes, **self.targetoptions) # Inspired by _DispatcherBase.add_overload, but differs slightly # because we're inserting a _Kernel object instead of a compiled @@ -818,7 +724,6 @@ def compile(self, sig): self.overloads[argtypes] = kernel kernel.bind() - self.sigs.append(sig) return kernel def inspect_llvm(self, signature=None): @@ -911,11 +816,11 @@ def bind(self): defn.bind() @classmethod - def _rebuild(cls, py_func, sigs, targetoptions): + def _rebuild(cls, py_func, targetoptions): """ Rebuild an instance. """ - instance = cls(py_func, sigs, targetoptions) + instance = cls(py_func, targetoptions) return instance def _reduce_states(self): @@ -923,5 +828,5 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, sigs=self.sigs, + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba/cuda/initialize.py b/numba/cuda/initialize.py index c9d1bc13d6e..0c9343aaa29 100644 --- a/numba/cuda/initialize.py +++ b/numba/cuda/initialize.py @@ -3,7 +3,7 @@ def initialize_all(): import numba.cuda.models # noqa: F401 from numba import cuda - from numba.cuda.dispatcher import Dispatcher + from numba.cuda.dispatcher import CUDADispatcher from numba.core.target_extension import (target_registry, dispatcher_registry, jit_registry) @@ -14,4 +14,4 @@ def cuda_jit_device(*args, **kwargs): cuda_target = target_registry["cuda"] jit_registry[cuda_target] = cuda_jit_device - dispatcher_registry[cuda_target] = Dispatcher + dispatcher_registry[cuda_target] = CUDADispatcher diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 09f29e6bf69..98773eba54a 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -2,8 +2,8 @@ import llvmlite.binding as ll from llvmlite import ir -from numba.core import (typing, types, dispatcher, debuginfo, itanium_mangler, - cgutils) +from numba.core import typing, types, debuginfo, itanium_mangler, cgutils +from numba.core.dispatcher import Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -29,8 +29,10 @@ def load_additional_registries(self): self.install_registry(enumdecl.registry) def resolve_value_type(self, val): - # treat dispatcher object as another device function - if isinstance(val, dispatcher.Dispatcher): + # treat other dispatcher object as another device function + from numba.cuda.dispatcher import CUDADispatcher + if (isinstance(val, Dispatcher) and not + isinstance(val, CUDADispatcher)): try: # use cached device function val = val.__dispatcher @@ -42,9 +44,7 @@ def resolve_value_type(self, val): targetoptions['device'] = True targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) - sigs = None - from numba.cuda.dispatcher import Dispatcher - disp = Dispatcher(val, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__dispatcher = disp diff --git a/numba/cuda/tests/cudapy/test_dispatcher.py b/numba/cuda/tests/cudapy/test_dispatcher.py index a08cd4f474b..7090db221ac 100644 --- a/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba/cuda/tests/cudapy/test_dispatcher.py @@ -272,6 +272,21 @@ def pi_sin_array(x, n): self.assertIsInstance(regs_per_thread, int) self.assertGreater(regs_per_thread, 0) + def test_dispatcher_docstring(self): + # Ensure that CUDA-jitting a function preserves its docstring. See + # Issue #5902: https://github.com/numba/numba/issues/5902 + + @cuda.jit + def add_kernel(a, b): + """Add two integers, kernel version""" + + @cuda.jit(device=True) + def add_device(a, b): + """Add two integers, device version""" + + self.assertEqual("Add two integers, kernel version", add_kernel.__doc__) + self.assertEqual("Add two integers, device version", add_device.__doc__) + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index bbd8f5fd518..c20fb8dccdf 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -71,9 +71,7 @@ def kernel_func(): with self.assertRaises(TypingError) as raises: kernel_func[1, 1]() excstr = str(raises.exception) - self.assertIn("resolving callee type: " - "type(
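
Usage sketch: a minimal illustration of how the reworked dispatcher behaves from user code, assuming a CUDA-capable device is available. The kernel bodies, names, and array sizes below are arbitrary examples and not part of the patch; the behaviour shown (eager compilation with a signature yielding a specialized CUDADispatcher with further compilation disabled, and a lazy dispatcher launchable via forall()) follows the decorators.py and dispatcher.py changes above.

    import numpy as np
    from numba import cuda

    # Eager compilation: the decorator compiles immediately and marks the
    # dispatcher as specialized, so no further overloads will be compiled.
    @cuda.jit("void(float32[::1], float32)")
    def scale(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    assert scale.specialized

    # Lazy compilation: overloads are compiled on first launch, and forall()
    # chooses a launch configuration for a 1D task count.
    @cuda.jit
    def scale_lazy(x, factor):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= factor

    x = np.ones(1024, dtype=np.float32)
    scale_lazy.forall(x.size)(x, np.float32(2.0))
    assert not scale_lazy.specialized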