From d7f58eddf8c78951b85bb036a441a3986df8042c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 12:18:04 +0000 Subject: [PATCH 01/21] CUDA Dispatcher: inherit from _DispatcherBase Current test status: ``` Ran 1266 tests in 102.919s FAILED (failures=1, errors=5, skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 64 +++++++++++++++------------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 94cc817f7fb..b76d6bae01b 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,4 +1,3 @@ -import collections import inspect import numpy as np import os @@ -8,9 +7,8 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import CompilingCounter, OmittedArg +from numba.core.dispatcher import _DispatcherBase from numba.core.errors import NumbaPerformanceWarning -from numba.core.typeconv.rules import default_type_manager from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof @@ -446,7 +444,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): +class Dispatcher(_DispatcherBase, serialize.ReduceMixin): ''' CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -465,54 +463,33 @@ class Dispatcher(_dispatcher.Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, sigs, targetoptions): - self.py_func = py_func + self.typingctx = self.targetdescr.typing_context + self.targetctx = self.targetdescr.target_context + + pysig = utils.pysignature(py_func) + arg_count = len(pysig.parameters) + can_fallback = False # CUDA cannot fallback to object mode + + _DispatcherBase.__init__(self, arg_count, py_func, pysig, can_fallback, + exact_match_required=False) + + # TODO: Check if this fixes the cuda docstring jit issue + functools.update_wrapper(self, py_func) + + self.targetoptions = targetoptions + self.sigs = [] self.link = targetoptions.pop('link', (),) self._can_compile = True self._type = self._numba_type_ - # The compiling counter is only used when compiling device functions as - # it is used to detect recursion - recursion is not possible when - # compiling a kernel. 
- self._compiling_counter = CompilingCounter() - # Specializations for given sets of argument types self.specializations = {} - # A mapping of signatures to compile results - self.overloads = collections.OrderedDict() - - self.targetoptions = targetoptions - # defensive copy self.targetoptions['extensions'] = \ list(self.targetoptions.get('extensions', [])) - self.typingctx = self.targetdescr.typing_context - - self._tm = default_type_manager - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - argnames = tuple(pysig.parameters) - default_values = self.py_func.__defaults__ or () - defargs = tuple(OmittedArg(val) for val in default_values) - can_fallback = False # CUDA cannot fallback to object mode - - try: - lastarg = list(pysig.parameters.values())[-1] - except IndexError: - has_stararg = False - else: - has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL - - exact_match_required = False - - _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(), - arg_count, self._fold_args, argnames, - defargs, can_fallback, has_stararg, - exact_match_required) - if sigs: if len(sigs) > 1: raise TypeError("Only one signature supported at present") @@ -527,6 +504,13 @@ def __init__(self, py_func, sigs, targetoptions): if targetoptions.get('device'): self._register_device_function() + def _make_finalizer(self): + # Dummy finalizer whilst _DispatcherBase assumes the existence of a + # finalizer + def finalizer(): + pass + return finalizer + def _register_device_function(self): dispatcher = self pyfunc = self.py_func From 31c1a6436a0a0c00507c9481143e63cb463aa7ba Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 13:39:20 +0000 Subject: [PATCH 02/21] [WIP] Fix CUDA target tests --- numba/cuda/compiler.py | 25 ++++++++++++++++++++++++- numba/cuda/dispatcher.py | 20 ++++++++++++-------- numba/cuda/target.py | 2 +- numba/cuda/tests/cudapy/test_errors.py | 2 +- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 0002c6bea8b..deb154efb92 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,7 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - compile_result, Flags, Option) + Flags, Option, CompileResult, CR_FIELDS) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -29,6 +29,28 @@ class CUDAFlags(Flags): ) +class CUDACompileResult(CompileResult): + @property + def entry_point(self): + return id(self) + + +def compile_result(**kws): + keys = set(kws.keys()) + fieldset = set(CR_FIELDS) + badnames = keys - fieldset + if badnames: + raise NameError(*badnames) + missing = fieldset - keys + for k in missing: + kws[k] = None + # Avoid keeping alive traceback variables + err = kws['typing_error'] + if err is not None: + kws['typing_error'] = err.with_traceback(None) + return CUDACompileResult(**kws) + + @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): @@ -54,6 +76,7 @@ def run_pass(self, state): signature=signature, fndesc=lowered.fndesc, ) + return True diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index b76d6bae01b..0c03a2fc05e 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -731,13 +731,16 @@ def get_call_template(self, args, kws): return call_template, 
pysig, args, kws
 
-    def get_overload(self, sig):
-        # We give the id of the overload (a CompileResult) because this is used
-        # as a key into a dict of overloads, and this is the only small and
-        # unique property of a CompileResult on CUDA (c.f. the CPU target,
-        # which uses its entry_point, which is a pointer value).
-        args, return_type = sigutils.normalize_signature(sig)
-        return id(self.overloads[args])
+#    def get_overload(self, sig):
+#        # We give the id of the overload (a CompileResult) because this is used
+#        # as a key into a dict of overloads, and this is the only small and
+#        # unique property of a CompileResult on CUDA (c.f. the CPU target,
+#        # which uses its entry_point, which is a pointer value).
+#        args, return_type = sigutils.normalize_signature(sig)
+#        return id(self.overloads[args])
+
+    def __repr__(self):
+        return f"numba.cuda.dispatcher.Dispatcher({self.py_func})"
 
     def compile_device(self, args):
         """Compile the device function for the given argument types.
@@ -763,7 +766,8 @@ def compile_device(self, args):
 
             # The inserted function uses the id of the CompileResult as a key,
             # consistent with get_overload() above.
-            cres.target_context.insert_user_function(id(cres), cres.fndesc,
+            cres.target_context.insert_user_function(cres.entry_point,
+                                                     cres.fndesc,
                                                      [cres.library])
         else:
             cres = self.overloads[args]
diff --git a/numba/cuda/target.py b/numba/cuda/target.py
index 3d63dde49f0..59cf27eb4b8 100644
--- a/numba/cuda/target.py
+++ b/numba/cuda/target.py
@@ -42,7 +42,7 @@ def resolve_value_type(self, val):
         targetoptions['opt'] = targetoptions.get('opt', True)
         sigs = None
         from numba.cuda.dispatcher import Dispatcher
-        disp = Dispatcher(val, sigs, targetoptions)
+        disp = Dispatcher(val.py_func, sigs, targetoptions)
         # cache the device function for future use and to avoid
         # duplicated copy of the same function.
         val.__dispatcher = disp
diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py
index bbd8f5fd518..e13197f7d8f 100644
--- a/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba/cuda/tests/cudapy/test_errors.py
@@ -72,7 +72,7 @@ def kernel_func():
         kernel_func[1, 1]()
         excstr = str(raises.exception)
         self.assertIn("resolving callee type: "
-                      "type(<numba.cuda.dispatcher.Dispatcher object",
+                      "type(numba.cuda.dispatcher.Dispatcher",
                       excstr)
         self.assertIn("NameError: name 'floor' is not defined", excstr)
 

From: Graham Markall
Date: Fri, 4 Feb 2022 15:31:32 +0000
Subject: [PATCH 03/21] Some tidy-up

---
 numba/cuda/compiler.py   |  6 ++++++
 numba/cuda/dispatcher.py | 11 +----------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py
index deb154efb92..31dd4fb02c2 100644
--- a/numba/cuda/compiler.py
+++ b/numba/cuda/compiler.py
@@ -29,6 +29,12 @@ class CUDAFlags(Flags):
     )
 
 
+# FIXME: Update this comment
+# We give the id of the overload (a CompileResult) because this is used
+# as a key into a dict of overloads, and this is the only small and
+# unique property of a CompileResult on CUDA (c.f. 
the CPU target, -# # which uses its entry_point, which is a pointer value). -# args, return_type = sigutils.normalize_signature(sig) -# return id(self.overloads[args]) - + # XXX: Delete this and call the class CUDADispatcher def __repr__(self): return f"numba.cuda.dispatcher.Dispatcher({self.py_func})" @@ -764,8 +757,6 @@ def compile_device(self, args): inline=inline, nvvm_options=nvvm_options) self.overloads[args] = cres - # The inserted function uses the id of the CompileResult as a key, - # consistent with get_overload() above. cres.target_context.insert_user_function(cres.entry_point, cres.fndesc, [cres.library]) From 70dbacd9dbffebaa35bc7285388a2f9c4c3cd72c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:07:40 +0000 Subject: [PATCH 04/21] [WIP] CUDA Dispatcher inherits from uber_Dispatcher Test results: ``` Ran 1266 tests in 127.002s FAILED (failures=66, errors=17, skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 28 ++++++++++------------------ numba/cuda/target.py | 13 +++++++------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 4c3d18435c8..8b7d435d5cc 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -7,14 +7,14 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import _DispatcherBase +from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.errors import NumbaPerformanceWarning from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import compile_cuda +from numba.cuda.compiler import compile_cuda, CUDACompiler from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.cudadrv.libs import get_cudalib @@ -444,7 +444,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(_DispatcherBase, serialize.ReduceMixin): +class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. 
When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -462,21 +462,13 @@ class Dispatcher(_DispatcherBase, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions): - self.typingctx = self.targetdescr.typing_context - self.targetctx = self.targetdescr.target_context - - pysig = utils.pysignature(py_func) - arg_count = len(pysig.parameters) - can_fallback = False # CUDA cannot fallback to object mode - - _DispatcherBase.__init__(self, arg_count, py_func, pysig, can_fallback, - exact_match_required=False) - + def __init__(self, py_func, sigs, targetoptions, + pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue - functools.update_wrapper(self, py_func) - self.targetoptions = targetoptions + super().__init__(py_func, pipeline_class=pipeline_class) + + # CUDA-specific stuff - hopefully some of it can be removed ASAP self.sigs = [] self.link = targetoptions.pop('link', (),) @@ -501,8 +493,8 @@ def __init__(self, py_func, sigs, targetoptions): self._can_compile = False - if targetoptions.get('device'): - self._register_device_function() + #if targetoptions.get('device'): + # self._register_device_function() def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 59cf27eb4b8..17b990c6816 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -2,8 +2,8 @@ import llvmlite.binding as ll from llvmlite import ir -from numba.core import (typing, types, dispatcher, debuginfo, itanium_mangler, - cgutils) +from numba.core import typing, types, debuginfo, itanium_mangler, cgutils +from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -27,8 +27,10 @@ def load_additional_registries(self): self.install_registry(libdevicedecl.registry) def resolve_value_type(self, val): - # treat dispatcher object as another device function - if isinstance(val, dispatcher.Dispatcher): + # treat other dispatcher object as another device function + from numba.cuda.dispatcher import Dispatcher as CUDADispatcher + if (isinstance(val, uber_Dispatcher) and not + isinstance(val, CUDADispatcher)): try: # use cached device function val = val.__dispatcher @@ -41,8 +43,7 @@ def resolve_value_type(self, val): targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) sigs = None - from numba.cuda.dispatcher import Dispatcher - disp = Dispatcher(val.py_func, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, sigs, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__dispatcher = disp From 922624fc2fc7c39a285fee34bbfef879c435ad04 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:15:16 +0000 Subject: [PATCH 05/21] Wire in target options in CUDA dispatcher Test results now: ``` Ran 1266 tests in 105.203s OK (skipped=12, expected failures=7) ``` --- numba/cuda/dispatcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 8b7d435d5cc..5f99680b7a5 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -466,7 +466,8 @@ def __init__(self, py_func, sigs, targetoptions, pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue - super().__init__(py_func, pipeline_class=pipeline_class) + super().__init__(py_func, targetoptions=targetoptions, + pipeline_class=pipeline_class) # CUDA-specific stuff - hopefully some of it can be removed ASAP From ed2e17a67660b2eca60de83ad05be60828be911f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 16:33:51 +0000 Subject: [PATCH 06/21] Delete some dead code --- numba/cuda/dispatcher.py | 43 ---------------------------------------- 1 file changed, 43 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 5f99680b7a5..aba0a34b21f 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -1,4 +1,3 @@ -import inspect import numpy as np import os import sys @@ -9,7 +8,6 @@ from numba.core.compiler_lock import global_compiler_lock from numba.core.dispatcher import Dispatcher as uber_Dispatcher from numba.core.errors import NumbaPerformanceWarning -from numba.core.typing.templates import AbstractTemplate from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device @@ -494,9 +492,6 @@ def __init__(self, py_func, sigs, targetoptions, self._can_compile = False - #if targetoptions.get('device'): - # self._register_device_function() - def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a # finalizer @@ -504,48 +499,10 @@ def finalizer(): pass return finalizer - def _register_device_function(self): - dispatcher = self - pyfunc = self.py_func - - class device_function_template(AbstractTemplate): - key = dispatcher - - def generic(self, args, kws): - assert not kws - return dispatcher.compile(args).signature - - def get_template_info(cls): - basepath = os.path.dirname( - os.path.dirname(os.path.dirname(cuda.__file__))) - code, firstlineno = inspect.getsourcelines(pyfunc) - path = inspect.getsourcefile(pyfunc) - sig = str(utils.pysignature(pyfunc)) - info = { - 'kind': "overload", - 'name': getattr(cls.key, '__name__', "unknown"), - 'sig': sig, - 'filename': utils.safe_relpath(path, start=basepath), - 'lines': (firstlineno, firstlineno + len(code) - 1), - 'docstring': pyfunc.__doc__ - } - return info - - from .descriptor import cuda_target - typingctx = cuda_target.typing_context - typingctx.insert_user_function(dispatcher, device_function_template) - @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) - @property - def is_compiling(self): - """ - Whether a specialization is currently being compiled. 
- """ - return self._compiling_counter - def configure(self, griddim, blockdim, stream=0, sharedmem=0): griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim) return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem) From 8238610e1edd886ed2e57d0df438cb03479991f1 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:07:10 +0000 Subject: [PATCH 07/21] Remove sigs from cuda dispatcher --- numba/cuda/decorators.py | 15 +++++++++++---- numba/cuda/dispatcher.py | 40 ++++++++++++++++++++-------------------- numba/cuda/target.py | 3 +-- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 4a4bed69f83..d49826542e0 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -97,7 +97,16 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device - return Dispatcher(func, [func_or_sig], targetoptions=targetoptions) + disp = Dispatcher(func, targetoptions=targetoptions) + if device: + disp.compile_device(argtypes) + disp._specialized = True + else: + disp.compile(argtypes) + disp._specialized = True + disp.disable_compile() + + return disp return _jit else: @@ -124,9 +133,7 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device - sigs = None - return Dispatcher(func_or_sig, sigs, - targetoptions=targetoptions) + return Dispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index aba0a34b21f..3211d020fff 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -460,8 +460,7 @@ class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target - def __init__(self, py_func, sigs, targetoptions, - pipeline_class=CUDACompiler): + def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # TODO: Check if this fixes the cuda docstring jit issue super().__init__(py_func, targetoptions=targetoptions, @@ -469,9 +468,9 @@ def __init__(self, py_func, sigs, targetoptions, # CUDA-specific stuff - hopefully some of it can be removed ASAP - self.sigs = [] + self._specialized = False self.link = targetoptions.pop('link', (),) - self._can_compile = True + #self._can_compile = True self._type = self._numba_type_ # Specializations for given sets of argument types @@ -481,16 +480,16 @@ def __init__(self, py_func, sigs, targetoptions, self.targetoptions['extensions'] = \ list(self.targetoptions.get('extensions', [])) - if sigs: - if len(sigs) > 1: - raise TypeError("Only one signature supported at present") - if targetoptions.get('device'): - argtypes, restype = sigutils.normalize_signature(sigs[0]) - self.compile_device(argtypes) - else: - self.compile(sigs[0]) + #if sigs: + # if len(sigs) > 1: + # raise TypeError("Only one signature supported at present") + # if targetoptions.get('device'): + # argtypes, restype = sigutils.normalize_signature(sigs[0]) + # self.compile_device(argtypes) + # else: + # self.compile(sigs[0]) - self._can_compile = False + # self._can_compile = False def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a @@ -622,8 +621,10 @@ def specialize(self, *args): targetoptions = self.targetoptions targetoptions['link'] = self.link - specialization = Dispatcher(self.py_func, [types.void(*argtypes)], - targetoptions) + specialization = Dispatcher(self.py_func, 
targetoptions=targetoptions) + specialization.compile(argtypes) + specialization.disable_compile() + specialization._specialized = True self.specializations[cc, argtypes] = specialization return specialization @@ -635,7 +636,7 @@ def specialized(self): """ True if the Dispatcher has been specialized. """ - return len(self.sigs) == 1 and not self._can_compile + return self._specialized def get_regs_per_thread(self, signature=None): ''' @@ -739,7 +740,6 @@ def compile(self, sig): self.overloads[argtypes] = kernel kernel.bind() - self.sigs.append(sig) return kernel def inspect_llvm(self, signature=None): @@ -832,11 +832,11 @@ def bind(self): defn.bind() @classmethod - def _rebuild(cls, py_func, sigs, targetoptions): + def _rebuild(cls, py_func, targetoptions): """ Rebuild an instance. """ - instance = cls(py_func, sigs, targetoptions) + instance = cls(py_func, targetoptions) return instance def _reduce_states(self): @@ -844,5 +844,5 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, sigs=self.sigs, + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 17b990c6816..c7a4e4baf11 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -42,8 +42,7 @@ def resolve_value_type(self, val): targetoptions['device'] = True targetoptions['debug'] = targetoptions.get('debug', False) targetoptions['opt'] = targetoptions.get('opt', True) - sigs = None - disp = CUDADispatcher(val.py_func, sigs, targetoptions) + disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. val.__dispatcher = disp From 53e98fb828355eb66460932546dfda3bbf59a637 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:30:45 +0000 Subject: [PATCH 08/21] Some refactoring --- numba/cuda/decorators.py | 5 +++++ numba/cuda/dispatcher.py | 20 +------------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index d49826542e0..43290c89fea 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -69,6 +69,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug fastmath = kws.get('fastmath', False) + extensions = kws.get('extensions', []) if debug and opt: msg = ("debug=True with opt=True (the default) " @@ -97,7 +98,10 @@ def _jit(func): targetoptions['opt'] = opt targetoptions['fastmath'] = fastmath targetoptions['device'] = device + targetoptions['extensions'] = extensions disp = Dispatcher(func, targetoptions=targetoptions) + # TODO: Support multiple signatures by compiling in a loop over + # signatures. 
if device: disp.compile_device(argtypes) disp._specialized = True @@ -133,6 +137,7 @@ def autojitwrapper(func): targetoptions['link'] = link targetoptions['fastmath'] = fastmath targetoptions['device'] = device + targetoptions['extensions'] = extensions return Dispatcher(func_or_sig, targetoptions=targetoptions) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 3211d020fff..bc7d68119bc 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -469,28 +469,11 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # CUDA-specific stuff - hopefully some of it can be removed ASAP self._specialized = False - self.link = targetoptions.pop('link', (),) - #self._can_compile = True self._type = self._numba_type_ # Specializations for given sets of argument types self.specializations = {} - # defensive copy - self.targetoptions['extensions'] = \ - list(self.targetoptions.get('extensions', [])) - - #if sigs: - # if len(sigs) > 1: - # raise TypeError("Only one signature supported at present") - # if targetoptions.get('device'): - # argtypes, restype = sigutils.normalize_signature(sigs[0]) - # self.compile_device(argtypes) - # else: - # self.compile(sigs[0]) - - # self._can_compile = False - def _make_finalizer(self): # Dummy finalizer whilst _DispatcherBase assumes the existence of a # finalizer @@ -620,7 +603,6 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - targetoptions['link'] = self.link specialization = Dispatcher(self.py_func, targetoptions=targetoptions) specialization.compile(argtypes) specialization.disable_compile() @@ -730,7 +712,7 @@ def compile(self, sig): if kernel is None: if not self._can_compile: raise RuntimeError("Compilation disabled") - kernel = _Kernel(self.py_func, argtypes, link=self.link, + kernel = _Kernel(self.py_func, argtypes, **self.targetoptions) # Inspired by _DispatcherBase.add_overload, but differs slightly # because we're inserting a _Kernel object instead of a compiled From e307e9a732013161b91dddeed8cbadd744619d2b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:41:16 +0000 Subject: [PATCH 09/21] Some renaming to closer align ForAll with reality --- numba/cuda/dispatcher.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index bc7d68119bc..dd0cf9d94ed 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -374,11 +374,11 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): - def __init__(self, kernel, ntasks, tpb, stream, sharedmem): + def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: raise ValueError("Can't create ForAll with negative task count: %s" % ntasks) - self.kernel = kernel + self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb self.stream = stream @@ -388,16 +388,17 @@ def __call__(self, *args): if self.ntasks == 0: return - if self.kernel.specialized: - kernel = self.kernel + if self.dispatcher.specialized: + specialized = self.dispatcher else: - kernel = self.kernel.specialize(*args) - blockdim = self._compute_thread_per_block(kernel) + specialized = self.dispatcher.specialize(*args) + blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return kernel[griddim, blockdim, self.stream, self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, + 
self.sharedmem](*args) - def _compute_thread_per_block(self, kernel): + def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block # Prefer user-specified config if tpb != 0: @@ -405,11 +406,11 @@ def _compute_thread_per_block(self, kernel): # Else, ask the driver to give a good config else: ctx = get_context() - # Kernel is specialized, so there's only one definition - get it so - # we can get the cufunc from the code library - defn = next(iter(kernel.overloads.values())) + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( - func=defn._codelibrary.get_cufunc(), + func=kernel._codelibrary.get_cufunc(), b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, @@ -495,7 +496,7 @@ def __getitem__(self, args): return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): - """Returns a 1D-configured kernel for a given number of tasks. + """Returns a 1D-configured dispatcher for a given number of tasks. This assumes that: @@ -507,11 +508,12 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): :param ntasks: The number of tasks. :param tpb: The size of a block. An appropriate value is chosen if this parameter is not supplied. - :param stream: The stream on which the configured kernel will be + :param stream: The stream on which the configured dispatcher will be launched. :param sharedmem: The number of bytes of dynamic shared memory required by the kernel. - :return: A configured kernel, ready to launch on a set of arguments.""" + :return: A configured dispatcher, ready to launch on a set of + arguments.""" return ForAll(self, ntasks, tpb=tpb, stream=stream, sharedmem=sharedmem) From 703d22255ebc9a15ba5e79b9819bd66da67b6481 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:47:39 +0000 Subject: [PATCH 10/21] Delete _search_new_conversions and nopython_signatures from CUDA dispatcher --- numba/cuda/dispatcher.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index dd0cf9d94ed..b74a430d560 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -43,6 +43,10 @@ def __init__(self, py_func, argtypes, link=None, debug=False, super().__init__() + # Emulate a CompileResult so that _DispatcherBase.nopython_signatures + # can be used as-is + self.objectmode = False + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -559,17 +563,6 @@ def _compile_for_args(self, *args, **kws): argtypes = [self.typeof_pyval(a) for a in args] return self.compile(tuple(argtypes)) - def _search_new_conversions(self, *args, **kws): - # Based on _DispatcherBase._search_new_conversions - assert not kws - args = [self.typeof_pyval(a) for a in args] - found = False - for sig in self.nopython_signatures: - conv = self.typingctx.install_possible_conversions(args, sig.args) - if conv: - found = True - return found - def typeof_pyval(self, val): # Based on _DispatcherBase.typeof_pyval, but differs from it to support # the CUDA Array Interface. 
@@ -584,11 +577,6 @@ def typeof_pyval(self, val): else: raise - @property - def nopython_signatures(self): - # Based on _DispatcherBase.nopython_signatures - return [kernel.signature for kernel in self.overloads.values()] - def specialize(self, *args): ''' Create a new instance of this dispatcher specialized for the given From 13aa50bdb3d7c56d1c41213028bb14dbd03ab553 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 17:53:31 +0000 Subject: [PATCH 11/21] Remove CUDA Dispatchers disable_compile and add a note to get_call_template --- numba/cuda/dispatcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index b74a430d560..fbac4426f02 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -600,9 +600,6 @@ def specialize(self, *args): self.specializations[cc, argtypes] = specialization return specialization - def disable_compile(self, val=True): - self._can_compile = not val - @property def specialized(self): """ @@ -631,6 +628,9 @@ def get_regs_per_thread(self, signature=None): def get_call_template(self, args, kws): # Copied and simplified from _DispatcherBase.get_call_template. + # + # This seems to have some necessary differences to the _DispatcherBase + # version to force casts where necessay? XXX """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. From a11269ea19598edfb364ad4f0b6c2a98858d3380 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 21:57:05 +0000 Subject: [PATCH 12/21] Clarify comment on CUDA Compile Result --- numba/cuda/compiler.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 31dd4fb02c2..dc24d2a8919 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -29,11 +29,21 @@ class CUDAFlags(Flags): ) -# FIXME: Update this comment -# We give the id of the overload (a CompileResult) because this is used -# as a key into a dict of overloads, and this is the only small and -# unique property of a CompileResult on CUDA (c.f. the CPU target, -# which uses its entry_point, which is a pointer value). +# The CUDACompileResult has a specially-defined entry point equal to its id. +# This is because the entry point is used as a key into a dict of overloads by +# the base dispatcher. The id of the CCR is the only small and unique property +# of a CompileResult in the CUDA target (c.f. the CPU target, which uses its +# entry_point, which is a pointer value). +# +# This does feel a little hackish, and there are two ways in which this could +# be improved: +# +# 1. We could change the core of Numba so that each CompileResult has its own +# unique ID that can be used as a key - e.g. a count, similar to the way in +# which types have unique counts. +# 2. At some future time when kernel launch uses a compiled function, the entry +# point will no longer need to be a synthetic value, but will instead be a +# pointer to the compiled function as in the CPU target. 
class CUDACompileResult(CompileResult): @property From 18c553f3b1aa6e9f05a8efd68ca7505cdfc96639 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:08:08 +0000 Subject: [PATCH 13/21] Remove duplication in compile_result functions --- numba/core/compiler.py | 17 +++++++++++------ numba/cuda/compiler.py | 25 +++++++------------------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/numba/core/compiler.py b/numba/core/compiler.py index 0bf12b0dbf1..85a5fdc8265 100644 --- a/numba/core/compiler.py +++ b/numba/core/compiler.py @@ -269,20 +269,25 @@ def dump(self, tab=''): ]) -def compile_result(**kws): - keys = set(kws.keys()) +def sanitize_compile_result_entries(entries): + keys = set(entries.keys()) fieldset = set(CR_FIELDS) badnames = keys - fieldset if badnames: raise NameError(*badnames) missing = fieldset - keys for k in missing: - kws[k] = None + entries[k] = None # Avoid keeping alive traceback variables - err = kws['typing_error'] + err = entries['typing_error'] if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CompileResult(**kws) + entries['typing_error'] = err.with_traceback(None) + return entries + + +def compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CompileResult(**entries) def compile_isolated(func, args, return_type=None, flags=DEFAULT_FLAGS, diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index dc24d2a8919..cb2cfcb4f21 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,8 @@ from numba.core.typing.templates import ConcreteTemplate from numba.core import types, typing, funcdesc, config, compiler -from numba.core.compiler import (CompilerBase, DefaultPassBuilder, - Flags, Option, CompileResult, CR_FIELDS) +from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, + DefaultPassBuilder, Flags, Option, + CompileResult) from numba.core.compiler_lock import global_compiler_lock from numba.core.compiler_machinery import (LoweringPass, AnalysisPass, PassManager, register_pass) @@ -51,20 +52,9 @@ def entry_point(self): return id(self) -def compile_result(**kws): - keys = set(kws.keys()) - fieldset = set(CR_FIELDS) - badnames = keys - fieldset - if badnames: - raise NameError(*badnames) - missing = fieldset - keys - for k in missing: - kws[k] = None - # Avoid keeping alive traceback variables - err = kws['typing_error'] - if err is not None: - kws['typing_error'] = err.with_traceback(None) - return CUDACompileResult(**kws) +def cuda_compile_result(**entries): + entries = sanitize_compile_result_entries(entries) + return CUDACompileResult(**entries) @register_pass(mutates_CFG=True, analysis_only=False) @@ -82,7 +72,7 @@ def run_pass(self, state): lowered = state['cr'] signature = typing.signature(state.return_type, *state.args) - state.cr = compile_result( + state.cr = cuda_compile_result( typing_context=state.typingctx, target_context=state.targetctx, typing_error=state.status.fail_reason, @@ -92,7 +82,6 @@ def run_pass(self, state): signature=signature, fndesc=lowered.fndesc, ) - return True From 3ebcf2637744f2ef8a6a7c1bc7273ebb2a051b77 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:12:32 +0000 Subject: [PATCH 14/21] Refactor / tidy up decorators.py --- numba/cuda/decorators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 43290c89fea..ca25767230e 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ 
-99,15 +99,15 @@ def _jit(func): targetoptions['fastmath'] = fastmath targetoptions['device'] = device targetoptions['extensions'] = extensions + disp = Dispatcher(func, targetoptions=targetoptions) - # TODO: Support multiple signatures by compiling in a loop over - # signatures. + if device: disp.compile_device(argtypes) - disp._specialized = True else: disp.compile(argtypes) - disp._specialized = True + + disp._specialized = True disp.disable_compile() return disp From 4f5b36152deeb27853346bbe3019f0fe0d274442 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:29:56 +0000 Subject: [PATCH 15/21] Rename CUDA dispatcher to CUDADispatcher Also update some comments. --- numba/cuda/decorators.py | 6 +++--- numba/cuda/dispatcher.py | 19 ++++++++----------- numba/cuda/initialize.py | 4 ++-- numba/cuda/target.py | 6 +++--- numba/cuda/tests/cudapy/test_errors.py | 4 +--- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index ca25767230e..ba901a7c344 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -2,7 +2,7 @@ from numba.core import types, config, sigutils from numba.core.errors import DeprecationError, NumbaInvalidConfigWarning from numba.cuda.compiler import declare_device_function -from numba.cuda.dispatcher import Dispatcher +from numba.cuda.dispatcher import CUDADispatcher from numba.cuda.simulator.kernel import FakeCUDAKernel @@ -100,7 +100,7 @@ def _jit(func): targetoptions['device'] = device targetoptions['extensions'] = extensions - disp = Dispatcher(func, targetoptions=targetoptions) + disp = CUDADispatcher(func, targetoptions=targetoptions) if device: disp.compile_device(argtypes) @@ -138,7 +138,7 @@ def autojitwrapper(func): targetoptions['fastmath'] = fastmath targetoptions['device'] = device targetoptions['extensions'] = extensions - return Dispatcher(func_or_sig, targetoptions=targetoptions) + return CUDADispatcher(func_or_sig, targetoptions=targetoptions) def declare_device(name, sig): diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index fbac4426f02..0f041ec5529 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -6,7 +6,7 @@ from numba.core import config, serialize, sigutils, types, typing, utils from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import Dispatcher as uber_Dispatcher +from numba.core.dispatcher import Dispatcher from numba.core.errors import NumbaPerformanceWarning from numba.core.typing.typeof import Purpose, typeof @@ -447,7 +447,7 @@ def __call__(self, *args): self.stream, self.sharedmem) -class Dispatcher(uber_Dispatcher, serialize.ReduceMixin): +class CUDADispatcher(Dispatcher, serialize.ReduceMixin): ''' CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized @@ -593,7 +593,8 @@ def specialize(self, *args): return specialization targetoptions = self.targetoptions - specialization = Dispatcher(self.py_func, targetoptions=targetoptions) + specialization = CUDADispatcher(self.py_func, + targetoptions=targetoptions) specialization.compile(argtypes) specialization.disable_compile() specialization._specialized = True @@ -627,10 +628,10 @@ def get_regs_per_thread(self, signature=None): for sig, overload in self.overloads.items()} def get_call_template(self, args, kws): - # Copied and simplified from _DispatcherBase.get_call_template. 
- # - # This seems to have some necessary differences to the _DispatcherBase - # version to force casts where necessay? XXX + # Originally copied from _DispatcherBase.get_call_template. This + # version deviates slightly from the _DispatcherBase version in order + # to force casts when calling device functions. See e.g. + # TestDeviceFunc.test_device_casting, added in PR #7496. """ Get a typing.ConcreteTemplate for this dispatcher and the given *args* and *kws* types. This allows resolution of the return type. @@ -654,10 +655,6 @@ def get_call_template(self, args, kws): return call_template, pysig, args, kws - # XXX: Delete this and call the class CUDADispatcher - def __repr__(self): - return f"numba.cuda.dispatcher.Dispatcher({self.py_func})" - def compile_device(self, args): """Compile the device function for the given argument types. diff --git a/numba/cuda/initialize.py b/numba/cuda/initialize.py index c9d1bc13d6e..0c9343aaa29 100644 --- a/numba/cuda/initialize.py +++ b/numba/cuda/initialize.py @@ -3,7 +3,7 @@ def initialize_all(): import numba.cuda.models # noqa: F401 from numba import cuda - from numba.cuda.dispatcher import Dispatcher + from numba.cuda.dispatcher import CUDADispatcher from numba.core.target_extension import (target_registry, dispatcher_registry, jit_registry) @@ -14,4 +14,4 @@ def cuda_jit_device(*args, **kwargs): cuda_target = target_registry["cuda"] jit_registry[cuda_target] = cuda_jit_device - dispatcher_registry[cuda_target] = Dispatcher + dispatcher_registry[cuda_target] = CUDADispatcher diff --git a/numba/cuda/target.py b/numba/cuda/target.py index c7a4e4baf11..c1800c558cb 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -3,7 +3,7 @@ from llvmlite import ir from numba.core import typing, types, debuginfo, itanium_mangler, cgutils -from numba.core.dispatcher import Dispatcher as uber_Dispatcher +from numba.core.dispatcher import Dispatcher from numba.core.utils import cached_property from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv @@ -28,8 +28,8 @@ def load_additional_registries(self): def resolve_value_type(self, val): # treat other dispatcher object as another device function - from numba.cuda.dispatcher import Dispatcher as CUDADispatcher - if (isinstance(val, uber_Dispatcher) and not + from numba.cuda.dispatcher import CUDADispatcher + if (isinstance(val, Dispatcher) and not isinstance(val, CUDADispatcher)): try: # use cached device function diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index e13197f7d8f..c20fb8dccdf 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -71,9 +71,7 @@ def kernel_func(): with self.assertRaises(TypingError) as raises: kernel_func[1, 1]() excstr = str(raises.exception) - self.assertIn("resolving callee type: " - "type(numba.cuda.dispatcher.Dispatcher", - excstr) + self.assertIn("resolving callee type: type(CUDADispatcher", excstr) self.assertIn("NameError: name 'floor' is not defined", excstr) From 078f2d958a0b8a0a70e4c349a053c17362175ed0 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 22:56:35 +0000 Subject: [PATCH 16/21] Clarify some comments --- numba/cuda/dispatcher.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index 0f041ec5529..bf966b26efb 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -43,8 +43,11 @@ def __init__(self, py_func, 
argtypes, link=None, debug=False, super().__init__() - # Emulate a CompileResult so that _DispatcherBase.nopython_signatures - # can be used as-is + # _DispatcherBase.nopython_signatures() expects this attribute to be + # present, because it assumes an overload is a CompileResult. In the + # CUDA target, _Kernel instances are stored instead, so we provide this + # attribute here to avoid duplicating nopython_signatures() in the CUDA + # target with slight modifications. self.objectmode = False self.py_func = py_func @@ -466,17 +469,20 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): - # TODO: Check if this fixes the cuda docstring jit issue - super().__init__(py_func, targetoptions=targetoptions, pipeline_class=pipeline_class) + self._type = self._numba_type_ - # CUDA-specific stuff - hopefully some of it can be removed ASAP + # The following properties are for specialization of CUDADisptachers. A + # specialized CUDADispatcher is one that is compiled for exactly one + # set of argument types, and bypasses some argument type checking for + # faster kernel launches. + # Is this a specialized dispatcher? self._specialized = False - self._type = self._numba_type_ - # Specializations for given sets of argument types + # If we produced specialized dispatchers, we cache them for each set of + # argument types self.specializations = {} def _make_finalizer(self): From 778f7e23846c8b0a92491e4ed84b6f312b1ebf50 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:11:23 +0000 Subject: [PATCH 17/21] Fix ref to CUDADispatcher in docs --- docs/source/cuda-reference/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 71476ab991f..4095b5b1767 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -56,7 +56,7 @@ This is similar to launch configuration in CUDA C/C++: Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: -.. autoclass:: numba.cuda.dispatcher.Dispatcher +.. autoclass:: numba.cuda.dispatcher.CUDADispatcher :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, get_regs_per_thread, specialize, specialized, extensions, forall From 5d6384c58e531509fca7a190fc17357397ec2558 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:12:27 +0000 Subject: [PATCH 18/21] CUDA Dispatcher: use default finalizer - For device functions, this works exactly like on the CPU target because device functions are CompileResult objects that are inserted into the target context (and therefore need removing by the finalizer). - For kernels, we give them a dummy entry point because they were never inserted and don't need removing (similar to with object mode functions). --- numba/cuda/dispatcher.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index bf966b26efb..c4ea9fbb95e 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -50,6 +50,14 @@ def __init__(self, py_func, argtypes, link=None, debug=False, # target with slight modifications. self.objectmode = False + # The finalizer constructed by _DispatcherBase._make_finalizer also + # expects overloads to be a CompileResult. It uses the entry_point to + # remove a CompileResult from a target context. 
However, since we never + # insert kernels into a target context (there is no need because they + # cannot be called by other functions, only through the dispatcher) it + # suffices to pretend we have an entry point of None. + self.entry_point = None + self.py_func = py_func self.argtypes = argtypes self.debug = debug @@ -485,13 +493,6 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): # argument types self.specializations = {} - def _make_finalizer(self): - # Dummy finalizer whilst _DispatcherBase assumes the existence of a - # finalizer - def finalizer(): - pass - return finalizer - @property def _numba_type_(self): return cuda_types.CUDADispatcher(self) From 4d9cff78dc97e291cd5fa599e864ed80d91595f7 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 4 Feb 2022 23:20:20 +0000 Subject: [PATCH 19/21] Add test for Issue #5902 --- numba/cuda/tests/cudapy/test_dispatcher.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/numba/cuda/tests/cudapy/test_dispatcher.py b/numba/cuda/tests/cudapy/test_dispatcher.py index a08cd4f474b..7090db221ac 100644 --- a/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba/cuda/tests/cudapy/test_dispatcher.py @@ -272,6 +272,21 @@ def pi_sin_array(x, n): self.assertIsInstance(regs_per_thread, int) self.assertGreater(regs_per_thread, 0) + def test_dispatcher_docstring(self): + # Ensure that CUDA-jitting a function preserves its docstring. See + # Issue #5902: https://github.com/numba/numba/issues/5902 + + @cuda.jit + def add_kernel(a, b): + """Add two integers, kernel version""" + + @cuda.jit(device=True) + def add_device(a, b): + """Add two integers, device version""" + + self.assertEqual("Add two integers, kernel version", add_kernel.__doc__) + self.assertEqual("Add two integers, device version", add_device.__doc__) + if __name__ == '__main__': unittest.main() From ef5b4dce41cfbf6dfe4269626b9afb7b775f7fc1 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Fri, 11 Feb 2022 09:47:13 +0000 Subject: [PATCH 20/21] Fix typos (PR #7815 review) Co-authored-by: stuartarchibald --- numba/cuda/compiler.py | 4 ++-- numba/cuda/dispatcher.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index cb2cfcb4f21..12d6c696a3e 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -30,10 +30,10 @@ class CUDAFlags(Flags): ) -# The CUDACompileResult has a specially-defined entry point equal to its id. +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its id. # This is because the entry point is used as a key into a dict of overloads by # the base dispatcher. The id of the CCR is the only small and unique property -# of a CompileResult in the CUDA target (c.f. the CPU target, which uses its +# of a CompileResult in the CUDA target (cf. the CPU target, which uses its # entry_point, which is a pointer value). # # This does feel a little hackish, and there are two ways in which this could diff --git a/numba/cuda/dispatcher.py b/numba/cuda/dispatcher.py index c4ea9fbb95e..0eaa5c124ae 100644 --- a/numba/cuda/dispatcher.py +++ b/numba/cuda/dispatcher.py @@ -481,7 +481,7 @@ def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): pipeline_class=pipeline_class) self._type = self._numba_type_ - # The following properties are for specialization of CUDADisptachers. A + # The following properties are for specialization of CUDADispatchers. 
A # specialized CUDADispatcher is one that is compiled for exactly one # set of argument types, and bypasses some argument type checking for # faster kernel launches. From 499623fdb62663fb8c4fa4f96cb52901e0f4517e Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 11 Feb 2022 09:51:39 +0000 Subject: [PATCH 21/21] Fix line length in cuda/compiler.py --- numba/cuda/compiler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 12d6c696a3e..bb7ea3e4f6b 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -30,11 +30,11 @@ class CUDAFlags(Flags): ) -# The CUDACompileResult (CCR) has a specially-defined entry point equal to its id. -# This is because the entry point is used as a key into a dict of overloads by -# the base dispatcher. The id of the CCR is the only small and unique property -# of a CompileResult in the CUDA target (cf. the CPU target, which uses its -# entry_point, which is a pointer value). +# The CUDACompileResult (CCR) has a specially-defined entry point equal to its +# id. This is because the entry point is used as a key into a dict of +# overloads by the base dispatcher. The id of the CCR is the only small and +# unique property of a CompileResult in the CUDA target (cf. the CPU target, +# which uses its entry_point, which is a pointer value). # # This does feel a little hackish, and there are two ways in which this could # be improved:
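
Two ideas that recur throughout this series can be illustrated outside the
patches themselves.

First, the id()-based entry point introduced in PATCH 02/21 and documented in
PATCH 12/21 and PATCH 21/21. The sketch below is not part of the series:
`FakeCompileResult` and the `overloads` dict are simplified stand-ins for
`numba.core.compiler.CompileResult` and the base dispatcher's internal
bookkeeping, kept only to show why each overload needs a small, unique,
hashable key.

```python
class FakeCompileResult:
    """Stand-in for a CUDA CompileResult, which has no host code address."""

    def __init__(self, signature):
        self.signature = signature

    @property
    def entry_point(self):
        # There is no compiled host function to point at for a CUDA
        # overload, so the object's id() serves as the small, unique,
        # hashable key the base dispatcher expects (the CPU target returns
        # a real function pointer here instead).
        return id(self)


# The base dispatcher keys maps of overloads by entry_point; id() guarantees
# distinct keys for distinct live CompileResult objects.
overloads = {}
for sig in ["(int64,)", "(float64,)"]:
    cres = FakeCompileResult(sig)
    overloads[cres.entry_point] = cres

assert len(overloads) == 2
```

Second, the docstring preservation verified by the PATCH 19/21 test for Issue
#5902. The fix comes from routing construction through the base dispatcher's
`__init__()` (PATCH 04/21), which applies `functools.update_wrapper()` to the
dispatcher object. A minimal sketch of that mechanism, independent of Numba
(`fake_jit` and `FakeDispatcher` are hypothetical names, not Numba APIs):

```python
import functools


def fake_jit(py_func):
    class FakeDispatcher:
        def __init__(self, fn):
            self.py_func = fn
            # Copies __doc__, __name__, __module__, etc. from the wrapped
            # function onto this object, which is what makes kernel.__doc__
            # and help(kernel) work after jitting.
            functools.update_wrapper(self, fn)

    return FakeDispatcher(py_func)


@fake_jit
def add(a, b):
    """Add two integers"""


assert add.__doc__ == "Add two integers"
assert add.__name__ == "add"
```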