|
208 | 208 |
|
209 | 209 | ## host-side kernels |
210 | 210 |
|
| 211 | +# XXX: storing the function instance, but not the arguments, is inconsistent. |
| 212 | +# either store both the instance and the args, making this object directly callable, |
| 213 | +# or store neither, so the kernel object can be cached and returned as-is by GPUCompiler. |
| 214 | + |
211 | 215 | struct HostKernel{F,TT} <: AbstractKernel{F,TT} |
212 | 216 | f::F |
213 | 217 | ctx::CuContext |
@@ -294,9 +298,10 @@ when function changes, or when different types or keyword arguments are provided |
294 | 298 | target = CUDACompilerTarget(cuda.device; kwargs...) |
295 | 299 | params = CUDACompilerParams() |
296 | 300 | job = CompilerJob(target, source, params) |
297 | | - return GPUCompiler.cached_compilation(cache, job, |
298 | | - cufunction_compile, |
299 | | - cufunction_link)::HostKernel{F,tt} |
| 301 | + res = GPUCompiler.cached_compilation(cache, job, |
| 302 | + cufunction_compile, |
| 303 | + cufunction_link) |
| 304 | + HostKernel{F,tt}(f, cuda.context, res.mod, res.fun, res.state) |
300 | 305 | end |
301 | 306 |
|
302 | 307 | # XXX: does this need a lock? we'll only write to it when we have the typeinf lock. |
|
461 | 466 | exception_ptr = create_exceptions!(mod) |
462 | 467 | state = KernelState(exception_ptr) |
463 | 468 |
|
464 | | - return HostKernel{typeof(job.source.f),job.source.tt}(job.source.f, ctx, mod, fun, state) |
| 469 | + return (; mod, fun, state) |
465 | 470 | end |
466 | 471 |
|
467 | 472 | function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...) |
|
0 commit comments