Adapt to GPUCompiler 0.15 changes (#1488)

maleadt · web-flow · commit 494a01c49486 · 2022-05-10T16:39:28.000+02:00
* Bump GPUCompiler.

* Adapt to FunctionSpec change.

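GPUCompiler no longer knows about the function instance, so construct the
HostKernel at the `cached_compilation` call site, where the instance is still
available; the link step now returns only the module, function and kernel state.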
diff --git a/Manifest.toml b/Manifest.toml
@@ -89,9 +89,9 @@ version = "8.3.2"
 
 [[GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "556190e1e0ea3e37d83059fc9aa576f1e2104375"
+git-tree-sha1 = "05374e47bb136db517b33f62fbe852adf8deb0be"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.14.1"
+version = "0.15.1"
 
 [[InteractiveUtils]]
 deps = ["Markdown"]
diff --git a/Project.toml b/Project.toml
@@ -33,7 +33,7 @@ BFloat16s = "0.2"
 CEnum = "0.2, 0.3, 0.4"
 ExprTools = "0.1"
 GPUArrays = "8.3.2"
-GPUCompiler = "0.14"
+GPUCompiler = "0.15.1"
 LLVM = "4.5.3"
 Random123 = "1.2"
 RandomNumbers = "1.5.3"
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -208,6 +208,10 @@ end
 
 ## host-side kernels
 
+# XXX: storing the function instance, but not the arguments, is inconsistent.
+#      either store the instance and args, making this object directly callable,
+#      or store neither and cache it when getting it directly from GPUCompiler.
+
 struct HostKernel{F,TT} <: AbstractKernel{F,TT}
     f::F
     ctx::CuContext
@@ -294,9 +298,10 @@ when function changes, or when different types or keyword arguments are provided
     target = CUDACompilerTarget(cuda.device; kwargs...)
     params = CUDACompilerParams()
     job = CompilerJob(target, source, params)
-    return GPUCompiler.cached_compilation(cache, job,
-                                          cufunction_compile,
-                                          cufunction_link)::HostKernel{F,tt}
+    res = GPUCompiler.cached_compilation(cache, job,
+                                         cufunction_compile,
+                                         cufunction_link)
+    HostKernel{F,tt}(f, cuda.context, res.mod, res.fun, res.state)
 end
 
 # XXX: does this need a lock? we'll only write to it when we have the typeinf lock.
@@ -461,7 +466,7 @@ end
     exception_ptr = create_exceptions!(mod)
     state = KernelState(exception_ptr)
 
-    return HostKernel{typeof(job.source.f),job.source.tt}(job.source.f, ctx, mod, fun, state)
+    return (; mod, fun, state)
 end
 
 function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...)
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
@@ -143,10 +143,10 @@ function return_type(@nospecialize(func), @nospecialize(tt))
     job = CompilerJob(target, source, params)
     interp = GPUCompiler.get_interpreter(job)
     if VERSION >= v"1.8-"
-        sig = Base.signature_type(job.source.f, job.source.tt)
+        sig = Base.signature_type(func, tt)
         Core.Compiler.return_type(interp, sig)
     else
-        Core.Compiler.return_type(interp, job.source.f, job.source.tt)
+        Core.Compiler.return_type(interp, func, tt)
     end
 end
 
diff --git a/test/execution.jl b/test/execution.jl
@@ -100,7 +100,7 @@ end
     end)))
 
     @test CUDA.return_type(identity, Tuple{Int}) === Int
-    @test CUDA.return_type(CUDA.sin, Tuple{Float32}) === Float32
+    @test CUDA.return_type(sin, Tuple{Float32}) === Float32
     @test CUDA.return_type(getindex, Tuple{CuDeviceArray{Float32,1,1},Int32}) === Float32
     @test CUDA.return_type(getindex, Tuple{Base.RefValue{Integer}}) === Integer
 end