Commit 17d67f8

fix rebase

1 parent efa88c9 commit 17d67f8
File tree

1 file changed: +85 -34 lines

ext/ReactantCUDAExt.jl

Lines changed: 85 additions & 34 deletions
@@ -5,18 +5,22 @@ using Reactant:
     Reactant, TracedRArray, AnyTracedRArray, AnyConcreteRArray, MLIR, TracedRNumber
 using ReactantCore: @trace
 using KernelAbstractions: KernelAbstractions
+import KernelAbstractions as KA
 using Libdl
+const ReactantKernelAbstractionsExt = Base.get_extension(
+    Reactant, :ReactantKernelAbstractionsExt
+)
+const ReactantBackend = ReactantKernelAbstractionsExt.ReactantBackend

 using Adapt

-KernelAbstractions.get_backend(::AnyTracedRArray) = CUDABackend()
-KernelAbstractions.get_backend(::AnyConcreteRArray) = CUDABackend()
-
 struct CuTracedArray{T,N,A,Size} <: DenseArray{T,N}
     ptr::Core.LLVMPtr{T,A}

     function CuTracedArray{T,N,A,Size}(xs::TracedRArray) where {T,N,A,Size}
-        push!(Reactant.Compiler.context_gc_vector[MLIR.IR.context()], xs)
+        gc_vec = Reactant.Compiler.context_gc_vector[MLIR.IR.context()]
+        push!(gc_vec, xs)
+        @assert gc_vec[end] === xs
         ptr = Base.reinterpret(Core.LLVMPtr{T,CUDA.AS.Global}, Base.pointer_from_objref(xs))
         return new(ptr)
     end
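
The constructor change above roots the traced array in the per-context GC vector before taking a raw pointer to it, since a pointer obtained from Base.pointer_from_objref is only safe while the object remains reachable. A minimal sketch of that rooting pattern, with hypothetical names (roots, root_and_pointer) standing in for Reactant.Compiler.context_gc_vector:

    # One root vector per context; objects pushed here stay reachable for the
    # context's lifetime, so raw pointers to them remain valid.
    const roots = IdDict{Any,Vector{Any}}()

    function root_and_pointer(ctx, x)
        vec = get!(Vector{Any}, roots, ctx)
        push!(vec, x)             # root `x` before taking its address
        @assert vec[end] === x    # same sanity check as in the diff
        return Base.pointer_from_objref(x)
    end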
@@ -261,6 +265,78 @@ function Adapt.adapt_structure(
     )
 end

+function threads_to_workgroupsize(threads, ndrange)
+    total = 1
+    return map(ndrange) do n
+        x = min(div(threads, total), n)
+        total *= x
+        return x
+    end
+end
+
+function ka_with_reactant(ndrange, workgroupsize, obj, args...)
+    backend = KA.backend(obj)
+
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(
+        obj, ndrange, workgroupsize
+    )
+    # this might not be the final context, since we may tune the workgroupsize
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+
+    # If the kernel is statically sized we can tell the compiler about that
+    if KA.workgroupsize(obj) <: KA.StaticSize
+        maxthreads = prod(KA.get(KA.workgroupsize(obj)))
+    else
+        maxthreads = nothing
+    end
+
+    kernel = CUDA.@cuda launch = false always_inline = backend.always_inline maxthreads =
+        maxthreads obj.f(ctx, args...)
+
+    # figure out the optimal workgroupsize automatically
+    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
+        if !Reactant.Compiler.PartitionKA[]
+            threads = prod(ndrange)
+        else
+            config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange))
+            if backend.prefer_blocks
+                # Prefer blocks over threads
+                threads = min(prod(ndrange), config.threads)
+                # XXX: Some kernels performs much better with all blocks active
+                cu_blocks = max(cld(prod(ndrange), threads), config.blocks)
+                threads = cld(prod(ndrange), cu_blocks)
+            else
+                threads = config.threads
+            end
+            workgroupsize = threads_to_workgroupsize(threads, ndrange)
+            iterspace, dynamic = KA.partition(obj, ndrange, workgroupsize)
+        end
+        ctx = KA.mkcontext(obj, ndrange, iterspace)
+    end
+
+    blocks = length(KA.blocks(iterspace))
+    threads = length(KA.workitems(iterspace))
+
+    if blocks == 0
+        return nothing
+    end
+
+    # Launch kernel
+    kernel(ctx, args...; threads, blocks)
+
+    return nothing
+end
+
+Reactant.@reactant_overlay @noinline function (obj::KA.Kernel{ReactantBackend})(
+    args...; ndrange=nothing, workgroupsize=nothing
+)
+    return Reactant.call_with_reactant(
+        ka_with_reactant, ndrange, workgroupsize, obj, args...
+    )
+end
+
+Adapt.adapt_storage(to::KA.ConstAdaptor, a::CuTracedArray) = Base.Experimental.Const(a)
+
 function recudaconvert(arg)
     return adapt(ReactantKernelAdaptor(), arg)
 end
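
The new threads_to_workgroupsize helper greedily splits a flat thread budget across the ndrange dimensions without exceeding any extent. Illustrative values, which follow directly from the definition above:

    threads_to_workgroupsize(256, (1024, 1024))  # (256, 1)
    threads_to_workgroupsize(256, (16, 1024))    # (16, 16)
    threads_to_workgroupsize(256, (8, 8))        # (8, 8), capped by the ndrange itself

With the @reactant_overlay method in place, calling a KernelAbstractions kernel on the ReactantBackend inside a traced function is routed through ka_with_reactant. A hypothetical end-to-end sketch, assuming Reactant.to_rarray and Reactant.@jit as entry points and that KernelAbstractions.get_backend on a Reactant array now returns a ReactantBackend:

    using KernelAbstractions, Reactant

    @kernel function square!(x)
        i = @index(Global, Linear)
        x[i] = x[i] * x[i]
    end

    function apply_square(x)
        backend = KernelAbstractions.get_backend(x)
        square!(backend)(x; ndrange=length(x))   # dispatches through ka_with_reactant
        return x
    end

    y = Reactant.@jit apply_square(Reactant.to_rarray(collect(Float32, 1:64)))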
@@ -618,8 +694,7 @@ Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
     array_ty = MLIR.IR.Type(MLIR.API.mlirLLVMArrayTypeGet(MLIR.IR.Type(Int8), sz))
     cdata = MLIR.IR.result(
         MLIR.Dialects.llvm.mlir_constant(;
-            res=array_ty,
-            value=MLIR.IR.DenseElementsAttribute(to_bytes(a)), #TODO: mlir_constant cannot be processed by the julia generator atm.
+            res=array_ty, value=MLIR.IR.DenseElementsAttribute(to_bytes(a))
         ),
         1,
     )
@@ -841,45 +916,21 @@ function Reactant.make_tracer(
 end

 function __init__()
-    if isdefined(CUDA.CUDA_Driver_jll, :libcuda) && CUDA.CUDA_Driver_jll.libcuda !== nothing
-        handle = Reactant.XLA.Libdl.dlopen(CUDA.CUDA_Driver_jll.libcuda; throw_error=false)
-        if handle === nothing
-            handle = C_NULL
-        end
-        ptr1 = Reactant.XLA.Libdl.dlsym(handle, "cuLaunchKernel"; throw_error=false)
-        if ptr1 === nothing
-            ptr1 = C_NULL
-        end
-        ptr2 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleLoadData"; throw_error=false)
-        if ptr2 === nothing
-            ptr2 = C_NULL
-        end
-        ptr3 = Reactant.XLA.Libdl.dlsym(handle, "cuModuleGetFunction"; throw_error=false)
-        if ptr3 === nothing
-            ptr3 = C_NULL
-        end
-        Reactant.Compiler.cuLaunch[] = Base.reinterpret(UInt, ptr1)
-        Reactant.Compiler.cuModule[] = Base.reinterpret(UInt, ptr2)
-        Reactant.Compiler.cuFunc[] = Base.reinterpret(UInt, ptr3)
-        ptr4 = Reactant.XLA.Libdl.dlsym(handle, "cuStreamSynchronize"; throw_error=false)
-        if ptr4 === nothing
-            ptr4 = C_NULL
-        end
-        Reactant.Compiler.cuSync[] = Base.reinterpret(UInt, ptr4)
-    end
     if CUDA.functional()
         target = CUDA._compiler_config(CUDA.device()).target
         Reactant.Compiler.cubinChip[] = "sm_$(target.cap.major)$(target.cap.minor)"
     end
     return nothing
 end

-@static if !Sys.isapple() && Sys.ARCH != :aarch64
+# In Julia v1.11.3 precompiling this module caches bad code:
+# <https://github.com/EnzymeAD/Reactant.jl/issues/614>.
+@static if !Sys.isapple()
     Reactant.PrecompileTools.@setup_workload begin
         Reactant.initialize_dialect()
         client = Reactant.XLA.CPUClient(; checkcount=false)
         Reactant.PrecompileTools.@compile_workload begin
-            @static if Reactant.precompilation_supported()
+            @static if Reactant.precompilation_supported() && VERSION != v"1.11.3"
                 function square_kernel!(x)
                     i = CUDA.threadIdx().x
                     x[i] *= x[i]