Commit
fixes, getting arrays to work
VarLad committed Jan 12, 2025
1 parent 8d54bbc commit a5418be
Showing 11 changed files with 133 additions and 86 deletions.
6 changes: 3 additions & 3 deletions lib/cl/CL.jl
@@ -11,19 +11,19 @@ Base.:(==)(a::CLObject, b::CLObject) = pointer(a) == pointer(b)
Base.hash(obj::CLObject, h::UInt) = hash(pointer(obj), h)

# API wrappers
include("intelfns.jl")
include("error.jl")
include("platform.jl")
include("device.jl")
include("context.jl")
include("cmdqueue.jl")
include("event.jl")
include("pointer.jl")
include("intelfns.jl")
include("memory.jl")
# include("buffer.jl")
# include("svm.jl")
# include("program.jl")
# include("kernel.jl")
include("program.jl")
include("kernel.jl")

include("state.jl")

15 changes: 10 additions & 5 deletions lib/cl/intelfns.jl
@@ -21,24 +21,29 @@ end
function ext_clMemFreeINTEL(context, ptr)
ocl_intel = ocl_extension("clMemFreeINTEL")

@ccall $ocl_intel(context::cl.cl_context, ptr::Ptr{Cvoid})::cl.cl_int
@ccall $ocl_intel(context::cl.cl_context, ptr::PtrOrCLPtr{Cvoid})::cl.cl_int
end

function ext_clMemBlockingFreeINTEL(context, ptr)
ocl_intel = ocl_extension("clMemBlockingFreeINTEL")

@ccall $ocl_intel(context::cl.cl_context, ptr::Ptr{Cvoid})::cl.cl_int
@ccall $ocl_intel(context::cl.cl_context, ptr::PtrOrCLPtr{Cvoid})::cl.cl_int
end

function ext_clGetMemAllocInfoINTEL(context, ptr, param_name, param_value_size, param_value, param_value_size_ret)
ocl_intel = ocl_extension("clGetMemAllocInfoINTEL")

@ccall $ocl_intel(context::cl.cl_context, ptr::Ptr{Cvoid}, param_name::cl.cl_mem_info_intel, param_value_size::Csize_t, param_value::Ptr{Cvoid}, param_value_size_ret::Ptr{Csize_t})::cl.cl_int
@ccall $ocl_intel(context::cl.cl_context, ptr::PtrOrCLPtr{Cvoid}, param_name::cl.cl_mem_info_intel, param_value_size::Csize_t, param_value::Ptr{Cvoid}, param_value_size_ret::Ptr{Csize_t})::cl.cl_int
end

function ext_clEnqueueMemcpyINTEL(command_queue, blocking, dst_ptr, src_ptr, size, num_events_in_wait_list, event_wait_list, event)

ocl_intel = ocl_extension("clEnqueueMemcpyINTEL")

@ccall $ocl_intel(command_queue::cl_command_queue, blocking::cl_bool, dst_ptr::Ptr{Cvoid}, src_ptr::Ptr{Cvoid}, size::Csize_t, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, event::Ptr{cl_event})::cl_int
@ccall $ocl_intel(command_queue::cl_command_queue, blocking::cl_bool, dst_ptr::PtrOrCLPtr{Cvoid}, src_ptr::PtrOrCLPtr{Cvoid}, size::Csize_t, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, event::Ptr{cl_event})::cl_int
end

function ext_clSetKernelArgMemPointerINTEL(kernel, arg_index, arg_value)
ocl_intel = ocl_extension("clSetKernelArgMemPointerINTEL")

@ccall $ocl_intel(kernel::cl_kernel, arg_index::cl_uint, arg_value::PtrOrCLPtr{Cvoid})::cl_int
end
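These wrappers look up the Intel USM extension function by name at call time and forward the raw arguments. A rough sketch of driving two of them directly from outside the `cl` module — `ctx`, `q`, `dst`, `src`, and `nbytes` are assumed bindings for illustration and do not appear in this diff:

    # assumptions: ctx and q convert to cl_context / cl_command_queue handles,
    # and dst/src are USM pointers covering at least nbytes bytes
    ret_evt = Ref{cl.cl_event}()
    cl.ext_clEnqueueMemcpyINTEL(q, true, dst, src, nbytes, 0, C_NULL, ret_evt)  # blocking copy
    cl.ext_clMemBlockingFreeINTEL(ctx, src)                                     # release the source allocation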
16 changes: 8 additions & 8 deletions lib/cl/kernel.jl
@@ -61,8 +61,8 @@ end

# SVMBuffers
## when passing using `cl.call`
function set_arg!(k::Kernel, idx::Integer, arg::SVMBuffer)
clSetKernelArgSVMPointer(k, cl_uint(idx-1), arg.ptr)
function set_arg!(k::Kernel, idx::Integer, arg::Union{HostBuffer, DeviceBuffer, SharedBuffer})
ext_clSetKernelArgMemPointerINTEL(k, cl_uint(idx-1), arg.ptr)
return k
end
## when passing with `clcall`, which has pre-converted the buffer
Expand All @@ -74,13 +74,13 @@ function set_arg!(k::Kernel, idx::Integer, arg::Union{Ptr,Core.LLVMPtr})
# `Core.LLVMPtr`, which _is_ pointer-valued. We retain this handling for `Ptr`
# for users passing pointers to OpenCL C, and because `Ptr` is pointer-valued
# starting with Julia 1.12.
clSetKernelArgSVMPointer(k, cl_uint(idx-1), arg)
ext_clSetKernelArgMemPointerINTEL(k, cl_uint(idx-1), arg)
end
return k
end

# regular buffers
function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory)
function set_arg!(k::Kernel, idx::Integer, arg::AbstractBuffer)
arg_boxed = Ref(arg.id)
clSetKernelArg(k, cl_uint(idx-1), sizeof(cl_mem), arg_boxed)
return k
@@ -177,11 +177,11 @@ end

function call(k::Kernel, args...; global_size=(1,), local_size=nothing,
global_work_offset=nothing, wait_on::Vector{Event}=Event[],
svm_pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[])
pointers::Vector{CLPtr}=CLPtr[])
set_args!(k, args...)
if !isempty(svm_pointers)
clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS,
sizeof(svm_pointers), svm_pointers)
if !isempty(pointers)
clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL,
sizeof(pointers), pointers)
end
enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on)
end
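With the renamed keyword, a kernel that dereferences USM allocations not passed as direct arguments would be invoked roughly like this (a sketch; `k`, `a`, `b`, and `indirect_ptr` are assumed to exist and are not part of this diff):

    # indirect_ptr is a CLPtr reachable from one of the arguments but not itself an argument
    cl.call(k, a, b; global_size=(256,), pointers=CLPtr[indirect_ptr])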
11 changes: 11 additions & 0 deletions lib/cl/memory.jl
@@ -298,3 +298,14 @@ function lookup_alloc(ctx::Context, ptr::Union{Ptr,CLPtr})
end
end
=#

function enqueue_usm_memcpy(dst::Union{CLPtr, Ptr}, src::Union{CLPtr, Ptr}, nbytes::Integer; queu::CmdQueue=queue(), blocking::Bool=false,
wait_for::Vector{Event}=Event[])
n_evts = length(wait_for)
evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for]
GC.@preserve wait_for begin
ret_evt = Ref{cl_event}()
ext_clEnqueueMemcpyINTEL(queu, blocking, dst, src, nbytes, n_evts, evt_ids, ret_evt)
@return_event ret_evt[]
end
end
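A hedged usage sketch for the new helper — only `enqueue_usm_memcpy` itself comes from this diff, while `dst`, `src`, and the surrounding setup are assumptions:

    # assumptions: dst::CLPtr{Cvoid} and src::Ptr{Cvoid} each cover at least 1024 bytes
    cl.enqueue_usm_memcpy(dst, src, 1024; blocking=true)         # synchronous copy on the default queue
    evt = cl.enqueue_usm_memcpy(dst, src, 1024; blocking=false)  # asynchronous; returns an Event to wait on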
19 changes: 14 additions & 5 deletions src/OpenCL.jl
@@ -16,18 +16,27 @@ include("../lib/cl/CL.jl")
export cl

# device functionality
include("device/runtime.jl")
import SPIRVIntrinsics
SPIRVIntrinsics.@import_all
SPIRVIntrinsics.@reexport_public

include("device/runtime.jl")
include("device/array.jl")
include("device/quirks.jl")
# include("util.jl")
include("pool.jl")

# high level implementation
include("memory.jl")
include("pool.jl")
include("array_new.jl")

# compiler implementation
include("compiler/compilation.jl")
include("compiler/execution.jl")
include("compiler/reflection.jl")

# integrations and specialized functionality
include("util.jl")
include("mapreduce.jl")
include("gpuarrays.jl")
#=
# compiler implementation
# include("compiler/compilation.jl")
@@ -39,9 +48,9 @@ include("util.jl")
include("array.jl")
include("mapreduce.jl")
include("gpuarrays.jl")
=#

include("OpenCLKernels.jl")
import .OpenCLKernels: OpenCLBackend
export OpenCLBackend
=#
end
39 changes: 18 additions & 21 deletions src/OpenCLKernels.jl
@@ -22,8 +22,9 @@ KA.zeros(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
KA.ones(::OpenCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

KA.get_backend(::CLArray) = OpenCLBackend()
# TODO should be non-blocking
KA.synchronize(::OpenCLBackend) = cl.finish(cl.queue())
KA.supports_float64(::OpenCLBackend) = false # XXX: this is platform/device dependent
KA.supports_float64(::OpenCLBackend) = false # TODO: Check if this is device dependent

Adapt.adapt_storage(::OpenCLBackend, a::Array) = Adapt.adapt(CLArray, a)
Adapt.adapt_storage(::OpenCLBackend, a::CLArray) = a
@@ -82,18 +83,16 @@ function threads_to_workgroupsize(threads, ndrange)
end

function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
ndrange, workgroupsize, iterspace, dynamic =
KA.launch_config(obj, ndrange, workgroupsize)

ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, workgroupsize)
# this might not be the final context, since we may tune the workgroupsize
ctx = KA.mkcontext(obj, ndrange, iterspace)
kernel = @opencl launch=false obj.f(ctx, args...)

# figure out the optimal workgroupsize automatically
if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
wg_info = cl.work_group_info(kernel.fun, cl.device())
wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
items = OpenCL.launch_configuration(kernel)
workgroupsize = threads_to_workgroupsize(items, ndrange)
iterspace, dynamic = KA.partition(obj, ndrange, workgroupsize)
ctx = KA.mkcontext(obj, ndrange, iterspace)
end

@@ -105,9 +104,7 @@ function (obj::KA.Kernel{OpenCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
end

# Launch kernel
global_size = groups * items
local_size = items
kernel(ctx, args...; global_size, local_size)
kernel(ctx, args...; items, groups)

return nothing
end
@@ -116,32 +113,32 @@ end
## Indexing Functions

@device_override @inline function KA.__index_Local_Linear(ctx)
return get_local_id(1)
return get_local_id()
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return get_group_id(1)
return get_group_id()
end

@device_override @inline function KA.__index_Global_Linear(ctx)
return get_global_id(1)
return get_global_id()
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id()]
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id()]
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(), get_local_id())
end

@device_override @inline function KA.__validindex(ctx)
if KA.__dynamic_checkbounds(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(), get_local_id())
return I in KA.__ndrange(ctx)
else
return true
Expand All @@ -152,8 +149,8 @@ end
## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
ptr = OpenCL.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
end

@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
@@ -168,12 +165,12 @@ end
end

@device_override @inline function KA.__print(args...)
SPIRVIntrinsics._print(args...)
OpenCL._print(args...)
end


## Other

KA.argconvert(::KA.Kernel{OpenCLBackend}, arg) = clconvert(arg)
KA.argconvert(::KA.Kernel{OpenCLBackend}, arg) = kernel_convert(arg)

end
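For reference, the kind of KernelAbstractions code this back-end is exercised with — standard KA usage, assuming `OpenCLBackend` is exported as in the pre-existing code; nothing here is specific to this commit:

    using KernelAbstractions, OpenCL

    @kernel function vadd!(c, a, b)
        i = @index(Global)
        @inbounds c[i] = a[i] + b[i]
    end

    backend = OpenCLBackend()
    a = KernelAbstractions.ones(backend, Float32, 1024)
    b = KernelAbstractions.ones(backend, Float32, 1024)
    c = KernelAbstractions.zeros(backend, Float32, 1024)
    vadd!(backend)(c, a, b; ndrange=length(c))
    KernelAbstractions.synchronize(backend)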
14 changes: 14 additions & 0 deletions src/array_new.jl
@@ -492,6 +492,20 @@ end
Base.copyto!(dest::DenseCLArray{T}, src::DenseCLArray{T}) where {T} =
copyto!(dest, 1, src, 1, length(src))

for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArray)]
@eval begin
function Base.unsafe_copyto!(dst::$dstty{T}, dst_off::Int,
src::$srcty{T}, src_off::Int,
N::Int; blocking::Bool=true) where T
nbytes = N * sizeof(T)
cl.enqueue_usm_memcpy(pointer(dst, dst_off), pointer(src, src_off), nbytes;
blocking)
end
Base.unsafe_copyto!(dst::$dstty, src::$srcty, N; kwargs...) =
unsafe_copyto!(dst, 1, src, 1, N; kwargs...)
end
end
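A small sketch of what these methods enable (assuming the `CLArray` constructors from array_new.jl behave like their GPUArrays counterparts; none of these calls appear in the diff):

    a = rand(Float32, 16)
    d = CLArray{Float32}(undef, 16)
    unsafe_copyto!(d, 1, a, 1, length(a))   # host -> device, blocking by default
    b = Array{Float32}(undef, 16)
    unsafe_copyto!(b, 1, d, 1, length(d))   # device -> host
    a == b                                  # expected to hold after both blocking copies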

function Base.unsafe_copyto!(ctx::cl.Context, dev::cl.Device,
dest::DenseCLArray{T}, doffs, src::Array{T}, soffs, n) where T
GC.@preserve src dest unsafe_copyto!(ctx, dev, pointer(dest, doffs), pointer(src, soffs), n)
44 changes: 14 additions & 30 deletions src/compiler/execution.jl
@@ -1,4 +1,4 @@
export @opencl, clfunction, clconvert
export @opencl, clfunction, kernel_convert


## high-level @opencl interface
@@ -60,8 +60,8 @@ macro opencl(ex...)
quote
$f_var = $f
GC.@preserve $(vars...) $f_var begin
$kernel_f = $clconvert($f_var)
$kernel_args = map($clconvert, ($(var_exprs...),))
$kernel_f = $kernel_convert($f_var)
$kernel_args = map($kernel_convert, ($(var_exprs...),))
$kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
$kernel = $clfunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
if $launch
@@ -81,22 +81,14 @@

## argument conversion

struct KernelAdaptor
svm_pointers::Vector{Ptr{Cvoid}}
end
struct KernelAdaptor end

# assume directly-passed pointers are SVM pointers
function Adapt.adapt_storage(to::KernelAdaptor, ptr::Ptr{T}) where {T}
push!(to.svm_pointers, ptr)
return ptr
end
# convert OpenCL device (USM) pointers to plain pointers for the kernel
Adapt.adapt_storage(to::KernelAdaptor, p::CLPtr{T}) where {T} = reinterpret(Ptr{T}, p)

# convert SVM buffers to their GPU address
function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.SVMBuffer)
ptr = pointer(buf)
push!(to.svm_pointers, ptr)
return ptr
end
# convert CLArrays to device arrays
Adapt.adapt_storage(::KernelAdaptor, xs::CLArray{T,N}) where {T,N} =
Base.unsafe_convert(CLDeviceArray{T,N,AS.Global}, xs)

# Base.RefValue isn't GPU compatible, so provide a compatible alternative
# TODO: port improvements from CUDA.jl
@@ -119,32 +111,25 @@ Adapt.adapt_structure(to::KernelAdaptor,
Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes)

"""
clconvert(x, [pointers])
kernel_convert(x)
This function is called for every argument to be passed to a kernel, allowing it to be
converted to a GPU-friendly format. By default, the function does nothing and returns the
input object `x` as-is.
Do not add methods to this function, but instead extend the underlying Adapt.jl package and
register methods for the `OpenCL.KernelAdaptor` type.
The `pointers` argument is used to collect pointers to indirect SVM buffers, which need to
be registered with OpenCL before invoking the kernel.
"""
function clconvert(arg, pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[])
adapt(KernelAdaptor(pointers), arg)
end

kernel_convert(arg) = adapt(KernelAdaptor(), arg)
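The docstring above directs users to extend Adapt.jl rather than add methods to `kernel_convert`; a minimal sketch of what that looks like for a hypothetical wrapper type (the type and its field are invented for illustration):

    using Adapt, OpenCL

    struct MyWrapper{A}
        data::A
    end

    # unwrap (and recursively adapt) the payload whenever it is passed to a kernel
    Adapt.adapt_structure(to::OpenCL.KernelAdaptor, w::MyWrapper) =
        MyWrapper(adapt(to, w.data))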


## abstract kernel functionality

abstract type AbstractKernel{F,TT} end

@inline @generated function (kernel::AbstractKernel{F,TT})(args...;
call_kwargs...) where {F,TT}
@inline @generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT}
sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type
args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)
args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...)

# filter out ghost arguments that shouldn't be passed
predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
@@ -164,8 +149,7 @@ abstract type AbstractKernel{F,TT} end
call_tt = Base.to_tuple_type(call_t)

quote
svm_pointers = Ptr{Cvoid}[]
clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
clcall(kernel.fun, $call_tt, $(call_args...); call_kwargs...)
end
end

8 changes: 0 additions & 8 deletions src/gpuarrays.jl
@@ -1,11 +1,3 @@
# GPUArrays.jl interface

function GPUArrays.derive(::Type{T}, a::CLArray, dims::Dims{N}, offset::Int) where {T,N}
ref = copy(a.data)
offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset
CLArray{T,N}(ref, dims; offset)
end

const GLOBAL_RNGs = Dict{cl.Device,GPUArrays.RNG}()
function GPUArrays.default_rng(::Type{<:CLArray})
dev = cl.device()