Skip to content

Commit

Permalink
apply runic formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
VarLad committed Jan 13, 2025
1 parent b4fffad commit 4ac21c7
Show file tree
Hide file tree
Showing 68 changed files with 3,856 additions and 2,974 deletions.
42 changes: 25 additions & 17 deletions examples/hands_on_opencl/ex04/vadd_chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
using OpenCL

# tolerance used in floating point comparisons
TOL = 1e-3
TOL = 1.0e-3

# length of vectors a, b, c
LENGTH = 1024
Expand Down Expand Up @@ -41,7 +41,7 @@ __kernel void vadd(
# create a compute context

# create the compute program and build it
program = cl.Program(source=kernelsource) |> cl.build!
program = cl.Program(source = kernelsource) |> cl.build!

#create a, b, e, and g vectors and fill with random float values
#create empty vectors for c, d, and f
Expand All @@ -62,14 +62,14 @@ h_g = rand(Float32, LENGTH)
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# Create the input (a, b, e, g) arrays in device memory and copy data from host
d_a = CLArray(h_a; access=:r)
d_b = CLArray(h_b; access=:r)
d_e = CLArray(h_e; access=:r)
d_g = CLArray(h_g; access=:r)
d_a = CLArray(h_a; access = :r)
d_b = CLArray(h_b; access = :r)
d_e = CLArray(h_e; access = :r)
d_g = CLArray(h_g; access = :r)
# Create the output (c, d, f) array in device memory
d_c = CLArray{Float32}(undef, LENGTH; access=:w)
d_d = CLArray{Float32}(undef, LENGTH; access=:w)
d_f = CLArray{Float32}(undef, LENGTH; access=:w)
d_c = CLArray{Float32}(undef, LENGTH; access = :w)
d_d = CLArray{Float32}(undef, LENGTH; access = :w)
d_f = CLArray{Float32}(undef, LENGTH; access = :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")
Expand All @@ -81,12 +81,18 @@ vadd = cl.Kernel(program, "vadd")
# here we call the kernel with work size set to the number of elements and no local
# work size. This enables the opencl runtime to optimize the local size for simple
# kernels
clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_a, d_b, d_c, LENGTH; global_size=size(h_a))
clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_e, d_c, d_d, LENGTH; global_size=size(h_e))
clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_g, d_d, d_f, LENGTH; global_size=size(h_g))
clcall(
vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_a, d_b, d_c, LENGTH; global_size = size(h_a)
)
clcall(
vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_e, d_c, d_d, LENGTH; global_size = size(h_e)
)
clcall(
vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_g, d_d, d_f, LENGTH; global_size = size(h_g)
)

# copy back the results from the compute device
# copy!(queue, dst, src) follows same interface as julia's built in copy!
Expand All @@ -100,8 +106,10 @@ for i in 1:LENGTH
if tmp^2 < TOL^2
global correct += 1
else
println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
"h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])")
println(
"tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
"h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])"
)
end
end

Expand Down
18 changes: 10 additions & 8 deletions examples/hands_on_opencl/ex05/vadd_abc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
using OpenCL

# tolerance used in floating point comparisons
TOL = 1e-3
TOL = 1.0e-3

# length of vectors a, b, c
LENGTH = 1024
Expand All @@ -39,27 +39,29 @@ __kernel void vadd(
}"

# create the compute program and build it
program = cl.Program(source=kernelsource) |> cl.build!
program = cl.Program(source = kernelsource) |> cl.build!

# create a, b and c vectors and fill with random float values
# (the result array will be created when reading back from the device)
h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)
h_c = rand(Float32, LENGTH)

d_a = CLArray(h_a; access=:r)
d_b = CLArray(h_b; access=:r)
d_c = CLArray(h_c; access=:r)
d_a = CLArray(h_a; access = :r)
d_b = CLArray(h_b; access = :r)
d_c = CLArray(h_c; access = :r)

# create the output (r) buffer in device memory
d_r = CLArray{Float32}(undef, LENGTH; access=:w)
d_r = CLArray{Float32}(undef, LENGTH; access = :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of the input
clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_a, d_b, d_c, d_r, UInt32(LENGTH); global_size=size(h_a))
clcall(
vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
d_a, d_b, d_c, d_r, UInt32(LENGTH); global_size = size(h_a)
)

# read the results back from the compute device
h_r = Array(d_r)
Expand Down
14 changes: 7 additions & 7 deletions examples/hands_on_opencl/ex06/helper.jl
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
import Base: error

function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where T
cval = Float32(Pdim * AVAL * BVAL)
errsq = 0f0
function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where {T}
cval = Float32(Pdim * AVAL * BVAL)
errsq = 0.0f0
for i in 1:Ndim
for j in 1:Mdim
err = C[(i-1)*Ndim+j] - cval
err = C[(i - 1) * Ndim + j] - cval
errsq += err^2
end
end
return errsq
end

function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where T
mflops = 2.0 * Mdim * Ndim * Pdim/(1000000.0* run_time)
function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where {T}
mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
println("$run_time seconds at $mflops MFLOPS")
errsq = error(Mdim, Ndim, Pdim, C)
if isnan(errsq) || errsq > TOL
return if isnan(errsq) || errsq > TOL
println("Errors in multiplication: $errsq")
end
end
27 changes: 16 additions & 11 deletions examples/hands_on_opencl/ex06/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,20 @@ h_B = fill(Float32(BVAL), sizeB)
h_C = Vector{Float32}(undef, sizeC)

# 20% improvement using @inbounds
function seq_mat_mul_sdot(Mdim::Int, Ndim::Int, Pdim::Int,
A::Array{T}, B::Array{T}, C::Array{T}) where T
function seq_mat_mul_sdot(
Mdim::Int, Ndim::Int, Pdim::Int,
A::Array{T}, B::Array{T}, C::Array{T}
) where {T}
for i in 1:Ndim
for j in 1:Mdim
tmp = zero(Float32)
for k in 1:Pdim
@inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
@inbounds tmp += A[(i - 1) * Ndim + k] * B[(k - 1) * Pdim + j]
end
@inbounds C[(i-1)*Ndim+j] = tmp
@inbounds C[(i - 1) * Ndim + j] = tmp
end
end
return
end

@info("=== Julia, matix mult (dot prod), order $ORDER ===")
Expand All @@ -105,11 +108,11 @@ for i in 1:COUNT
end

# create OpenCL arrays
d_a = CLArray(h_A; access=:r)
d_b = CLArray(h_B; access=:r)
d_c = CLArray{Float32}(undef, length(h_C); access=:w)
d_a = CLArray(h_A; access = :r)
d_b = CLArray(h_B; access = :r)
d_c = CLArray{Float32}(undef, length(h_C); access = :w)

prg = cl.Program(source=kernel_source) |> cl.build!
prg = cl.Program(source = kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
Expand All @@ -122,12 +125,14 @@ for i in 1:COUNT
# You can enable profiling events on the queue
# by calling the constructor with the :profile flag
cl.queue!(:profile) do
evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size)
evt = clcall(
mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size
)
wait(evt)

# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
run_time = evt.profile_duration / 1.0e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
Expand Down
14 changes: 7 additions & 7 deletions examples/hands_on_opencl/ex07/helper.jl
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
import Base: error

function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where T
cval = Float32(Pdim * AVAL * BVAL)
errsq = 0f0
function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where {T}
cval = Float32(Pdim * AVAL * BVAL)
errsq = 0.0f0
for i in 1:Ndim
for j in 1:Mdim
err = C[(i-1)*Ndim+j] - cval
err = C[(i - 1) * Ndim + j] - cval
errsq += err^2
end
end
return errsq
end

function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where T
mflops = 2.0 * Mdim * Ndim * Pdim/(1000000.0* run_time)
function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where {T}
mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
println("$run_time seconds at $mflops MFLOPS")
errsq = error(Mdim, Ndim, Pdim, C)
if isnan(errsq) || errsq > TOL
return if isnan(errsq) || errsq > TOL
println("Errors in multiplication: $errsq")
end
end
65 changes: 37 additions & 28 deletions examples/hands_on_opencl/ex07/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,20 @@ h_B = fill(Float32(BVAL), sizeB)
h_C = Vector{Float32}(undef, sizeC)

# 20% improvement using @inbounds
function seq_mat_mul_sdot(Mdim::Int, Ndim::Int, Pdim::Int,
A::Array{T}, B::Array{T}, C::Array{T}) where T
function seq_mat_mul_sdot(
Mdim::Int, Ndim::Int, Pdim::Int,
A::Array{T}, B::Array{T}, C::Array{T}
) where {T}
for i in 1:Ndim
for j in 1:Mdim
tmp = zero(Float32)
for k in 1:Pdim
@inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
@inbounds tmp += A[(i - 1) * Ndim + k] * B[(k - 1) * Pdim + j]
end
@inbounds C[(i-1)*Ndim+j] = tmp
@inbounds C[(i - 1) * Ndim + j] = tmp
end
end
return
end

@info("=== Julia, matix mult (dot prod), order $ORDER ===")
Expand All @@ -86,29 +89,31 @@ for i in 1:COUNT
end

# create OpenCL array
d_a = CLArray(h_A; access=:r)
d_b = CLArray(h_B; access=:r)
d_c = CLArray{Float32}(undef, length(h_C); access=:w)
d_a = CLArray(h_A; access = :r)
d_b = CLArray(h_B; access = :r)
d_c = CLArray{Float32}(undef, length(h_C); access = :w)

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_elem.cl"), String)
prg = cl.Program(source=kernel_source) |> cl.build!
prg = cl.Program(source = kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
cl.queue!(:profile) do
evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size=(Ndim, Mdim))
evt = clcall(
mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size = (Ndim, Mdim)
)
wait(evt)

# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
run_time = evt.profile_duration / 1.0e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
Expand All @@ -119,7 +124,7 @@ end
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_row.cl"), String)
prg = cl.Program(source=kernel_source) |> cl.build!
prg = cl.Program(source = kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
Expand All @@ -130,12 +135,14 @@ for i in 1:COUNT
local_size = (div(ORDER, 16),)

cl.queue!(:profile) do
evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size, local_size)
evt = clcall(
mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size, local_size
)
wait(evt)

# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
run_time = evt.profile_duration / 1.0e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
Expand All @@ -145,27 +152,29 @@ end
# OpenCL matrix multiplication ... C row per work item, A row in private memory
#--------------------------------------------------------------------------------
kernel_source = read(joinpath(src_dir, "C_row_priv.cl"), String)
prg = cl.Program(source=kernel_source) |> cl.build!
prg = cl.Program(source = kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")
wk_size = cl.device().max_work_group_size
if Ndim * (ORDER ÷ 16) >= wk_size
@warn("Specified work_size $(Ndim * (ORDER ÷ 16)) is bigger than $wk_size")
else

@info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")
@info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
for i in 1:COUNT
fill!(h_C, 0.0)

cl.queue!(:profile) do
evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size=Ndim, local_size=ORDER)
wait(evt)
cl.queue!(:profile) do
evt = clcall(
mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size = Ndim, local_size = ORDER
)
wait(evt)

# profiling events are measured in ns
run_time = evt.profile_duration / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
# profiling events are measured in ns
run_time = evt.profile_duration / 1.0e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end
end
end
Loading

0 comments on commit 4ac21c7

Please sign in to comment.