apply runic formatting

JuliaGPU · Jan 13, 2025 · 4ac21c7 · 4ac21c7
1 parent b4fffad
commit 4ac21c7
Show file tree

Hide file tree

Showing 68 changed files with 3,856 additions and 2,974 deletions.
diff --git a/examples/hands_on_opencl/ex04/vadd_chain.jl b/examples/hands_on_opencl/ex04/vadd_chain.jl
@@ -13,7 +13,7 @@
 using OpenCL
 
 # tolerance used in floating point comparisons
-TOL = 1e-3
+TOL = 1.0e-3
 
 # length of vectors a, b, c
 LENGTH = 1024
@@ -41,7 +41,7 @@ __kernel void vadd(
 # create a compute context
 
 # create the compute program and build it
-program = cl.Program(source=kernelsource) |> cl.build!
+program = cl.Program(source = kernelsource) |> cl.build!
 
 #create a, b, e, and g vectors and fill with random float values
 #create empty vectors for c, d, and f
@@ -62,14 +62,14 @@ h_g = rand(Float32, LENGTH)
 # {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
 
 # Create the input (a, b, e, g) arrays in device memory and copy data from host
-d_a = CLArray(h_a; access=:r)
-d_b = CLArray(h_b; access=:r)
-d_e = CLArray(h_e; access=:r)
-d_g = CLArray(h_g; access=:r)
+d_a = CLArray(h_a; access = :r)
+d_b = CLArray(h_b; access = :r)
+d_e = CLArray(h_e; access = :r)
+d_g = CLArray(h_g; access = :r)
 # Create the output (c, d, f) array in device memory
-d_c = CLArray{Float32}(undef, LENGTH; access=:w)
-d_d = CLArray{Float32}(undef, LENGTH; access=:w)
-d_f = CLArray{Float32}(undef, LENGTH; access=:w)
+d_c = CLArray{Float32}(undef, LENGTH; access = :w)
+d_d = CLArray{Float32}(undef, LENGTH; access = :w)
+d_f = CLArray{Float32}(undef, LENGTH; access = :w)
 
 # create the kernel
 vadd = cl.Kernel(program, "vadd")
@@ -81,12 +81,18 @@ vadd = cl.Kernel(program, "vadd")
 # here we call the kernel with work size set to the number of elements and no local
 # work size. This enables the opencl runtime to optimize the local size for simple
 # kernels
-clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
-       d_a, d_b, d_c, LENGTH; global_size=size(h_a))
-clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
-       d_e, d_c, d_d, LENGTH; global_size=size(h_e))
-clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
-       d_g, d_d, d_f, LENGTH; global_size=size(h_g))
+clcall(
+    vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
+    d_a, d_b, d_c, LENGTH; global_size = size(h_a)
+)
+clcall(
+    vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
+    d_e, d_c, d_d, LENGTH; global_size = size(h_e)
+)
+clcall(
+    vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
+    d_g, d_d, d_f, LENGTH; global_size = size(h_g)
+)
 
 # copy back the results from the compute device
 # copy!(queue, dst, src) follows same interface as julia's built in copy!
@@ -100,8 +106,10 @@ for i in 1:LENGTH
     if tmp^2 < TOL^2
         global correct += 1
     else
-        println("tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
-                "h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])")
+        println(
+            "tmp $tmp h_a $(h_a[i]) h_b $(h_b[i]) ",
+            "h_e $(h_e[i]) h_g $(h_g[i]) h_f $(h_f[i])"
+        )
     end
 end
 

diff --git a/examples/hands_on_opencl/ex05/vadd_abc.jl b/examples/hands_on_opencl/ex05/vadd_abc.jl
@@ -13,7 +13,7 @@
 using OpenCL
 
 # tolerance used in floating point comparisons
-TOL = 1e-3
+TOL = 1.0e-3
 
 # length of vectors a, b, c
 LENGTH = 1024
@@ -39,27 +39,29 @@ __kernel void vadd(
 }"
 
 # create the compute program and build it
-program = cl.Program(source=kernelsource) |> cl.build!
+program = cl.Program(source = kernelsource) |> cl.build!
 
 # create a, b and c vectors and fill with random float values
 # (the result array will be created when reading back from the device)
 h_a = rand(Float32, LENGTH)
 h_b = rand(Float32, LENGTH)
 h_c = rand(Float32, LENGTH)
 
-d_a = CLArray(h_a; access=:r)
-d_b = CLArray(h_b; access=:r)
-d_c = CLArray(h_c; access=:r)
+d_a = CLArray(h_a; access = :r)
+d_b = CLArray(h_b; access = :r)
+d_c = CLArray(h_c; access = :r)
 
 # create the output (r) buffer in device memory
-d_r = CLArray{Float32}(undef, LENGTH; access=:w)
+d_r = CLArray{Float32}(undef, LENGTH; access = :w)
 
 # create the kernel
 vadd = cl.Kernel(program, "vadd")
 
 # execute the kernel over the entire range of the input
-clcall(vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
-       d_a, d_b, d_c, d_r, UInt32(LENGTH); global_size=size(h_a))
+clcall(
+    vadd, Tuple{Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cuint},
+    d_a, d_b, d_c, d_r, UInt32(LENGTH); global_size = size(h_a)
+)
 
 # read the results back from the compute device
 h_r = Array(d_r)

diff --git a/examples/hands_on_opencl/ex06/helper.jl b/examples/hands_on_opencl/ex06/helper.jl
@@ -1,22 +1,22 @@
 import Base: error
 
-function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where T
-    cval  = Float32(Pdim * AVAL * BVAL)
-    errsq = 0f0
+function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where {T}
+    cval = Float32(Pdim * AVAL * BVAL)
+    errsq = 0.0f0
     for i in 1:Ndim
         for j in 1:Mdim
-            err = C[(i-1)*Ndim+j] - cval
+            err = C[(i - 1) * Ndim + j] - cval
             errsq += err^2
         end
     end
     return errsq
 end
 
-function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where T
-    mflops = 2.0 * Mdim * Ndim * Pdim/(1000000.0* run_time)
+function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where {T}
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
     println("$run_time seconds at $mflops MFLOPS")
     errsq = error(Mdim, Ndim, Pdim, C)
-    if isnan(errsq) || errsq > TOL
+    return if isnan(errsq) || errsq > TOL
         println("Errors in multiplication: $errsq")
     end
 end
diff --git a/examples/hands_on_opencl/ex06/matmul.jl b/examples/hands_on_opencl/ex06/matmul.jl
@@ -78,17 +78,20 @@ h_B = fill(Float32(BVAL), sizeB)
 h_C = Vector{Float32}(undef, sizeC)
 
 # 20% improvement using @inbounds
-function seq_mat_mul_sdot(Mdim::Int, Ndim::Int, Pdim::Int,
-                          A::Array{T}, B::Array{T}, C::Array{T}) where T
+function seq_mat_mul_sdot(
+        Mdim::Int, Ndim::Int, Pdim::Int,
+        A::Array{T}, B::Array{T}, C::Array{T}
+    ) where {T}
     for i in 1:Ndim
         for j in 1:Mdim
             tmp = zero(Float32)
             for k in 1:Pdim
-                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+                @inbounds tmp += A[(i - 1) * Ndim + k] * B[(k - 1) * Pdim + j]
             end
-            @inbounds C[(i-1)*Ndim+j] = tmp
+            @inbounds C[(i - 1) * Ndim + j] = tmp
         end
     end
+    return
 end
 
 @info("=== Julia, matix mult (dot prod), order $ORDER ===")
@@ -105,11 +108,11 @@ for i in 1:COUNT
 end
 
 # create OpenCL arrays
-d_a = CLArray(h_A; access=:r)
-d_b = CLArray(h_B; access=:r)
-d_c = CLArray{Float32}(undef, length(h_C); access=:w)
+d_a = CLArray(h_A; access = :r)
+d_b = CLArray(h_B; access = :r)
+d_c = CLArray{Float32}(undef, length(h_C); access = :w)
 
-prg  = cl.Program(source=kernel_source) |> cl.build!
+prg = cl.Program(source = kernel_source) |> cl.build!
 mmul = cl.Kernel(prg, "mmul")
 
 @info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
@@ -122,12 +125,14 @@ for i in 1:COUNT
     # You can enable profiling events on the queue
     # by calling the constructor with the :profile flag
     cl.queue!(:profile) do
-        evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
-                     Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size)
+        evt = clcall(
+            mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
+            Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size
+        )
         wait(evt)
 
         # profiling events are measured in ns
-        run_time = evt.profile_duration / 1e9
+        run_time = evt.profile_duration / 1.0e9
         cl.copy!(h_C, d_c)
         results(Mdim, Ndim, Pdim, h_C, run_time)
     end

diff --git a/examples/hands_on_opencl/ex07/helper.jl b/examples/hands_on_opencl/ex07/helper.jl
@@ -1,22 +1,22 @@
 import Base: error
 
-function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where T
-    cval  = Float32(Pdim * AVAL * BVAL)
-    errsq = 0f0
+function error(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}) where {T}
+    cval = Float32(Pdim * AVAL * BVAL)
+    errsq = 0.0f0
     for i in 1:Ndim
         for j in 1:Mdim
-            err = C[(i-1)*Ndim+j] - cval
+            err = C[(i - 1) * Ndim + j] - cval
             errsq += err^2
         end
     end
     return errsq
 end
 
-function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where T
-    mflops = 2.0 * Mdim * Ndim * Pdim/(1000000.0* run_time)
+function results(Mdim::Int, Ndim::Int, Pdim::Int, C::Array{T}, run_time) where {T}
+    mflops = 2.0 * Mdim * Ndim * Pdim / (1000000.0 * run_time)
     println("$run_time seconds at $mflops MFLOPS")
     errsq = error(Mdim, Ndim, Pdim, C)
-    if isnan(errsq) || errsq > TOL
+    return if isnan(errsq) || errsq > TOL
         println("Errors in multiplication: $errsq")
     end
 end
diff --git a/examples/hands_on_opencl/ex07/matmul.jl b/examples/hands_on_opencl/ex07/matmul.jl
@@ -59,17 +59,20 @@ h_B = fill(Float32(BVAL), sizeB)
 h_C = Vector{Float32}(undef, sizeC)
 
 # 20% improvement using @inbounds
-function seq_mat_mul_sdot(Mdim::Int, Ndim::Int, Pdim::Int,
-                          A::Array{T}, B::Array{T}, C::Array{T}) where T
+function seq_mat_mul_sdot(
+        Mdim::Int, Ndim::Int, Pdim::Int,
+        A::Array{T}, B::Array{T}, C::Array{T}
+    ) where {T}
     for i in 1:Ndim
         for j in 1:Mdim
             tmp = zero(Float32)
             for k in 1:Pdim
-                @inbounds tmp += A[(i-1)*Ndim+k] * B[(k-1)*Pdim+j]
+                @inbounds tmp += A[(i - 1) * Ndim + k] * B[(k - 1) * Pdim + j]
             end
-            @inbounds C[(i-1)*Ndim+j] = tmp
+            @inbounds C[(i - 1) * Ndim + j] = tmp
         end
     end
+    return
 end
 
 @info("=== Julia, matix mult (dot prod), order $ORDER ===")
@@ -86,29 +89,31 @@ for i in 1:COUNT
 end
 
 # create OpenCL array
-d_a = CLArray(h_A; access=:r)
-d_b = CLArray(h_B; access=:r)
-d_c = CLArray{Float32}(undef, length(h_C); access=:w)
+d_a = CLArray(h_A; access = :r)
+d_b = CLArray(h_B; access = :r)
+d_c = CLArray{Float32}(undef, length(h_C); access = :w)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... Naive
 #--------------------------------------------------------------------------------
 
 kernel_source = read(joinpath(src_dir, "C_elem.cl"), String)
-prg  = cl.Program(source=kernel_source) |> cl.build!
+prg = cl.Program(source = kernel_source) |> cl.build!
 mmul = cl.Kernel(prg, "mmul")
 
 @info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
 
 for i in 1:COUNT
     fill!(h_C, 0.0)
     cl.queue!(:profile) do
-        evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
-                     Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size=(Ndim, Mdim))
+        evt = clcall(
+            mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
+            Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size = (Ndim, Mdim)
+        )
         wait(evt)
 
         # profiling events are measured in ns
-        run_time = evt.profile_duration / 1e9
+        run_time = evt.profile_duration / 1.0e9
         cl.copy!(h_C, d_c)
         results(Mdim, Ndim, Pdim, h_C, run_time)
     end
@@ -119,7 +124,7 @@ end
 #--------------------------------------------------------------------------------
 
 kernel_source = read(joinpath(src_dir, "C_row.cl"), String)
-prg  = cl.Program(source=kernel_source) |> cl.build!
+prg = cl.Program(source = kernel_source) |> cl.build!
 mmul = cl.Kernel(prg, "mmul")
 
 @info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")
@@ -130,12 +135,14 @@ for i in 1:COUNT
     local_size = (div(ORDER, 16),)
 
     cl.queue!(:profile) do
-        evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
-                     Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size, local_size)
+        evt = clcall(
+            mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
+            Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size, local_size
+        )
         wait(evt)
 
         # profiling events are measured in ns
-        run_time = evt.profile_duration / 1e9
+        run_time = evt.profile_duration / 1.0e9
         cl.copy!(h_C, d_c)
         results(Mdim, Ndim, Pdim, h_C, run_time)
     end
@@ -145,27 +152,29 @@ end
 # OpenCL matrix multiplication ... C row per work item, A row in private memory
 #--------------------------------------------------------------------------------
 kernel_source = read(joinpath(src_dir, "C_row_priv.cl"), String)
-prg  = cl.Program(source=kernel_source) |> cl.build!
+prg = cl.Program(source = kernel_source) |> cl.build!
 mmul = cl.Kernel(prg, "mmul")
 wk_size = cl.device().max_work_group_size
 if Ndim * (ORDER ÷ 16) >= wk_size
     @warn("Specified work_size $(Ndim * (ORDER ÷ 16)) is bigger than $wk_size")
 else
 
-@info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")
+    @info("=== OpenCL, matrix mult, C row, A row in priv mem, order $Ndim ====")
 
-for i in 1:COUNT
-    fill!(h_C, 0.0)
+    for i in 1:COUNT
+        fill!(h_C, 0.0)
 
-    cl.queue!(:profile) do
-        evt = clcall(mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
-                     Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size=Ndim, local_size=ORDER)
-        wait(evt)
+        cl.queue!(:profile) do
+            evt = clcall(
+                mmul, Tuple{Int32, Int32, Int32, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}},
+                Mdim, Ndim, Pdim, d_a, d_b, d_c; global_size = Ndim, local_size = ORDER
+            )
+            wait(evt)
 
-        # profiling events are measured in ns
-        run_time = evt.profile_duration / 1e9
-        cl.copy!(h_C, d_c)
-        results(Mdim, Ndim, Pdim, h_C, run_time)
+            # profiling events are measured in ns
+            run_time = evt.profile_duration / 1.0e9
+            cl.copy!(h_C, d_c)
+            results(Mdim, Ndim, Pdim, h_C, run_time)
+        end
     end
 end
-end