
Conversation

@christiangnrd (Member)

[only tests]
[only benchmarks]

@github-actions bot (Contributor) commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.

Suggested changes:
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 1ad6cc116..db94f3d36 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -197,7 +197,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4865baa36..f7740b61d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed
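
For context on the API whose formatting is touched above: `KI.kernel_function` wraps `cufunction`, and the resulting `KI.Kernel{CUDABackend}` is launched with `numworkgroups`/`workgroupsize` keywords that the wrapper forwards to CUDA's `blocks`/`threads`. The sketch below is illustrative only and not code from this PR; it assumes `KI` resolves to the KernelIntrinsics module that CUDAKernels.jl aliases, and the kernel body and launch shape are made up for the example.

```julia
using CUDA
# import KernelAbstractions: KernelIntrinsics as KI   # assumed alias; adjust to however KI is exposed

# Hypothetical toy kernel using the indexing intrinsic from the diff:
# KI.get_global_id().x is the 1-based global thread index.
function fill_kernel(a, val)
    i = KI.get_global_id().x
    if i <= length(a)
        @inbounds a[i] = val
    end
    return
end

a = CUDA.zeros(Float32, 1024)

# Compile through the KI wrapper; `tt` holds the device-side argument types,
# obtained here via cudaconvert (which is what KI.argconvert does for CUDABackend).
tt = Tuple{typeof(cudaconvert(a)), Float32}
kern = KI.kernel_function(CUDABackend(), fill_kernel, tt)

# Launch: workgroupsize/numworkgroups map onto threads/blocks in the wrapper.
kern(a, 1.0f0; workgroupsize = 256, numworkgroups = cld(length(a), 256))
```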

@github-actions bot (Contributor) left a comment


CUDA.jl Benchmarks

| Benchmark suite | Current: 180f4a5 | Previous: 1d1be49 | Ratio |
| --- | --- | --- | --- |
| latency/precompile | 65360103745 ns | 56764175729.5 ns | 1.15 |
| latency/ttfp | 8508210510.5 ns | 8338765591 ns | 1.02 |
| latency/import | 4685973409 ns | 4497026184 ns | 1.04 |
| integration/volumerhs | 9609320.5 ns | 9623403 ns | 1.00 |
| integration/byval/slices=1 | 146644 ns | 147327 ns | 1.00 |
| integration/byval/slices=3 | 425680 ns | 426202 ns | 1.00 |
| integration/byval/reference | 145025.5 ns | 145288 ns | 1.00 |
| integration/byval/slices=2 | 286085 ns | 286646 ns | 1.00 |
| integration/cudadevrt | 103528 ns | 103542 ns | 1.00 |
| kernel/indexing | 14258 ns | 14224 ns | 1.00 |
| kernel/indexing_checked | 14929.5 ns | 15126 ns | 0.99 |
| kernel/occupancy | 670.1582278481013 ns | 670.1301369863014 ns | 1.00 |
| kernel/launch | 2136.4 ns | 2124.4 ns | 1.01 |
| kernel/rand | 14792 ns | 14865 ns | 1.00 |
| array/reverse/1d | 19790 ns | 19923 ns | 0.99 |
| array/reverse/2dL_inplace | 66720 ns | 66890 ns | 1.00 |
| array/reverse/1dL | 69874 ns | 70173 ns | 1.00 |
| array/reverse/2d | 21844 ns | 21888 ns | 1.00 |
| array/reverse/1d_inplace | 11348 ns | 9715 ns | 1.17 |
| array/reverse/2d_inplace | 13262 ns | 13402 ns | 0.99 |
| array/reverse/2dL | 73796.5 ns | 73837 ns | 1.00 |
| array/reverse/1dL_inplace | 66709 ns | 66895 ns | 1.00 |
| array/copy | 20939 ns | 20939 ns | 1 |
| array/iteration/findall/int | 159242 ns | 158678 ns | 1.00 |
| array/iteration/findall/bool | 141881 ns | 140617 ns | 1.01 |
| array/iteration/findfirst/int | 161153 ns | 162792 ns | 0.99 |
| array/iteration/findfirst/bool | 161312.5 ns | 163326 ns | 0.99 |
| array/iteration/scalar | 70538 ns | 73757.5 ns | 0.96 |
| array/iteration/logical | 217032.5 ns | 218431.5 ns | 0.99 |
| array/iteration/findmin/1d | 49861.5 ns | 51927 ns | 0.96 |
| array/iteration/findmin/2d | 97019 ns | 96989 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 48000 ns | 44257 ns | 1.08 |
| array/reductions/reduce/Int64/dims=1 | 49800 ns | 50794 ns | 0.98 |
| array/reductions/reduce/Int64/dims=2 | 68982 ns | 61630 ns | 1.12 |
| array/reductions/reduce/Int64/dims=1L | 88990 ns | 89077 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 90220 ns | 88486.5 ns | 1.02 |
| array/reductions/reduce/Float32/1d | 36714 ns | 37397.5 ns | 0.98 |
| array/reductions/reduce/Float32/dims=1 | 47222.5 ns | 41826.5 ns | 1.13 |
| array/reductions/reduce/Float32/dims=2 | 64405 ns | 59832 ns | 1.08 |
| array/reductions/reduce/Float32/dims=1L | 53545.5 ns | 52417 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 73889.5 ns | 72316 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 46259.5 ns | 43845 ns | 1.06 |
| array/reductions/mapreduce/Int64/dims=1 | 49456.5 ns | 45366.5 ns | 1.09 |
| array/reductions/mapreduce/Int64/dims=2 | 68910 ns | 61499 ns | 1.12 |
| array/reductions/mapreduce/Int64/dims=1L | 88985.5 ns | 88928 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 90437 ns | 88227 ns | 1.03 |
| array/reductions/mapreduce/Float32/1d | 36765 ns | 37393.5 ns | 0.98 |
| array/reductions/mapreduce/Float32/dims=1 | 48192 ns | 50536 ns | 0.95 |
| array/reductions/mapreduce/Float32/dims=2 | 62947 ns | 59948 ns | 1.05 |
| array/reductions/mapreduce/Float32/dims=1L | 53879 ns | 52655 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 73500 ns | 72123.5 ns | 1.02 |
| array/broadcast | 19952 ns | 20094 ns | 0.99 |
| array/copyto!/gpu_to_gpu | 11454 ns | 12939 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 215164 ns | 218766.5 ns | 0.98 |
| array/copyto!/gpu_to_cpu | 282301 ns | 286506 ns | 0.99 |
| array/accumulate/Int64/1d | 127397 ns | 125248 ns | 1.02 |
| array/accumulate/Int64/dims=1 | 85196 ns | 83707 ns | 1.02 |
| array/accumulate/Int64/dims=2 | 158769 ns | 157937.5 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1795083 ns | 1710672 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 973567 ns | 967046.5 ns | 1.01 |
| array/accumulate/Float32/1d | 111567 ns | 109958 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81756 ns | 80681 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 149503.5 ns | 147571.5 ns | 1.01 |
| array/accumulate/Float32/dims=1L | 1713669 ns | 1619100 ns | 1.06 |
| array/accumulate/Float32/dims=2L | 715021 ns | 698632 ns | 1.02 |
| array/construct | 1247.6 ns | 1292.8 ns | 0.97 |
| array/random/randn/Float32 | 45613.5 ns | 45085.5 ns | 1.01 |
| array/random/randn!/Float32 | 25126 ns | 25182 ns | 1.00 |
| array/random/rand!/Int64 | 27413 ns | 27387 ns | 1.00 |
| array/random/rand!/Float32 | 8706 ns | 8796.333333333334 ns | 0.99 |
| array/random/rand/Int64 | 30653.5 ns | 29906 ns | 1.02 |
| array/random/rand/Float32 | 12880 ns | 13172 ns | 0.98 |
| array/permutedims/4d | 52750 ns | 55451 ns | 0.95 |
| array/permutedims/2d | 54303.5 ns | 54366.5 ns | 1.00 |
| array/permutedims/3d | 54265 ns | 55262 ns | 0.98 |
| array/sorting/1d | 2759316.5 ns | 2758900.5 ns | 1.00 |
| array/sorting/by | 3368651.5 ns | 3345477 ns | 1.01 |
| array/sorting/2d | 1088246 ns | 1082617 ns | 1.01 |
| cuda/synchronization/stream/auto | 1020.5 ns | 1052 ns | 0.97 |
| cuda/synchronization/stream/nonblocking | 7056.8 ns | 7628.6 ns | 0.93 |
| cuda/synchronization/stream/blocking | 817.49 ns | 835.3333333333334 ns | 0.98 |
| cuda/synchronization/context/auto | 1158.9 ns | 1173.3 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 8459.8 ns | 8290.1 ns | 1.02 |
| cuda/synchronization/context/blocking | 887.6607142857143 ns | 927.025 ns | 0.96 |

This comment was automatically generated by a workflow using github-action-benchmark.
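
In the table above, Ratio is the current time divided by the previous time, so values above 1.0 mean this commit measured slower on that benchmark. As a throwaway Julia sketch (not part of the benchmark workflow) of how one might flag such rows, with the 1.1 cutoff and the hard-coded sample rows being assumptions for illustration only:

```julia
# Illustrative only: (name, current_ns, previous_ns) for a few rows of the table.
results = [
    ("latency/precompile",       6.5360103745e10, 5.67641757295e10),
    ("array/reverse/1d_inplace", 11348.0,         9715.0),
    ("array/copyto!/gpu_to_gpu", 11454.0,         12939.0),
]

threshold = 1.1   # assumed cutoff, not project policy
for (name, current, previous) in results
    ratio = current / previous
    ratio > threshold && println("possible regression: $name (ratio ≈ $(round(ratio; digits = 2)))")
end
```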

@christiangnrd force-pushed the intrinsics branch 2 times, most recently from aef3728 to fef539a on November 6, 2025 at 14:51
@christiangnrd force-pushed the intrinsics branch 5 times, most recently from 9219357 to e2d7489 on November 18, 2025 at 03:15