
Conversation

@christiangnrd (Member)

[only tests]
[only benchmarks]

@github-actions bot (Contributor) commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.

Suggested changes:
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 1ad6cc116..db94f3d36 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -197,7 +197,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4865baa36..f7740b61d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed
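
For context on the API whose formatting is touched above: `KI.kernel_function` wraps `cufunction`, and the resulting `KI.Kernel{CUDABackend}` is launched with `numworkgroups`/`workgroupsize` keywords that the wrapper forwards to CUDA's `blocks`/`threads`. The sketch below is illustrative only and not code from this PR; it assumes `KI` resolves to the KernelIntrinsics module that CUDAKernels.jl aliases, and the kernel body and launch shape are made up for the example.

```julia
using CUDA
# import KernelAbstractions: KernelIntrinsics as KI   # assumed alias; adjust to however KI is exposed

# Hypothetical toy kernel using the indexing intrinsic from the diff:
# KI.get_global_id().x is the 1-based global thread index.
function fill_kernel(a, val)
    i = KI.get_global_id().x
    if i <= length(a)
        @inbounds a[i] = val
    end
    return
end

a = CUDA.zeros(Float32, 1024)

# Compile through the KI wrapper; `tt` holds the device-side argument types,
# obtained here via cudaconvert (which is what KI.argconvert does for CUDABackend).
tt = Tuple{typeof(cudaconvert(a)), Float32}
kern = KI.kernel_function(CUDABackend(), fill_kernel, tt)

# Launch: workgroupsize/numworkgroups map onto threads/blocks in the wrapper.
kern(a, 1.0f0; workgroupsize = 256, numworkgroups = cld(length(a), 256))
```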

@github-actions bot (Contributor) left a comment


CUDA.jl Benchmarks

| Benchmark suite | Current: 180f4a5 | Previous: 1d1be49 | Ratio |
| --- | --- | --- | --- |
| latency/precompile | 65360103745 ns | 56764175729.5 ns | 1.15 |
| latency/ttfp | 8508210510.5 ns | 8338765591 ns | 1.02 |
| latency/import | 4685973409 ns | 4497026184 ns | 1.04 |
| integration/volumerhs | 9609320.5 ns | 9623403 ns | 1.00 |
| integration/byval/slices=1 | 146644 ns | 147327 ns | 1.00 |
| integration/byval/slices=3 | 425680 ns | 426202 ns | 1.00 |
| integration/byval/reference | 145025.5 ns | 145288 ns | 1.00 |
| integration/byval/slices=2 | 286085 ns | 286646 ns | 1.00 |
| integration/cudadevrt | 103528 ns | 103542 ns | 1.00 |
| kernel/indexing | 14258 ns | 14224 ns | 1.00 |
| kernel/indexing_checked | 14929.5 ns | 15126 ns | 0.99 |
| kernel/occupancy | 670.1582278481013 ns | 670.1301369863014 ns | 1.00 |
| kernel/launch | 2136.4 ns | 2124.4 ns | 1.01 |
| kernel/rand | 14792 ns | 14865 ns | 1.00 |
| array/reverse/1d | 19790 ns | 19923 ns | 0.99 |
| array/reverse/2dL_inplace | 66720 ns | 66890 ns | 1.00 |
| array/reverse/1dL | 69874 ns | 70173 ns | 1.00 |
| array/reverse/2d | 21844 ns | 21888 ns | 1.00 |
| array/reverse/1d_inplace | 11348 ns | 9715 ns | 1.17 |
| array/reverse/2d_inplace | 13262 ns | 13402 ns | 0.99 |
| array/reverse/2dL | 73796.5 ns | 73837 ns | 1.00 |
| array/reverse/1dL_inplace | 66709 ns | 66895 ns | 1.00 |
| array/copy | 20939 ns | 20939 ns | 1 |
| array/iteration/findall/int | 159242 ns | 158678 ns | 1.00 |
| array/iteration/findall/bool | 141881 ns | 140617 ns | 1.01 |
| array/iteration/findfirst/int | 161153 ns | 162792 ns | 0.99 |
| array/iteration/findfirst/bool | 161312.5 ns | 163326 ns | 0.99 |
| array/iteration/scalar | 70538 ns | 73757.5 ns | 0.96 |
| array/iteration/logical | 217032.5 ns | 218431.5 ns | 0.99 |
| array/iteration/findmin/1d | 49861.5 ns | 51927 ns | 0.96 |
| array/iteration/findmin/2d | 97019 ns | 96989 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 48000 ns | 44257 ns | 1.08 |
| array/reductions/reduce/Int64/dims=1 | 49800 ns | 50794 ns | 0.98 |
| array/reductions/reduce/Int64/dims=2 | 68982 ns | 61630 ns | 1.12 |
| array/reductions/reduce/Int64/dims=1L | 88990 ns | 89077 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 90220 ns | 88486.5 ns | 1.02 |
| array/reductions/reduce/Float32/1d | 36714 ns | 37397.5 ns | 0.98 |
| array/reductions/reduce/Float32/dims=1 | 47222.5 ns | 41826.5 ns | 1.13 |
| array/reductions/reduce/Float32/dims=2 | 64405 ns | 59832 ns | 1.08 |
| array/reductions/reduce/Float32/dims=1L | 53545.5 ns | 52417 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 73889.5 ns | 72316 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 46259.5 ns | 43845 ns | 1.06 |
| array/reductions/mapreduce/Int64/dims=1 | 49456.5 ns | 45366.5 ns | 1.09 |
| array/reductions/mapreduce/Int64/dims=2 | 68910 ns | 61499 ns | 1.12 |
| array/reductions/mapreduce/Int64/dims=1L | 88985.5 ns | 88928 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 90437 ns | 88227 ns | 1.03 |
| array/reductions/mapreduce/Float32/1d | 36765 ns | 37393.5 ns | 0.98 |
| array/reductions/mapreduce/Float32/dims=1 | 48192 ns | 50536 ns | 0.95 |
| array/reductions/mapreduce/Float32/dims=2 | 62947 ns | 59948 ns | 1.05 |
| array/reductions/mapreduce/Float32/dims=1L | 53879 ns | 52655 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 73500 ns | 72123.5 ns | 1.02 |
| array/broadcast | 19952 ns | 20094 ns | 0.99 |
| array/copyto!/gpu_to_gpu | 11454 ns | 12939 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 215164 ns | 218766.5 ns | 0.98 |
| array/copyto!/gpu_to_cpu | 282301 ns | 286506 ns | 0.99 |
| array/accumulate/Int64/1d | 127397 ns | 125248 ns | 1.02 |
| array/accumulate/Int64/dims=1 | 85196 ns | 83707 ns | 1.02 |
| array/accumulate/Int64/dims=2 | 158769 ns | 157937.5 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1795083 ns | 1710672 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 973567 ns | 967046.5 ns | 1.01 |
| array/accumulate/Float32/1d | 111567 ns | 109958 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81756 ns | 80681 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 149503.5 ns | 147571.5 ns | 1.01 |
| array/accumulate/Float32/dims=1L | 1713669 ns | 1619100 ns | 1.06 |
| array/accumulate/Float32/dims=2L | 715021 ns | 698632 ns | 1.02 |
| array/construct | 1247.6 ns | 1292.8 ns | 0.97 |
| array/random/randn/Float32 | 45613.5 ns | 45085.5 ns | 1.01 |
| array/random/randn!/Float32 | 25126 ns | 25182 ns | 1.00 |
| array/random/rand!/Int64 | 27413 ns | 27387 ns | 1.00 |
| array/random/rand!/Float32 | 8706 ns | 8796.333333333334 ns | 0.99 |
| array/random/rand/Int64 | 30653.5 ns | 29906 ns | 1.02 |
| array/random/rand/Float32 | 12880 ns | 13172 ns | 0.98 |
| array/permutedims/4d | 52750 ns | 55451 ns | 0.95 |
| array/permutedims/2d | 54303.5 ns | 54366.5 ns | 1.00 |
| array/permutedims/3d | 54265 ns | 55262 ns | 0.98 |
| array/sorting/1d | 2759316.5 ns | 2758900.5 ns | 1.00 |
| array/sorting/by | 3368651.5 ns | 3345477 ns | 1.01 |
| array/sorting/2d | 1088246 ns | 1082617 ns | 1.01 |
| cuda/synchronization/stream/auto | 1020.5 ns | 1052 ns | 0.97 |
| cuda/synchronization/stream/nonblocking | 7056.8 ns | 7628.6 ns | 0.93 |
| cuda/synchronization/stream/blocking | 817.49 ns | 835.3333333333334 ns | 0.98 |
| cuda/synchronization/context/auto | 1158.9 ns | 1173.3 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 8459.8 ns | 8290.1 ns | 1.02 |
| cuda/synchronization/context/blocking | 887.6607142857143 ns | 927.025 ns | 0.96 |

This comment was automatically generated by a workflow using github-action-benchmark.
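
In the table above, Ratio is the current time divided by the previous time, so values above 1.0 mean this commit measured slower on that benchmark. As a throwaway Julia sketch (not part of the benchmark workflow) of how one might flag such rows, with the 1.1 cutoff and the hard-coded sample rows being assumptions for illustration only:

```julia
# Illustrative only: (name, current_ns, previous_ns) for a few rows of the table.
results = [
    ("latency/precompile",       6.5360103745e10, 5.67641757295e10),
    ("array/reverse/1d_inplace", 11348.0,         9715.0),
    ("array/copyto!/gpu_to_gpu", 11454.0,         12939.0),
]

threshold = 1.1   # assumed cutoff, not project policy
for (name, current, previous) in results
    ratio = current / previous
    ratio > threshold && println("possible regression: $name (ratio ≈ $(round(ratio; digits = 2)))")
end
```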

@christiangnrd force-pushed the intrinsics branch 2 times, most recently from aef3728 to fef539a on November 6, 2025 at 14:51
@christiangnrd force-pushed the intrinsics branch 5 times, most recently from 9219357 to e2d7489 on November 18, 2025 at 03:15