[Do not merge] Test KernelIntrinsics #2944
Open
christiangnrd wants to merge 3 commits into JuliaGPU:master from christiangnrd:intrinsics
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

```diff
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 1ad6cc116..db94f3d36 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,29 +160,29 @@ end
KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = cufunction(f, tt; name, kwargs...)
- KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+ return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end
function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+ obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
return nothing
end
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
kernel_config = launch_configuration(kernel.kern.fun)
- Int(min(kernel_config.threads, max_work_items))
+ return Int(min(kernel_config.threads, max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end
## indexing
@@ -197,7 +197,7 @@ end
end
@device_override @inline function KI.get_global_id()
- return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+ return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
end
@device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
temp = CuDynamicSharedArray(T, (2*threads,))
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
if j > length(Rother)
return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
block = KI.get_group_id().x
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
- threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
- # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+ threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+ )
+ # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
- "CPU synchronization",
- "fallback test: callable types",]))
+Testsuite.testsuite(
+ () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+ [
+ "CPU synchronization",
+ "fallback test: callable types",
+ ]
+ )
+)
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4865baa36..f7740b61d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
 using Distributed
```
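For context, the interface exercised by this PR can be sketched briefly. The following is a minimal, hypothetical example (not part of this PR) of how the KernelIntrinsics entry points touched in the diff above fit together on the CUDA backend; the import path for `KI`, the kernel body, and the array size are assumptions rather than code from this repository.

```julia
# Minimal sketch, not part of this PR. Assumes the KernelIntrinsics interface
# is available as `KI` (the import path below is a guess) and that the
# CUDABackend glue shown in the diff above is loaded.
using CUDA
import KernelAbstractions.KernelIntrinsics as KI   # assumed module path

# Device-side function using the overridden indexing intrinsic:
# per the diff, KI.get_global_id() returns a 1-based (; x, y, z) named tuple.
function fill_with_index!(out)
    i = KI.get_global_id().x
    if i <= length(out)
        @inbounds out[i] = i
    end
    return nothing
end

out = CUDA.zeros(Int, 1024)
backend = CUDABackend()

# KI.argconvert maps to cudaconvert, so the compiled signature uses device types.
dev_out = KI.argconvert(backend, out)
kern = KI.kernel_function(backend, fill_with_index!, Tuple{typeof(dev_out)})

# The kernel object maps workgroupsize/numworkgroups onto CUDA threads/blocks,
# and kernel_max_work_group_size bounds the launch configuration.
wgsize = min(256, KI.kernel_max_work_group_size(kern))
kern(out; workgroupsize = wgsize, numworkgroups = cld(length(out), wgsize))
synchronize()
```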
CUDA.jl Benchmarks
| Benchmark suite | Current: 180f4a5 | Previous: 1d1be49 | Ratio |
|---|---|---|---|
| latency/precompile | 65360103745 ns | 56764175729.5 ns | 1.15 |
| latency/ttfp | 8508210510.5 ns | 8338765591 ns | 1.02 |
| latency/import | 4685973409 ns | 4497026184 ns | 1.04 |
| integration/volumerhs | 9609320.5 ns | 9623403 ns | 1.00 |
| integration/byval/slices=1 | 146644 ns | 147327 ns | 1.00 |
| integration/byval/slices=3 | 425680 ns | 426202 ns | 1.00 |
| integration/byval/reference | 145025.5 ns | 145288 ns | 1.00 |
| integration/byval/slices=2 | 286085 ns | 286646 ns | 1.00 |
| integration/cudadevrt | 103528 ns | 103542 ns | 1.00 |
| kernel/indexing | 14258 ns | 14224 ns | 1.00 |
| kernel/indexing_checked | 14929.5 ns | 15126 ns | 0.99 |
| kernel/occupancy | 670.1582278481013 ns | 670.1301369863014 ns | 1.00 |
| kernel/launch | 2136.4 ns | 2124.4 ns | 1.01 |
| kernel/rand | 14792 ns | 14865 ns | 1.00 |
| array/reverse/1d | 19790 ns | 19923 ns | 0.99 |
| array/reverse/2dL_inplace | 66720 ns | 66890 ns | 1.00 |
| array/reverse/1dL | 69874 ns | 70173 ns | 1.00 |
| array/reverse/2d | 21844 ns | 21888 ns | 1.00 |
| array/reverse/1d_inplace | 11348 ns | 9715 ns | 1.17 |
| array/reverse/2d_inplace | 13262 ns | 13402 ns | 0.99 |
| array/reverse/2dL | 73796.5 ns | 73837 ns | 1.00 |
| array/reverse/1dL_inplace | 66709 ns | 66895 ns | 1.00 |
| array/copy | 20939 ns | 20939 ns | 1 |
| array/iteration/findall/int | 159242 ns | 158678 ns | 1.00 |
| array/iteration/findall/bool | 141881 ns | 140617 ns | 1.01 |
| array/iteration/findfirst/int | 161153 ns | 162792 ns | 0.99 |
| array/iteration/findfirst/bool | 161312.5 ns | 163326 ns | 0.99 |
| array/iteration/scalar | 70538 ns | 73757.5 ns | 0.96 |
| array/iteration/logical | 217032.5 ns | 218431.5 ns | 0.99 |
| array/iteration/findmin/1d | 49861.5 ns | 51927 ns | 0.96 |
| array/iteration/findmin/2d | 97019 ns | 96989 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 48000 ns | 44257 ns | 1.08 |
| array/reductions/reduce/Int64/dims=1 | 49800 ns | 50794 ns | 0.98 |
| array/reductions/reduce/Int64/dims=2 | 68982 ns | 61630 ns | 1.12 |
| array/reductions/reduce/Int64/dims=1L | 88990 ns | 89077 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 90220 ns | 88486.5 ns | 1.02 |
| array/reductions/reduce/Float32/1d | 36714 ns | 37397.5 ns | 0.98 |
| array/reductions/reduce/Float32/dims=1 | 47222.5 ns | 41826.5 ns | 1.13 |
| array/reductions/reduce/Float32/dims=2 | 64405 ns | 59832 ns | 1.08 |
| array/reductions/reduce/Float32/dims=1L | 53545.5 ns | 52417 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 73889.5 ns | 72316 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 46259.5 ns | 43845 ns | 1.06 |
| array/reductions/mapreduce/Int64/dims=1 | 49456.5 ns | 45366.5 ns | 1.09 |
| array/reductions/mapreduce/Int64/dims=2 | 68910 ns | 61499 ns | 1.12 |
| array/reductions/mapreduce/Int64/dims=1L | 88985.5 ns | 88928 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 90437 ns | 88227 ns | 1.03 |
| array/reductions/mapreduce/Float32/1d | 36765 ns | 37393.5 ns | 0.98 |
| array/reductions/mapreduce/Float32/dims=1 | 48192 ns | 50536 ns | 0.95 |
| array/reductions/mapreduce/Float32/dims=2 | 62947 ns | 59948 ns | 1.05 |
| array/reductions/mapreduce/Float32/dims=1L | 53879 ns | 52655 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 73500 ns | 72123.5 ns | 1.02 |
| array/broadcast | 19952 ns | 20094 ns | 0.99 |
| array/copyto!/gpu_to_gpu | 11454 ns | 12939 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 215164 ns | 218766.5 ns | 0.98 |
| array/copyto!/gpu_to_cpu | 282301 ns | 286506 ns | 0.99 |
| array/accumulate/Int64/1d | 127397 ns | 125248 ns | 1.02 |
| array/accumulate/Int64/dims=1 | 85196 ns | 83707 ns | 1.02 |
| array/accumulate/Int64/dims=2 | 158769 ns | 157937.5 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1795083 ns | 1710672 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 973567 ns | 967046.5 ns | 1.01 |
| array/accumulate/Float32/1d | 111567 ns | 109958 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81756 ns | 80681 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 149503.5 ns | 147571.5 ns | 1.01 |
| array/accumulate/Float32/dims=1L | 1713669 ns | 1619100 ns | 1.06 |
| array/accumulate/Float32/dims=2L | 715021 ns | 698632 ns | 1.02 |
| array/construct | 1247.6 ns | 1292.8 ns | 0.97 |
| array/random/randn/Float32 | 45613.5 ns | 45085.5 ns | 1.01 |
| array/random/randn!/Float32 | 25126 ns | 25182 ns | 1.00 |
| array/random/rand!/Int64 | 27413 ns | 27387 ns | 1.00 |
| array/random/rand!/Float32 | 8706 ns | 8796.333333333334 ns | 0.99 |
| array/random/rand/Int64 | 30653.5 ns | 29906 ns | 1.02 |
| array/random/rand/Float32 | 12880 ns | 13172 ns | 0.98 |
| array/permutedims/4d | 52750 ns | 55451 ns | 0.95 |
| array/permutedims/2d | 54303.5 ns | 54366.5 ns | 1.00 |
| array/permutedims/3d | 54265 ns | 55262 ns | 0.98 |
| array/sorting/1d | 2759316.5 ns | 2758900.5 ns | 1.00 |
| array/sorting/by | 3368651.5 ns | 3345477 ns | 1.01 |
| array/sorting/2d | 1088246 ns | 1082617 ns | 1.01 |
| cuda/synchronization/stream/auto | 1020.5 ns | 1052 ns | 0.97 |
| cuda/synchronization/stream/nonblocking | 7056.8 ns | 7628.6 ns | 0.93 |
| cuda/synchronization/stream/blocking | 817.49 ns | 835.3333333333334 ns | 0.98 |
| cuda/synchronization/context/auto | 1158.9 ns | 1173.3 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 8459.8 ns | 8290.1 ns | 1.02 |
| cuda/synchronization/context/blocking | 887.6607142857143 ns | 927.025 ns | 0.96 |
This comment was automatically generated by workflow using github-action-benchmark.
[only tests]
[only benchmarks]