diff --git a/.github/workflows/CI_CUDA.yml b/.github/workflows/CI_CUDA.yml index a71e485..1a6b815 100644 --- a/.github/workflows/CI_CUDA.yml +++ b/.github/workflows/CI_CUDA.yml @@ -29,7 +29,7 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v2 + # - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 # - uses: julia-actions/julia-runtest@v1 - name: test-cuda diff --git a/.github/workflows/CI_ROCM.yml b/.github/workflows/CI_ROCM.yml index f929bc8..d540ef9 100644 --- a/.github/workflows/CI_ROCM.yml +++ b/.github/workflows/CI_ROCM.yml @@ -29,7 +29,7 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v2 + # - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 # - uses: julia-actions/julia-runtest@v1 - name: test-rocm diff --git a/Project.toml b/Project.toml index c14bea2..5dfe7fb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "FiniteElementContainers" uuid = "d08262e4-672f-4e7f-a976-f2cea5767631" -version = "0.10.0" +version = "0.10.1" authors = ["Craig M. Hamel and contributors"] [deps] diff --git a/ext/FiniteElementContainersAMDGPUExt.jl b/ext/FiniteElementContainersAMDGPUExt.jl index 17ca128..8494900 100644 --- a/ext/FiniteElementContainersAMDGPUExt.jl +++ b/ext/FiniteElementContainersAMDGPUExt.jl @@ -36,8 +36,14 @@ function AMDGPU.rocSPARSE.ROCSparseMatrixCSC(asm::SparseMatrixAssembler) end function FiniteElementContainers._stiffness(asm::SparseMatrixAssembler, ::ROCBackend) - stiffness = AMDGPU.rocSPARSE.ROCSparseMatrixCSC(asm) - return stiffness + K = AMDGPU.rocSPARSE.ROCSparseMatrixCSC(asm) + + if FiniteElementContainers._is_condensed(asm.dof) + FiniteElementContainers._adjust_matrix_entries_for_constraints!( + K, asm.constraint_storage, get_backend(asm) + ) + end + return K end -end # module \ No newline at end of file +end # module diff --git a/ext/FiniteElementContainersCUDAExt.jl b/ext/FiniteElementContainersCUDAExt.jl index d6fed26..8991467 100644 --- a/ext/FiniteElementContainersCUDAExt.jl +++ b/ext/FiniteElementContainersCUDAExt.jl @@ -31,7 +31,13 @@ function CUDA.CUSPARSE.CuSparseMatrixCSC(asm::SparseMatrixAssembler) end function FiniteElementContainers._stiffness(asm::SparseMatrixAssembler, ::CUDABackend) - return CUDA.CUSPARSE.CuSparseMatrixCSC(asm) + K = CUDA.CUSPARSE.CuSparseMatrixCSC(asm) + if FiniteElementContainers._is_condensed(asm.dof) + FiniteElementContainers._adjust_matrix_entries_for_constraints!( + K, asm.constraint_storage, get_backend(asm) + ) + end + return K end # this one isn't quite right diff --git a/src/assemblers/Constraints.jl b/src/assemblers/Constraints.jl index 43d88ae..664d8cc 100644 --- a/src/assemblers/Constraints.jl +++ b/src/assemblers/Constraints.jl @@ -1,3 +1,26 @@ +# util for sparse matrices on gpus +# TODO make non-allocating through some mapreduce functionality +# COV_EXCL_START +KA.@kernel function __sptrace_kernel!(diagA, colptr, rowvals, nz) + J = KA.@index(Global) + col_start = colptr[J] + col_end = colptr[J + 1] - 1 + for k in col_start:col_end + if rowvals[k] == J + diagA[J] = nz[k] + end + end +end +# COV_EXCL_STOP + +function __sptrace(A) + backend = KA.get_backend(A) + diagA = KA.zeros(backend, eltype(A), size(A, 1)) + kernel! = __sptrace_kernel!(backend) + kernel!(diagA, A.colPtr, A.rowVal, A.nzVal, ndrange = size(A, 2)) + return sum(diagA) +end + # TODO this only work on CPU right now function _adjust_matrix_entries_for_constraints!( A::SparseMatrixCSC, constraint_storage, ::KA.CPU; @@ -35,43 +58,41 @@ end # COV_EXCL_START KA.@kernel function _adjust_matrix_entries_for_constraints_kernel!( - A, constraint_storage, trA; - penalty_scale = 1.e6 + colptr, rowvals, nz, size_A_2, + constraint_storage, trA, penalty_scale ) J = KA.@index(Global) - - penalty = penalty_scale * trA / size(A, 2) + penalty = penalty_scale * trA / size_A_2 # now modify A => (I - G) * A + G - nz = nonzeros(A) - rowval = rowvals(A) - col_start = A.colptr[J] - col_end = A.colptr[J + 1] - 1 + col_start = colptr[J] + col_end = colptr[J + 1] - 1 for k in col_start:col_end # for (I - G) * A term nz[k] = (1. - constraint_storage[J]) * nz[k] # for + G term - if rowval[k] == J + if rowvals[k] == J @inbounds nz[k] = nz[k] + penalty * constraint_storage[J] end end end # COV_EXCL_STOP -function _adjust_matrix_entries_for_constraints( - A, constraint_storage, backend::KA.Backend +function _adjust_matrix_entries_for_constraints!( + A, constraint_storage, backend::KA.Backend; + penalty_scale = 1.e6 ) # first ensure things are the right size @assert size(A, 1) == size(A, 2) @assert length(constraint_storage) == size(A, 2) # get trA ahead of time to save some allocations at kernel level - trA = tr(A) - + trA = __sptrace(A) + kernel! = _adjust_matrix_entries_for_constraints_kernel!(backend) - kernel!(A, constraint_storage, trA, ndrange = size(A, 2)) + kernel!(A.colPtr, A.rowVal, A.nzVal, size(A, 2), constraint_storage, trA, penalty_scale, ndrange = size(A, 2)) return nothing end diff --git a/test/TestAssemblers.jl b/test/TestAssemblers.jl index 6f66342..98336e6 100644 --- a/test/TestAssemblers.jl +++ b/test/TestAssemblers.jl @@ -1,3 +1,19 @@ +function test_sparse_matrix_gpu_trace() + A = SparseMatrixCSC(rand(10, 10)) + trA_check = tr(A) + + if AMDGPU.functional() + A_gpu = adapt(ROCArray, A) + elseif CUDA.functional() + A_gpu = adapt(CuArray, A) + else + return nothing + end + + trA = FiniteElementContainers.__sptrace(A_gpu) + @test trA ≈ trA_check +end + function test_sparse_matrix_assembler() # create very simple poisson problem mesh_file = "./poisson/poisson.g" @@ -51,5 +67,6 @@ function test_sparse_matrix_assembler() end @testset "Sparse matrix assembler" begin + test_sparse_matrix_gpu_trace() test_sparse_matrix_assembler() end diff --git a/test/runtests.jl b/test/runtests.jl index af0976b..243ec4b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,6 +8,7 @@ using ForwardDiff using LinearAlgebra using MPI using ReferenceFiniteElements +using SparseArrays using StaticArrays using Tensors using Test