Commit 3d7f4fe

Merge pull request #166 from ecmwf/feature/cuda
Get CUDA backend for eckit_linalg working again
2 parents 2217b41 + c4c4f04

File tree

4 files changed: +142 -49 lines

CMakeLists.txt
src/eckit/config/Configuration.h
src/eckit/linalg/CMakeLists.txt
src/eckit/linalg/sparse/LinearAlgebraCUDA.cc


CMakeLists.txt (+1 -2)

@@ -260,9 +260,8 @@ ecbuild_add_option( FEATURE JEMALLOC
 
 ecbuild_add_option( FEATURE CUDA
                     DEFAULT OFF
-                    CONDITION HAVE_EXPERIMENTAL
                     DESCRIPTION "CUDA GPU linear algebra operations"
-                    REQUIRED_PACKAGES CUDA )
+                    REQUIRED_PACKAGES CUDAToolkit )
 
 ### ViennaCL
 
src/eckit/config/Configuration.h (+1 -3)

@@ -190,9 +190,7 @@ class Configuration : public Parametrisation {
             return isStringList(name);
         }
     }
-    else {
-        return false;
-    }
+    return false;
 }
 
 template <typename T>

src/eckit/linalg/CMakeLists.txt (+1 -8)

@@ -47,8 +47,7 @@ if( eckit_HAVE_CUDA )
         detail/CUDA.h
         sparse/LinearAlgebraCUDA.cc
         sparse/LinearAlgebraCUDA.h )
-    list( APPEND eckit_la_pincludes "${CUDA_INCLUDE_DIRS}" )
-    list( APPEND eckit_la_plibs "${CUDA_LIBRARIES}" "${CUDA_cusparse_LIBRARY}" )
+    list( APPEND eckit_la_plibs CUDA::cudart CUDA::cusparse CUDA::cublas )
 endif()
 
 if( eckit_HAVE_EIGEN )

@@ -105,9 +104,3 @@ if( eckit_HAVE_EIGEN )
     # Add include directories with "SYSTEM" to avoid warnings from within Eigen headers
     target_include_directories( eckit_linalg SYSTEM PRIVATE ${EIGEN3_INCLUDE_DIRS} )
 endif()
-
-if( CUDA_FOUND )
-    set( CUDA_LINK_LIBRARIES_KEYWORD PRIVATE )
-    cuda_add_cublas_to_target( eckit_linalg )
-endif()
-
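Note: as a rough illustration of what the switch to CUDAToolkit imported targets (CUDA::cudart, CUDA::cusparse, CUDA::cublas) provides at link time, a target that links them can build a small stand-alone sanity check such as the sketch below. This program is not part of the commit; it only uses the public CUDA runtime and cuSPARSE version queries.

    // Hedged sketch (not part of this commit): confirm that the libraries
    // behind the CUDA::cudart and CUDA::cusparse imported targets link and
    // initialise correctly.
    #include <cstdio>

    #include <cuda_runtime.h>
    #include <cusparse.h>

    int main() {
        int runtimeVersion = 0;
        if (cudaRuntimeGetVersion(&runtimeVersion) != cudaSuccess) {
            std::printf("CUDA runtime not available\n");
            return 1;
        }

        cusparseHandle_t handle;
        if (cusparseCreate(&handle) != CUSPARSE_STATUS_SUCCESS) {
            std::printf("cuSPARSE initialisation failed\n");
            return 1;
        }

        int cusparseVersion = 0;
        cusparseGetVersion(handle, &cusparseVersion);
        std::printf("CUDA runtime %d, cuSPARSE %d\n", runtimeVersion, cusparseVersion);

        cusparseDestroy(handle);
        return 0;
    }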

src/eckit/linalg/sparse/LinearAlgebraCUDA.cc (+139 -36)

@@ -49,40 +49,90 @@ void LinearAlgebraCUDA::spmv(const SparseMatrix& A, const Vector& x, Vector& y)
     Scalar* d_A_values;  ///< device memory matrix A values
     Scalar* d_x;         ///< device memory vector x
     Scalar* d_y;         ///< device memory vector y
-    cusparseHandle_t handle;
-    cusparseMatDescr_t descr;
 
     CALL_CUDA(cudaMalloc((void**)&d_A_rowptr, sizeArowptr));
     CALL_CUDA(cudaMalloc((void**)&d_A_colidx, sizeAcolidx));
     CALL_CUDA(cudaMalloc((void**)&d_A_values, sizeAvalues));
     CALL_CUDA(cudaMalloc((void**)&d_x, sizex));
     CALL_CUDA(cudaMalloc((void**)&d_y, sizey));
 
-    CALL_CUSPARSE(cusparseCreate(&handle));
-    CALL_CUSPARSE(cusparseCreateMatDescr(&descr));
-    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
-
     CALL_CUDA(cudaMemcpy(d_A_rowptr, A.outer(), sizeArowptr, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_A_colidx, A.inner(), sizeAcolidx, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_A_values, A.data(), sizeAvalues, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_x, x.data(), sizex, cudaMemcpyHostToDevice));
 
+    cusparseHandle_t handle;
+    CALL_CUSPARSE(cusparseCreate(&handle));
+
+    cusparseSpMatDescr_t matA;
+    CALL_CUSPARSE( cusparseCreateCsr(
+        &matA,
+        A.rows(), A.cols(), A.nonZeros(),
+        d_A_rowptr,
+        d_A_colidx,
+        d_A_values,
+        CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_BASE_ZERO,
+        CUDA_R_64F) );
+
+    cusparseDnVecDescr_t vecX;
+    CALL_CUSPARSE( cusparseCreateDnVec(
+        &vecX,
+        x.size(),
+        d_x,
+        CUDA_R_64F) );
+
+    cusparseDnVecDescr_t vecY;
+    CALL_CUSPARSE( cusparseCreateDnVec(
+        &vecY,
+        y.size(),
+        d_y,
+        CUDA_R_64F) );
+
     const Scalar alpha = 1.0;
     const Scalar beta = 0.0;
-    // cusparseStatus_t
-    // cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA,
-    //                int m, int n, int nnz,
-    //                const double *alpha, const cusparseMatDescr_t descrA,
-    //                const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
-    //                const double *x, const double *beta, double *y)
-    CALL_CUSPARSE(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows(), A.cols(), A.nonZeros(), &alpha,
-                                 descr, d_A_values, d_A_rowptr, d_A_colidx, d_x, &beta, d_y));
 
+    // Determine buffer size
+    size_t bufferSize = 0;
+    CALL_CUSPARSE( cusparseSpMV_bufferSize(
+        handle,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &alpha,
+        matA,
+        vecX,
+        &beta,
+        vecY,
+        CUDA_R_64F,
+        CUSPARSE_SPMV_ALG_DEFAULT,
+        &bufferSize) );
+
+    // Allocate buffer
+    char* buffer;
+    CALL_CUDA( cudaMalloc(&buffer, bufferSize) );
+
+    // Perform SpMV
+    //   y = alpha * A * x + beta * y
+    CALL_CUSPARSE( cusparseSpMV(
+        handle,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &alpha,
+        matA,
+        vecX,
+        &beta,
+        vecY,
+        CUDA_R_64F,
+        CUSPARSE_SPMV_ALG_DEFAULT,
+        buffer) );
+
+    // Copy result back to host
     CALL_CUDA(cudaMemcpy(y.data(), d_y, sizey, cudaMemcpyDeviceToHost));
 
-    CALL_CUSPARSE(cusparseDestroyMatDescr(descr));
-    CALL_CUSPARSE(cusparseDestroy(handle));
+    CALL_CUSPARSE( cusparseDestroyDnVec(vecY) );
+    CALL_CUSPARSE( cusparseDestroyDnVec(vecX) );
+    CALL_CUSPARSE( cusparseDestroySpMat(matA) );
+    CALL_CUSPARSE( cusparseDestroy(handle) );
+
 
     CALL_CUDA(cudaFree(d_A_rowptr));
     CALL_CUDA(cudaFree(d_A_colidx));
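For readers unfamiliar with the cuSPARSE generic API that replaces cusparseDcsrmv above: the workflow is to create opaque sparse/dense descriptors, query the workspace size, run cusparseSpMV, then destroy the descriptors. A minimal stand-alone sketch with a hard-coded 3x3 CSR matrix (illustrative only, not eckit code; error checking omitted for brevity):

    // Stand-alone sketch of the generic-API SpMV workflow adopted above:
    // y = alpha * A * x + beta * y for a small CSR matrix (error checks omitted).
    #include <cstdio>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cusparse.h>

    int main() {
        // 3x3 CSR matrix [[1 0 2], [0 3 0], [4 0 5]] and x = [1 1 1]
        std::vector<int>    rowptr = {0, 2, 3, 5};
        std::vector<int>    colidx = {0, 2, 1, 0, 2};
        std::vector<double> values = {1., 2., 3., 4., 5.};
        std::vector<double> x      = {1., 1., 1.};
        std::vector<double> y(3, 0.);

        int*    d_rowptr;
        int*    d_colidx;
        double* d_values;
        double* d_x;
        double* d_y;
        cudaMalloc((void**)&d_rowptr, rowptr.size() * sizeof(int));
        cudaMalloc((void**)&d_colidx, colidx.size() * sizeof(int));
        cudaMalloc((void**)&d_values, values.size() * sizeof(double));
        cudaMalloc((void**)&d_x, x.size() * sizeof(double));
        cudaMalloc((void**)&d_y, y.size() * sizeof(double));
        cudaMemcpy(d_rowptr, rowptr.data(), rowptr.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_colidx, colidx.data(), colidx.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_values, values.data(), values.size() * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_x, x.data(), x.size() * sizeof(double), cudaMemcpyHostToDevice);

        cusparseHandle_t handle;
        cusparseCreate(&handle);

        // Opaque descriptors replace the legacy cusparseMatDescr_t of cusparseDcsrmv.
        cusparseSpMatDescr_t matA;
        cusparseCreateCsr(&matA, 3, 3, 5, d_rowptr, d_colidx, d_values,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
        cusparseDnVecDescr_t vecX;
        cusparseDnVecDescr_t vecY;
        cusparseCreateDnVec(&vecX, 3, d_x, CUDA_R_64F);
        cusparseCreateDnVec(&vecY, 3, d_y, CUDA_R_64F);

        const double alpha = 1.0;
        const double beta  = 0.0;

        // The generic API needs an explicit, caller-allocated workspace.
        size_t bufferSize = 0;
        cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
                                &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
        void* buffer = nullptr;
        cudaMalloc(&buffer, bufferSize);

        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
                     &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer);

        cudaMemcpy(y.data(), d_y, y.size() * sizeof(double), cudaMemcpyDeviceToHost);
        std::printf("y = [%g, %g, %g] (expected [3, 3, 9])\n", y[0], y[1], y[2]);

        cusparseDestroyDnVec(vecY);
        cusparseDestroyDnVec(vecX);
        cusparseDestroySpMat(matA);
        cusparseDestroy(handle);
        cudaFree(buffer);
        cudaFree(d_rowptr);
        cudaFree(d_colidx);
        cudaFree(d_values);
        cudaFree(d_x);
        cudaFree(d_y);
        return 0;
    }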
@@ -107,44 +157,97 @@ void LinearAlgebraCUDA::spmm(const SparseMatrix& A, const Matrix& B, Matrix& C)
     Scalar* d_A_values;  ///< device memory matrix A values
     Scalar* d_B;         ///< device memory matrix B
     Scalar* d_C;         ///< device memory matrix C
-    cusparseHandle_t handle;
-    cusparseMatDescr_t descr;
 
     CALL_CUDA(cudaMalloc((void**)&d_A_rowptr, sizeArowptr));
     CALL_CUDA(cudaMalloc((void**)&d_A_colidx, sizeAcolidx));
     CALL_CUDA(cudaMalloc((void**)&d_A_values, sizeAvalues));
     CALL_CUDA(cudaMalloc((void**)&d_B, sizeB));
     CALL_CUDA(cudaMalloc((void**)&d_C, sizeC));
 
-    CALL_CUSPARSE(cusparseCreate(&handle));
-    CALL_CUSPARSE(cusparseCreateMatDescr(&descr));
-    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
-
     CALL_CUDA(cudaMemcpy(d_A_rowptr, A.outer(), sizeArowptr, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_A_colidx, A.inner(), sizeAcolidx, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_A_values, A.data(), sizeAvalues, cudaMemcpyHostToDevice));
     CALL_CUDA(cudaMemcpy(d_B, B.data(), sizeB, cudaMemcpyHostToDevice));
 
-    // FIXME: Should we transpose B and use cusparseDcsrmm2 instread?
-    // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2
+    cusparseHandle_t handle;
+    CALL_CUSPARSE(cusparseCreate(&handle));
+
+    cusparseSpMatDescr_t matA;
+    CALL_CUSPARSE( cusparseCreateCsr(
+        &matA,
+        A.rows(), A.cols(), A.nonZeros(),
+        d_A_rowptr,
+        d_A_colidx,
+        d_A_values,
+        CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_BASE_ZERO,
+        CUDA_R_64F) );
+
+    // Create dense matrix descriptors
+    cusparseDnMatDescr_t matB;
+    CALL_CUSPARSE(cusparseCreateDnMat(
+        &matB,
+        B.rows(),  // rows
+        B.cols(),  // cols
+        B.rows(),  // leading dimension
+        d_B,
+        CUDA_R_64F,
+        CUSPARSE_ORDER_COL) );
+
+    cusparseDnMatDescr_t matC;
+    CALL_CUSPARSE(cusparseCreateDnMat(
+        &matC,
+        C.rows(),  // rows
+        C.cols(),  // cols
+        C.rows(),  // leading dimension
+        d_C,
+        CUDA_R_64F,
+        CUSPARSE_ORDER_COL) );
+
     const Scalar alpha = 1.0;
     const Scalar beta = 0.0;
-    // cusparseStatus_t
-    // cusparseDcsrmm(cusparseHandle_t handle, cusparseOperation_t transA,
-    //                int m, int n, int k, int nnz,
-    //                const double *alpha, const cusparseMatDescr_t descrA,
-    //                const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
-    //                const double *B, int ldb, const double *beta, double *C, int ldc)
-    CALL_CUSPARSE(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows(), A.cols(), B.cols(), A.nonZeros(),
-                                 &alpha, descr, d_A_values, d_A_rowptr, d_A_colidx, d_B, B.rows(), &beta, d_C,
-                                 C.rows()));
+
+    size_t bufferSize = 0;
+    CALL_CUSPARSE(cusparseSpMM_bufferSize(
+        handle,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &alpha,
+        matA,
+        matB,
+        &beta,
+        matC,
+        CUDA_R_64F,
+        CUSPARSE_SPMM_ALG_DEFAULT,
+        &bufferSize));
+
+    // Allocate buffer
+    char* buffer;
+    CALL_CUDA(cudaMalloc(&buffer, bufferSize));
+
+    // Perform SpMM
+    CALL_CUSPARSE(cusparseSpMM(
+        handle,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &alpha,
+        matA,
+        matB,
+        &beta,
+        matC,
+        CUDA_R_64F,
+        CUSPARSE_SPMM_ALG_DEFAULT,
+        buffer));
 
     CALL_CUDA(cudaMemcpy(C.data(), d_C, sizeC, cudaMemcpyDeviceToHost));
 
-    CALL_CUSPARSE(cusparseDestroyMatDescr(descr));
     CALL_CUSPARSE(cusparseDestroy(handle));
+    CALL_CUSPARSE(cusparseDestroyDnMat(matC));
+    CALL_CUSPARSE(cusparseDestroyDnMat(matB));
+    CALL_CUSPARSE(cusparseDestroySpMat(matA));
 
+    CALL_CUDA(cudaFree(buffer));
     CALL_CUDA(cudaFree(d_A_rowptr));
     CALL_CUDA(cudaFree(d_A_colidx));
     CALL_CUDA(cudaFree(d_A_values));
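A note on the dense descriptors above: passing B.rows() and C.rows() as the leading dimension together with CUSPARSE_ORDER_COL assumes the matrix data is stored column by column, so element (i, j) of an m-by-n matrix sits at data[i + j * ld] with ld == m. A small host-only illustration of that indexing convention (hypothetical helper, not eckit code):

    // Host-only sketch of column-major indexing: with CUSPARSE_ORDER_COL the
    // leading dimension equals the number of rows, and element (i, j) of an
    // m-by-n matrix lives at data[i + j * ld] with ld == m.
    #include <cstdio>
    #include <vector>

    // Hypothetical helper mirroring the (rows, cols, leading dimension) triple
    // handed to cusparseCreateDnMat above.
    double at(const std::vector<double>& data, int ld, int i, int j) {
        return data[i + j * ld];
    }

    int main() {
        const int m = 3, n = 2, ld = m;  // ld == rows for column-major storage
        // Columns stored contiguously: first column {1,2,3}, second column {4,5,6}
        std::vector<double> B = {1., 2., 3., 4., 5., 6.};

        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                std::printf("%g ", at(B, ld, i, j));
            }
            std::printf("\n");
        }
        // Prints the 3x2 matrix:
        //   1 4
        //   2 5
        //   3 6
        return 0;
    }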
