@@ -49,40 +49,90 @@ void LinearAlgebraCUDA::spmv(const SparseMatrix& A, const Vector& x, Vector& y)
Scalar* d_A_values; ///< device memory matrix A values
Scalar* d_x; ///< device memory vector x
Scalar* d_y; ///< device memory vector y
- cusparseHandle_t handle;
- cusparseMatDescr_t descr;

CALL_CUDA (cudaMalloc ((void **)&d_A_rowptr, sizeArowptr));
CALL_CUDA (cudaMalloc ((void **)&d_A_colidx, sizeAcolidx));
CALL_CUDA (cudaMalloc ((void **)&d_A_values, sizeAvalues));
CALL_CUDA (cudaMalloc ((void **)&d_x, sizex));
CALL_CUDA (cudaMalloc ((void **)&d_y, sizey));

- CALL_CUSPARSE (cusparseCreate (&handle));
- CALL_CUSPARSE (cusparseCreateMatDescr (&descr));
- cusparseSetMatType (descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase (descr, CUSPARSE_INDEX_BASE_ZERO);
-
CALL_CUDA (cudaMemcpy (d_A_rowptr, A.outer (), sizeArowptr, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_A_colidx, A.inner (), sizeAcolidx, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_A_values, A.data (), sizeAvalues, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_x, x.data (), sizex, cudaMemcpyHostToDevice));

+ cusparseHandle_t handle;
+ CALL_CUSPARSE (cusparseCreate (&handle));
+
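+ // Describe A as a CSR sparse matrix: 32-bit row/column indices, zero-based, double-precision values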
+ cusparseSpMatDescr_t matA;
+ CALL_CUSPARSE ( cusparseCreateCsr (
+ &matA,
+ A.rows (), A.cols (), A.nonZeros (),
+ d_A_rowptr,
+ d_A_colidx,
+ d_A_values,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_BASE_ZERO,
+ CUDA_R_64F) );
+
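+ // Wrap the device arrays for x and y in dense vector descriptors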
+ cusparseDnVecDescr_t vecX;
+ CALL_CUSPARSE ( cusparseCreateDnVec (
+ &vecX,
+ x.size (),
+ d_x,
+ CUDA_R_64F) );
+
+ cusparseDnVecDescr_t vecY;
+ CALL_CUSPARSE ( cusparseCreateDnVec (
+ &vecY,
+ y.size (),
+ d_y,
+ CUDA_R_64F) );
+
const Scalar alpha = 1.0 ;
const Scalar beta = 0.0 ;
- // cusparseStatus_t
- // cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA,
- // int m, int n, int nnz,
- // const double *alpha, const cusparseMatDescr_t descrA,
- // const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
- // const double *x, const double *beta, double *y)
- CALL_CUSPARSE (cusparseDcsrmv (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows (), A.cols (), A.nonZeros (), &alpha,
- descr, d_A_values, d_A_rowptr, d_A_colidx, d_x, &beta, d_y));

+ // Determine buffer size
+ size_t bufferSize = 0 ;
+ CALL_CUSPARSE ( cusparseSpMV_bufferSize (
+ handle,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ &alpha,
+ matA,
+ vecX,
+ &beta,
+ vecY,
+ CUDA_R_64F,
+ CUSPARSE_SPMV_ALG_DEFAULT,
+ &bufferSize) );
+
+ // Allocate buffer
+ char * buffer;
+ CALL_CUDA ( cudaMalloc (&buffer, bufferSize) );
+
+ // Perform SpMV
+ // y = alpha * A * x + beta * y
+ CALL_CUSPARSE ( cusparseSpMV (
+ handle,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ &alpha,
+ matA,
+ vecX,
+ &beta,
+ vecY,
+ CUDA_R_64F,
+ CUSPARSE_SPMV_ALG_DEFAULT,
+ buffer) );
+
+ // Copy result back to host
CALL_CUDA (cudaMemcpy (y.data (), d_y, sizey, cudaMemcpyDeviceToHost));

- CALL_CUSPARSE (cusparseDestroyMatDescr (descr));
- CALL_CUSPARSE (cusparseDestroy (handle));
+ CALL_CUSPARSE ( cusparseDestroyDnVec (vecY) );
+ CALL_CUSPARSE ( cusparseDestroyDnVec (vecX) );
+ CALL_CUSPARSE ( cusparseDestroySpMat (matA) );
+ CALL_CUSPARSE ( cusparseDestroy (handle) );
+

+ CALL_CUDA (cudaFree (buffer));
CALL_CUDA (cudaFree (d_A_rowptr));
CALL_CUDA (cudaFree (d_A_colidx));
@@ -107,44 +157,97 @@ void LinearAlgebraCUDA::spmm(const SparseMatrix& A, const Matrix& B, Matrix& C)
Scalar* d_A_values; ///< device memory matrix A values
Scalar* d_B; ///< device memory matrix B
Scalar* d_C; ///< device memory matrix C
- cusparseHandle_t handle;
- cusparseMatDescr_t descr;

CALL_CUDA (cudaMalloc ((void **)&d_A_rowptr, sizeArowptr));
CALL_CUDA (cudaMalloc ((void **)&d_A_colidx, sizeAcolidx));
CALL_CUDA (cudaMalloc ((void **)&d_A_values, sizeAvalues));
CALL_CUDA (cudaMalloc ((void **)&d_B, sizeB));
CALL_CUDA (cudaMalloc ((void **)&d_C, sizeC));

- CALL_CUSPARSE (cusparseCreate (&handle));
- CALL_CUSPARSE (cusparseCreateMatDescr (&descr));
- cusparseSetMatType (descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase (descr, CUSPARSE_INDEX_BASE_ZERO);
-
CALL_CUDA (cudaMemcpy (d_A_rowptr, A.outer (), sizeArowptr, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_A_colidx, A.inner (), sizeAcolidx, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_A_values, A.data (), sizeAvalues, cudaMemcpyHostToDevice));
CALL_CUDA (cudaMemcpy (d_B, B.data (), sizeB, cudaMemcpyHostToDevice));

- // FIXME: Should we transpose B and use cusparseDcsrmm2 instread?
- // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2
+ cusparseHandle_t handle;
+ CALL_CUSPARSE (cusparseCreate (&handle));
+
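+ // Describe A in CSR form (32-bit indices, zero-based, double precision)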
+ cusparseSpMatDescr_t matA;
+ CALL_CUSPARSE ( cusparseCreateCsr (
+ &matA,
+ A.rows (), A.cols (), A.nonZeros (),
+ d_A_rowptr,
+ d_A_colidx,
+ d_A_values,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_BASE_ZERO,
+ CUDA_R_64F) );
+
+ // Create dense matrix descriptors
+ cusparseDnMatDescr_t matB;
+ CALL_CUSPARSE (cusparseCreateDnMat (
+ &matB,
+ B.rows (), // rows
+ B.cols (), // cols
+ B.rows (), // leading dimension
+ d_B,
+ CUDA_R_64F,
+ CUSPARSE_ORDER_COL) );
+
+ cusparseDnMatDescr_t matC;
+ CALL_CUSPARSE (cusparseCreateDnMat (
+ &matC,
+ C.rows (), // rows
+ C.cols (), // cols
+ C.rows (), // leading dimension
+ d_C,
+ CUDA_R_64F,
+ CUSPARSE_ORDER_COL) );
+
const Scalar alpha = 1.0 ;
const Scalar beta = 0.0 ;
- // cusparseStatus_t
- // cusparseDcsrmm(cusparseHandle_t handle, cusparseOperation_t transA,
- // int m, int n, int k, int nnz,
- // const double *alpha, const cusparseMatDescr_t descrA,
- // const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
- // const double *B, int ldb, const double *beta, double *C, int ldc)
- CALL_CUSPARSE (cusparseDcsrmm (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows (), A.cols (), B.cols (), A.nonZeros (),
- &alpha, descr, d_A_values, d_A_rowptr, d_A_colidx, d_B, B.rows (), &beta, d_C,
- C.rows ()));
+
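+ // Determine buffer size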
+ size_t bufferSize = 0 ;
+ CALL_CUSPARSE (cusparseSpMM_bufferSize (
+ handle,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ &alpha,
+ matA,
+ matB,
+ &beta,
+ matC,
+ CUDA_R_64F,
+ CUSPARSE_SPMM_ALG_DEFAULT,
+ &bufferSize));
+
+ // Allocate buffer
+ char * buffer;
+ CALL_CUDA (cudaMalloc (&buffer, bufferSize));
+
+ // Perform SpMM
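+ // C = alpha * A * B + beta * C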
+ CALL_CUSPARSE (cusparseSpMM (
+ handle,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ &alpha,
+ matA,
+ matB,
+ &beta,
+ matC,
+ CUDA_R_64F,
+ CUSPARSE_SPMM_ALG_DEFAULT,
+ buffer));

CALL_CUDA (cudaMemcpy (C.data (), d_C, sizeC, cudaMemcpyDeviceToHost));

- CALL_CUSPARSE (cusparseDestroyMatDescr (descr));
CALL_CUSPARSE (cusparseDestroy (handle));
+ CALL_CUSPARSE (cusparseDestroyDnMat (matC));
+ CALL_CUSPARSE (cusparseDestroyDnMat (matB));
+ CALL_CUSPARSE (cusparseDestroySpMat (matA));

+ CALL_CUDA (cudaFree (buffer));
CALL_CUDA (cudaFree (d_A_rowptr));
CALL_CUDA (cudaFree (d_A_colidx));
CALL_CUDA (cudaFree (d_A_values));