diff --git a/blas.h b/blas.h index 66dca62..40bde3e 100644 --- a/blas.h +++ b/blas.h @@ -265,7 +265,6 @@ F77_symm(d, double); F77_symm(c, float _Complex); F77_symm(z, double _Complex); - #define F77_syrk(prefix, T) \ void prefix##syrk_(char *uplo, char *trans, \ int *n, int *k, \ @@ -283,6 +282,10 @@ void prefix##syr2k_(char *uplo, char *trans, \ T *beta, \ T *c, int *ldc) +F77_syr2k(s, float); +F77_syr2k(d, double); +F77_syr2k(c, float _Complex); +F77_syr2k(z, double _Complex); #define F77_trmm(prefix, T) \ void prefix##trmm_(char *side, \ @@ -292,6 +295,11 @@ void prefix##trmm_(char *side, \ T *a, int *lda, \ T *b, int *ldb) +F77_trmm(s, float); +F77_trmm(d, double); +F77_trmm(c, float _Complex); +F77_trmm(z, double _Complex); + #define F77_trsm(prefix, T) \ void prefix##trsm_(char *side, char *uplo, \ char *transa, char *diag, \ @@ -300,6 +308,11 @@ void prefix##trsm_(char *side, char *uplo, \ T *a, int *lda, \ T *b, int *ldb) +F77_trsm(s, float); +F77_trsm(d, double); +F77_trsm(c, float _Complex); +F77_trsm(z, double _Complex); + void _b2c_xerbla(const char *routine, int arg_pos); #ifdef __cplusplus diff --git a/blas_level3/hemm.cc b/blas_level3/hemm.cc index 44b0b5e..4da76f1 100644 --- a/blas_level3/hemm.cc +++ b/blas_level3/hemm.cc @@ -54,7 +54,6 @@ void _b2c_hemm(const CBLAS_SIDE side, gpuptr gpu_b(b, size(0, ldb, n, sizeof(*b))); gpuptr gpu_c(c, size(0, ldc, n, sizeof(*c))); - call_kernel( #if USE_CUDA hemm_func(b2c_cublas_handle, @@ -122,48 +121,32 @@ F77_hemm(c, float _Complex) { hemm_check(); _b2c_hemm(c_side(*side), c_uplo(*uplo), *m, *n, -#if USE_CUDA cu(*alpha), -#else - cu2(*alpha), -#endif cmplx_ptr(a), *lda, cmplx_ptr(b), *ldb, -#if USE_CUDA cu(*beta), -#else - cu2(*beta), -#endif cmplx_ptr(c), *ldc, #if USE_CUDA &cublasChemm #else &clblasChemm #endif - ); + ); } F77_hemm(z, double _Complex) { hemm_check(); _b2c_hemm(c_side(*side), c_uplo(*uplo), *m, *n, -#if USE_CUDA cu(*alpha), -#else - cu2(*alpha), -#endif cmplx_ptr(a), *lda, cmplx_ptr(b), *ldb, -#if USE_CUDA cu(*beta), -#else - cu2(*beta), -#endif cmplx_ptr(c), *ldc, #if USE_CUDA &cublasZhemm #else &clblasZhemm #endif - ); + ); } diff --git a/blas_level3/syr2k.cc b/blas_level3/syr2k.cc index a4dae96..6713c53 100644 --- a/blas_level3/syr2k.cc +++ b/blas_level3/syr2k.cc @@ -3,8 +3,8 @@ #include "../blas.h" #include "../conversions.h" #include "level3.h" -#include "../blas2cuda.h" #include "../runtime-blas.h" +#include "../runtime-mem.hpp" #if USE_CUDA extern cublasHandle_t b2c_cublas_handle; @@ -12,17 +12,9 @@ extern cublasHandle_t b2c_cublas_handle; extern cl_command_queue opencl_cmd_queue; #endif - -template -void _b2c_syr2k(const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE trans, - const int n, const int k, - const T alpha, - const T *a, const int lda, - const T *b, const int ldb, - const T beta, - T *c, const int ldc, - cublasStatus_t syr2k_func(cublasHandle_t, +template +#if USE_CUDA +using syr2k_t = cublasStatus_t (*)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, @@ -30,70 +22,130 @@ void _b2c_syr2k(const CBLAS_UPLO uplo, const T *, int, const T *, int, const T *, - T *, int)) -{ - const T *gpu_a, *gpu_b; - T *gpu_c; - int rows_a, cols_a, - rows_b, cols_b, - rows_c, cols_c; - int size_a, size_b, size_c; - cublasFillMode_t cuplo = cu(uplo); - cublasOperation_t ctrans = cu(trans); - const struct objinfo *a_info, *b_info, *c_info; - - rows_c = ldc; - cols_c = n; - if (trans == CblasNoTrans) { - rows_a = lda; - cols_a = k; - rows_b = ldb; - cols_b = k; - } else { - rows_a = lda; - cols_a = n; - rows_b = ldb; - cols_b = n; - } - + T *, int); +#else +using syr2k_t = clblasStatus (*)(clblasOrder order, + clblasUplo uplo, clblasTranspose transAB, + size_t N, size_t K, + S alpha, + const cl_mem A, size_t offA, size_t lda, + const cl_mem B, size_t offB, size_t ldb, + S beta, + cl_mem C, size_t offC, size_t ldc, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); +#endif - size_a = size(0, rows_a, cols_a, sizeof(*a)); - size_b = size(0, rows_b, cols_b, sizeof(*b)); - size_c = size(0, rows_c, cols_c, sizeof(*c)); - gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL); - gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, - (void *) gpu_a, &a_info, - NULL); - gpu_c = (T *) b2c_place_on_gpu((void *) c, size_c, &c_info, - (void *) gpu_a, &a_info, - (void *) gpu_b, &b_info, - NULL); +template +void _b2c_syr2k(const CBLAS_UPLO uplo, + const CBLAS_TRANSPOSE trans, + const int n, const int k, + const S alpha, + const T *a, const int lda, + const T *b, const int ldb, + const S beta, + T *c, const int ldc, + syr2k_t syr2k_func) +{ + gpuptr gpu_a(a, size(0, lda, trans == CblasNoTrans ? k : n, sizeof *a)); + gpuptr gpu_b(b, size(0, ldb, trans == CblasNoTrans ? k : n, sizeof *b)); + gpuptr gpu_c(c, size(0, ldc, n, sizeof *c)); call_kernel( - syr2k_func(b2c_handle, - cuplo, ctrans, +#if USE_CUDA + syr2k_func(b2c_cublas_handle, + cu(uplo), cu(trans), n, k, &alpha, gpu_a, lda, gpu_b, ldb, &beta, gpu_c, ldc) +#else + syr2k_func(clblasColumnMajor, + clb(uplo), clb(trans), + n, k, + alpha, + gpu_a, 0, lda, + gpu_b, 0, ldb, + beta, + gpu_c, 0, ldc, + 1, &opencl_cmd_queue, 0, NULL, NULL) +#endif ); +} + +template ::value || std::is_same::value> +bool syr2k_check(const char *func_name, + char *uplo, + char *trans, + int *n, int *k, + T *alpha, + T *a, int *lda, + T *b, int *ldb, + T *beta, + T *c, int *ldc) { + int nrowa = runtime_blas_lsame(trans, "N") ? *n : *k; + int upper = runtime_blas_lsame(uplo, "U"); + int info = 0; - - runtime_fatal_errmsg(cudaGetLastError(), __func__); + if (!upper && !runtime_blas_lsame(uplo, "L")) + info = 1; + else if (!runtime_blas_lsame(trans, "N") && + !runtime_blas_lsame(trans, "T") && + (is_complex || !runtime_blas_lsame(trans, "C"))) + info = 2; + else if (*n < 0) + info = 3; + else if (*k < 0) + info = 4; + else if (*lda < std::max(1, nrowa)) + info = 7; + else if (*ldb < std::max(1, nrowa)) + info = 9; + else if (*ldc < std::max(1, *n)) + info = 12; - if (!c_info) { - b2c_copy_from_gpu(c, gpu_c, size_c); + if (info != 0) { + runtime_blas_xerbla(func_name, info); + return false; } - b2c_cleanup_gpu_ptr((void *) gpu_a, a_info); - b2c_cleanup_gpu_ptr((void *) gpu_b, b_info); - b2c_cleanup_gpu_ptr((void *) gpu_c, c_info); + if (*n == 0 || ((*alpha == 0 || *k == 0) && *beta == 1)) + return false; + + if (*alpha == 0 || *n == 0 || *k == 0) { + if (upper) { + if (*beta == 0) { + for (int j=1; j<=*n; ++j) + for (int i=1; i<=j; ++i) + c[IDX2F(i,j,*ldc)] = 0; + } else { + for (int j=1; j<=*n; ++j) + for (int i=1; i<=j; ++i) + c[IDX2F(i,j,*ldc)] *= *beta; + } + } else { + if (*beta == 0) { + for (int j=1; j<=*n; ++j) + for (int i=j; i<=*n; ++i) + c[IDX2F(i,j,*ldc)] = 0; + } else { + for (int j=1; j<=*n; ++j) + for (int i=j; i<=*n; ++i) + c[IDX2F(i,j,*ldc)] *= *beta; + } + } + return false; + } + + return true; } F77_syr2k(s, float) { + if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) + return; _b2c_syr2k(c_uplo(*uplo), c_trans(*trans), *n, *k, *alpha, @@ -101,10 +153,17 @@ F77_syr2k(s, float) { b, *ldb, *beta, c, *ldc, - &cublasSsyr2k); +#if USE_CUDA + &cublasSsyr2k +#else + &clblasSsyr2k +#endif + ); } F77_syr2k(d, double) { + if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) + return; _b2c_syr2k(c_uplo(*uplo), c_trans(*trans), *n, *k, *alpha, @@ -112,27 +171,46 @@ F77_syr2k(d, double) { b, *ldb, *beta, c, *ldc, - &cublasDsyr2k); +#if USE_CUDA + &cublasDsyr2k +#else + &clblasDsyr2k +#endif + ); } F77_syr2k(c, float _Complex) { + if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) + return; _b2c_syr2k(c_uplo(*uplo), c_trans(*trans), *n, *k, cu(*alpha), - (cuComplex *) a, *lda, - (cuComplex *) b, *ldb, + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, cu(*beta), - (cuComplex *) c, *ldc, - &cublasCsyr2k); + cmplx_ptr(c), *ldc, +#if USE_CUDA + &cublasCsyr2k +#else + &clblasCsyr2k +#endif + ); } F77_syr2k(z, double _Complex) { + if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) + return; _b2c_syr2k(c_uplo(*uplo), c_trans(*trans), *n, *k, cu(*alpha), - (cuDoubleComplex *) a, *lda, - (cuDoubleComplex *) b, *ldb, + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, cu(*beta), - (cuDoubleComplex *) c, *ldc, - &cublasZsyr2k); + cmplx_ptr(c), *ldc, +#if USE_CUDA + &cublasZsyr2k +#else + &clblasZsyr2k +#endif + ); } diff --git a/blas_level3/syrk.cc b/blas_level3/syrk.cc index a17f9a4..0a001dc 100644 --- a/blas_level3/syrk.cc +++ b/blas_level3/syrk.cc @@ -3,8 +3,8 @@ #include "../blas.h" #include "../conversions.h" #include "level3.h" -#include "../blas2cuda.h" #include "../runtime-blas.h" +#include "../runtime-mem.hpp" #if USE_CUDA extern cublasHandle_t b2c_cublas_handle; @@ -12,52 +12,49 @@ extern cublasHandle_t b2c_cublas_handle; extern cl_command_queue opencl_cmd_queue; #endif +#if USE_CUDA +extern cublasHandle_t b2c_cublas_handle; +#else +extern cl_command_queue opencl_cmd_queue; +#endif template -void _b2c_syrk(const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE trans, - const int n, const int k, - const T alpha, - const T *a, const int lda, - const T beta, - T *c, const int ldc, - cublasStatus_t syrk_func(cublasHandle_t, +#if USE_CUDA +using syr2k_t = cublasStatus_t (*)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const T *, const T *, int, const T *, - T *, int)) -{ - const T *gpu_a; - T *gpu_c; - int rows_a, cols_a, - rows_c, cols_c; - int size_a, size_c; - cublasFillMode_t cuplo = cu(uplo); - cublasOperation_t ctrans = cu(trans); - const struct objinfo *a_info, *c_info; - - rows_c = ldc; - cols_c = n; - if (trans == CblasNoTrans) { - rows_a = lda; - cols_a = k; - } else { - rows_a = lda; - cols_a = n; - } - - - size_a = size(0, rows_a, cols_a, sizeof(*a)); - size_c = size(0, rows_c, cols_c, sizeof(*c)); + T *, int); +#else +using syrk_t = clblasStatus (*)(clblasOrder order, + clblasUplo uplo, + clblasTranspose transAB, + size_t N, size_t K, + T alpha, + const cl_mem A, size_t offA, size_t lda, + T beta, + cl_mem C, size_t offC, size_t ldc, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); +#endif - gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL); - gpu_c = (T *) b2c_place_on_gpu((void *) c, size_c, &c_info, - (void *) gpu_a, &a_info, - NULL); +template +void _b2c_syrk(const CBLAS_UPLO uplo, + const CBLAS_TRANSPOSE trans, + const int n, const int k, + const S alpha, + const T *a, const int lda, + const S beta, + T *c, const int ldc, + syrk_t syrk_func) +{ + gpuptr gpu_a(a, size(0, lda, k, sizeof *a)); + gpuptr gpu_c(c, size(0, ldc, n, sizeof *c)); call_kernel( +#if USE_CUDA syrk_func(b2c_handle, cuplo, ctrans, n, k, @@ -65,56 +62,153 @@ void _b2c_syrk(const CBLAS_UPLO uplo, gpu_a, lda, &beta, gpu_c, ldc) +#else + syrk_func(clblasColumnMajor, + clb(uplo), clb(trans), + n, k, + alpha, + gpu_a, 0, lda, + beta, + gpu_c, 0, ldc, + 1, &opencl_cmd_queue, 0, NULL, NULL) +#endif ); +} +template +bool syrk_check(const char *func_name, + char *uplo, + char *trans, + int *n, int *k, + T *alpha, + T *a, int *lda, + T *beta, + T *c, int *ldc) { + int nrowa = -1; + int info; + int upper; + + if (runtime_blas_lsame(trans, "N")) + nrowa = *n; + else + nrowa = *k; + upper = runtime_blas_lsame(uplo, "U"); + info = 0; + + if (!upper && !runtime_blas_lsame(uplo, "L")) + info = 1; + else if (!runtime_blas_lsame(trans, "N") && + !runtime_blas_lsame(trans, "T") && + !runtime_blas_lsame(trans, "C")) + info = 2; + else if (*n < 0) + info = 3; + else if (*k < 0) + info = 4; + else if (*lda < std::max(1, nrowa)) + info = 7; + else if (*ldc < std::max(1, *n)) + info = 10; - runtime_fatal_errmsg(cudaGetLastError(), __func__); - - if (!c_info) { - b2c_copy_from_gpu(c, gpu_c, size_c); + if (info != 0) { + runtime_blas_xerbla(func_name, info); + return false; } - b2c_cleanup_gpu_ptr((void *) gpu_a, a_info); - b2c_cleanup_gpu_ptr((void *) gpu_c, c_info); - + // quick return if possible + if (*n == 0 || ((*alpha == 0 || *k == 0) && *beta == 1)) + return false; + + if (*alpha == 0) { + if (upper) { + if (*beta == 0) { + for (int j=1; j<=*n; j++) + for (int i=1; i<=j; i++) + c[IDX2F(i, j, *ldc)] = 0; + } else { + for (int i=1; i<=*n; i++) + for (int j=1; j<=i; j++) + c[IDX2F(i, j, *ldc)] *= *beta; + } + } else { + if (*beta == 0) { + for (int j=1; j<=*n; j++) + for (int i=j; i<=*n; i++) + c[IDX2F(i, j, *ldc)] = 0; + } else { + for (int j=1; j<=*n; j++) + for (int i=j; i<=*n; i++) + c[IDX2F(i, j, *ldc)] *= *beta; + } + } + } + return true; } F77_syrk(s, float) { + if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)) + return; _b2c_syrk(c_uplo(*uplo), c_trans(*trans), *n, *k, *alpha, a, *lda, *beta, c, *ldc, - &cublasSsyrk); +#if USE_CUDA + &cublasSsyrk +#else + &clblasSsyrk +#endif + ); } F77_syrk(d, double) { + if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)) + return; _b2c_syrk(c_uplo(*uplo), c_trans(*trans), *n, *k, *alpha, a, *lda, *beta, c, *ldc, - &cublasDsyrk); +#if USE_CUDA + &cublasDsyrk +#else + &clblasDsyrk +#endif + ); } F77_syrk(c, float _Complex) { + if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)) + return; _b2c_syrk(c_uplo(*uplo), c_trans(*trans), *n, *k, - cu(*alpha), - (cuComplex *) a, *lda, - cu(*beta), - (cuComplex *) c, *ldc, - &cublasCsyrk); + cu2(*alpha), + cmplx_ptr(a), *lda, + cu2(*beta), + cmplx_ptr(c), *ldc, +#if USE_CUDA + &cublasCsyrk +#else + &clblasCsyrk +#endif + ); } F77_syrk(z, double _Complex) { + if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)) + return; _b2c_syrk(c_uplo(*uplo), c_trans(*trans), *n, *k, - cu(*alpha), - (cuDoubleComplex *) a, *lda, - cu(*beta), - (cuDoubleComplex *) c, *ldc, - &cublasZsyrk); + cu2(*alpha), + cmplx_ptr(a), *lda, + cu2(*beta), + cmplx_ptr(c), *ldc, +#if USE_CUDA + &cublasZsyrk +#else + &clblasZsyrk +#endif + ); } diff --git a/blas_level3/trmm.cc b/blas_level3/trmm.cc index 74c3dcd..4c5eb75 100644 --- a/blas_level3/trmm.cc +++ b/blas_level3/trmm.cc @@ -3,26 +3,14 @@ #include "../blas.h" #include "../conversions.h" #include "level3.h" -#include "../blas2cuda.h" #include "../runtime-blas.h" +#include "../runtime-mem.hpp" #if USE_CUDA extern cublasHandle_t b2c_cublas_handle; -#else -extern cl_command_queue opencl_cmd_queue; -#endif - template -void _b2c_trmm(const CBLAS_SIDE side, - const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE transa, - const CBLAS_DIAG diag, - const int m, const int n, - const T alpha, - const T *a, const int lda, - T *b, const int ldb, - cublasStatus_t trmm_func(cublasHandle_t, +using trmm_t = cublasStatus_t (*)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, @@ -31,90 +19,195 @@ void _b2c_trmm(const CBLAS_SIDE side, const T *, const T *, int, const T *, int, - T *, int)) -{ - const T *gpu_a; - T *gpu_b; - int rows_a, cols_a, - rows_b, cols_b; - int size_a, size_b; - cublasSideMode_t cside = cu(side); - cublasFillMode_t cuplo = cu(uplo); - cublasOperation_t ctransa = cu(transa); - cublasDiagType_t cdiag = cu(diag); - const struct objinfo *a_info, *b_info; + T *, int); +#else +extern cl_command_queue opencl_cmd_queue; - cols_a = lda; - rows_a = (side == CblasLeft) ? m : n; - size_a = size(0, rows_a, cols_a, sizeof(*a)); +template +using trmm_t = clblasStatus (*)(clblasOrder order, + clblasSide side, + clblasUplo uplo, + clblasTranspose transA, + clblasDiag diag, + size_t M, size_t N, + T alpha, + const cl_mem A, size_t offA, size_t lda, + cl_mem B, size_t offB, size_t ldb, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, + cl_event *events); +#endif - cols_b = ldb; - rows_b = (side == CblasLeft) ? m : n; - size_b = size(0, rows_b, cols_b, sizeof(*b)); - gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL); - gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, - (void *) gpu_a, &a_info, - NULL); +template +void _b2c_trmm(const CBLAS_SIDE side, + const CBLAS_UPLO uplo, + const CBLAS_TRANSPOSE transa, + const CBLAS_DIAG diag, + const int m, const int n, + const S alpha, + const T *a, const int lda, + T *b, const int ldb, + trmm_t trmm_func) +{ + gpuptr gpu_a(a, size(0, lda, m, sizeof *a)); + gpuptr gpu_b(b, size(0, ldb, m, sizeof *b)); call_kernel( +#if USE_CUDA trmm_func(b2c_handle, - cside, cuplo, - ctransa, cdiag, + cu(side), cu(uplo), + cu(transa), cu(diag), m, n, &alpha, gpu_a, lda, gpu_b, ldb, gpu_b, ldb) +#else + trmm_func(clblasColumnMajor, + clb(side), clb(uplo), + clb(transa), clb(diag), + m, n, + alpha, + gpu_a, 0, lda, + gpu_b, 0, ldb, + 1, &opencl_cmd_queue, + 0, NULL, + NULL) +#endif ); +} +template +bool trmm_check(const char *func_name, + char *side, char *uplo, char *transa, char *diag, + int *m, int *n, + T *alpha, + T *a, int *lda, + T *b, int *ldb) { + int lside = runtime_blas_lsame(side, "L"); + int nrowa; + int upper = runtime_blas_lsame(uplo, "U"); + int info; + + if (lside) + nrowa = *m; + else + nrowa = *n; - runtime_fatal_errmsg(cudaGetLastError(), __func__); + info = 0; - if (!b_info) { - b2c_copy_from_gpu(b, gpu_b, size_b); + if (!lside && !runtime_blas_lsame(side, "R")) + info = 1; + else if (!upper && !runtime_blas_lsame(uplo, "L")) + info = 2; + else if (!runtime_blas_lsame(transa, "N") && + !runtime_blas_lsame(transa, "T") && + !runtime_blas_lsame(transa, "C")) + info = 3; + else if (!runtime_blas_lsame(diag, "U") && !runtime_blas_lsame(diag, "N")) + info = 4; + else if (*m < 0) + info = 5; + else if (*n < 0) + info = 6; + else if (*lda < std::max(1, nrowa)) + info = 9; + else if (*ldb < std::max(1, *m)) + info = 11; + + if (info != 0) { + runtime_blas_xerbla(func_name, info); + return false; } - b2c_cleanup_gpu_ptr((void *) gpu_a, a_info); - b2c_cleanup_gpu_ptr((void *) gpu_b, b_info); + // quick return if possible + if (*m == 0 || *n == 0) + return false; + + // and when alpha == 0 + if (*alpha == 0) { + for (int j=1; j<=*n; j++) + for (int i=1; i<=*m; i++) + b[IDX2F(i, j, *ldb)] = 0; + return false; + } + + return true; } F77_trmm(s, float) { + if (!trmm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, a, lda, b, ldb)) + return; _b2c_trmm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, *alpha, a, *lda, b, *ldb, - &cublasStrmm); +#if USE_CUDA + &cublasStrmm +#else + &clblasStrmm +#endif + ); } F77_trmm(d, double) { + if (!trmm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, a, lda, b, ldb)) + return; _b2c_trmm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, *alpha, a, *lda, b, *ldb, - &cublasDtrmm); +#if USE_CUDA + &cublasDtrmm +#else + &clblasDtrmm +#endif + ); } F77_trmm(c, float _Complex) { + if (!trmm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, a, lda, b, ldb)) + return; _b2c_trmm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, - cu(*alpha), - (cuComplex *) a, *lda, - (cuComplex *) b, *ldb, - &cublasCtrmm); + cu2(*alpha), + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, +#if USE_CUDA + &cublasCtrmm +#else + &clblasCtrmm +#endif + ); } F77_trmm(z, double _Complex) { + if (!trmm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, a, lda, b, ldb)) + return; _b2c_trmm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, - cu(*alpha), - (cuDoubleComplex *) a, *lda, - (cuDoubleComplex *) b, *ldb, - &cublasZtrmm); + cu2(*alpha), + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, +#if USE_CUDA + &cublasZtrmm +#else + &clblasZtrmm +#endif + ); } diff --git a/blas_level3/trsm.cc b/blas_level3/trsm.cc index be4f343..6b49457 100644 --- a/blas_level3/trsm.cc +++ b/blas_level3/trsm.cc @@ -5,6 +5,7 @@ #include "level3.h" #include "../blas2cuda.h" #include "../runtime-blas.h" +#include "../runtime-mem.hpp" #if USE_CUDA extern cublasHandle_t b2c_cublas_handle; @@ -12,106 +13,201 @@ extern cublasHandle_t b2c_cublas_handle; extern cl_command_queue opencl_cmd_queue; #endif +template +#if USE_CUDA +using trsm_t = cublasStatus_t (*)(cublasHandle_t, + cublasSideMode_t, cublasFillMode_t, + cublasOperation_t, cublasDiagType_t, + int, int, + const S *, + const T *, int, + T *, int); +#else +using trsm_t = clblasStatus (*)(clblasOrder order, clblasSide side, + clblasUplo uplo, clblasTranspose transA, + clblasDiag diag, size_t M, size_t N, + S alpha, + const cl_mem A, size_t offA, size_t lda, + cl_mem B, size_t offB, size_t ldb, + cl_uint numCommandQueues, + cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *events); +#endif -template + +template void _b2c_trsm(const CBLAS_SIDE side, const CBLAS_UPLO uplo, const CBLAS_TRANSPOSE transa, const CBLAS_DIAG diag, const int m, const int n, - const T alpha, + const S alpha, const T *a, const int lda, T *b, const int ldb, - cublasStatus_t trsm_func(cublasHandle_t, - cublasSideMode_t, cublasFillMode_t, - cublasOperation_t, cublasDiagType_t, - int, int, - const T *, - const T *, int, - T *, int)) + trsm_t trsm_func) { - const T *gpu_a; - T *gpu_b; - int rows_a, cols_a, - rows_b, cols_b; - int size_a, size_b; - cublasSideMode_t cside = cu(side); - cublasFillMode_t cuplo = cu(uplo); - cublasOperation_t ctransa = cu(transa); - cublasDiagType_t cdiag = cu(diag); - const struct objinfo *a_info, *b_info; - - cols_a = lda; - rows_a = (side == CblasLeft) ? m : n; - size_a = size(0, rows_a, cols_a, sizeof(*a)); - - cols_b = ldb; - rows_b = (side == CblasLeft) ? m : n; - size_b = size(0, rows_b, cols_b, sizeof(*b)); - - gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL); - gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, - (void *) gpu_a, &a_info, - NULL); - + gpuptr gpu_a(a, size(0, lda, side == CblasLeft ? m : n, sizeof *a)); + gpuptr gpu_b(b, size(0, ldb, n, sizeof *b)); call_kernel( +#if USE_CUDA trsm_func(b2c_handle, - cside, cuplo, - ctransa, cdiag, + cu(side), cu(uplo), + cu(transa), cu(diag), m, n, &alpha, gpu_a, lda, gpu_b, ldb) +#else + trsm_func(clblasColumnMajor, + clb(side), clb(uplo), + clb(transa), clb(diag), + m, n, + alpha, + gpu_a, 0, lda, + gpu_b, 0, ldb, + 1, &opencl_cmd_queue, 0, NULL, NULL) +#endif ); +} + + +template +bool trsm_check(const char *func_name, + char *side, char *uplo, + char *transa, char *diag, + int *m, int *n, + T *alpha, + T *a, int *lda, + T *b, int *ldb) { + int lside = runtime_blas_lsame(side, "L"); + int nrowa; + int upper = runtime_blas_lsame(uplo, "U"); + int info; + + if (lside) + nrowa = *m; + else + nrowa = *n; + + info = 0; + if (!lside && !runtime_blas_lsame(side, "R")) + info = 1; + else if (!upper && !runtime_blas_lsame(uplo, "L")) + info = 2; + else if (!runtime_blas_lsame(transa, "N") && + !runtime_blas_lsame(transa, "T") && + !runtime_blas_lsame(transa, "C")) + info = 3; + else if (!runtime_blas_lsame(diag, "U") && !runtime_blas_lsame(diag, "N")) + info = 4; + else if (*m < 0) + info = 5; + else if (*n < 0) + info = 6; + else if (*lda < std::max(1, nrowa)) + info = 9; + else if (*ldb < std::max(1, *m)) + info = 11; - runtime_fatal_errmsg(cudaGetLastError(), __func__); + if (info != 0) { + runtime_blas_xerbla(func_name, info); + return false; + } - if (!b_info) { - b2c_copy_from_gpu(b, gpu_b, size_b); + // quick return if possible + if (*m == 0 || *n == 0) + return false; + + // and when alpha == 0 + if (*alpha == 0) { + for (int j=1; j<=*n; j++) + for (int i=1; i<=*m; i++) + b[IDX2F(i,j, *ldb)] = 0; } - b2c_cleanup_gpu_ptr((void *) gpu_a, a_info); - b2c_cleanup_gpu_ptr((void *) gpu_b, b_info); + return true; } F77_trsm(s, float) { + if (!trsm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, + a, lda, b, ldb)) + return; _b2c_trsm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, *alpha, a, *lda, b, *ldb, - &cublasStrsm); +#if USE_CUDA + &cublasStrsm +#else + &clblasStrsm +#endif + ); } F77_trsm(d, double) { + if (!trsm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, + a, lda, b, ldb)) + return; _b2c_trsm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, *alpha, a, *lda, b, *ldb, - &cublasDtrsm); +#if USE_CUDA + &cublasDtrsm +#else + &clblasDtrsm +#endif + ); } F77_trsm(c, float _Complex) { + if (!trsm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, + a, lda, b, ldb)) + return; _b2c_trsm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, cu(*alpha), - (cuComplex *)a, *lda, - (cuComplex *)b, *ldb, - &cublasCtrsm); + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, +#if USE_CUDA + &cublasCtrsm +#else + &clblasCtrsm +#endif + ); } F77_trsm(z, double _Complex) { + if (!trsm_check(__func__, + side, uplo, transa, diag, + m, n, alpha, + a, lda, b, ldb)) + return; _b2c_trsm(c_side(*side), c_uplo(*uplo), c_trans(*transa), c_diag(*diag), *m, *n, cu(*alpha), - (cuDoubleComplex *)a, *lda, - (cuDoubleComplex *)b, *ldb, - &cublasZtrsm); + cmplx_ptr(a), *lda, + cmplx_ptr(b), *ldb, +#if USE_CUDA + &cublasZtrsm +#else + &clblasZtrsm +#endif + ); } diff --git a/conversions.h b/conversions.h index dc73419..b5b8b68 100644 --- a/conversions.h +++ b/conversions.h @@ -113,6 +113,9 @@ static inline cuDoubleComplex cu(double r, double i) { return (cuDoubleComplex) { .x = r, .y = i }; } +static inline cuComplex cu2(float _Complex f) { return cu(f); } +static inline cuDoubleComplex cu2(double _Complex d) { return cu(d); } + static inline cublasOperation_t cu(CBLAS_TRANSPOSE trans) { switch (trans) { case CblasNoTrans: diff --git a/lib/obj_tracker.h b/lib/obj_tracker.h index 24eea1d..1785432 100644 --- a/lib/obj_tracker.h +++ b/lib/obj_tracker.h @@ -195,6 +195,9 @@ void internal_free(void *ptr); #ifdef __cplusplus }; +/** + * RAII for the object tracker + */ struct objtracker_guard { objtracker_guard() { obj_tracker_internal_enter(); } ~objtracker_guard() { obj_tracker_internal_leave(); } diff --git a/meson.build b/meson.build index 488d756..6b9ba1f 100644 --- a/meson.build +++ b/meson.build @@ -1,25 +1,28 @@ -project('gpublas', 'c', 'cpp', 'fortran', +project('libgpublas', 'c', 'cpp', 'fortran', default_options: ['c_std=gnu11', 'cpp_std=gnu++17'], version: '0.1', license: 'GPL3+', - meson_version: '>= 0.42.0') + meson_version: '>= 0.43.0') cc = meson.get_compiler('c') root_inc = include_directories('.') -c_args = [ +optflags = get_option('optflags') + +c_args = cc.get_supported_arguments([ '-Wall', '-Wextra', '-Wformat=2', '-Werror', '-Wno-unused-parameter', - '-Winit-self', - get_option('optflags'), + '-Wnull-dereference', + '-D_GLIBCXX_ASSERTIONS', + optflags, '-g', '-ggdb3', -] +]) -link_args = '-Wl,-init,blas2cuda_init,-fini,blas2cuda_fini,-eentry' +link_args = ['-Wl,-init,blas2cuda_init,-fini,blas2cuda_fini,-eentry'] sources = files( 'blas2cuda.c', @@ -78,12 +81,16 @@ blas_level3_sources = files( 'blas_level3/her2k.cc', 'blas_level3/herk.cc', 'blas_level3/symm.cc', - # 'blas_level3/syr2k.cc', - # 'blas_level3/syrk.cc', - # 'blas_level3/trmm.cc', - # 'blas_level3/trsm.cc', + 'blas_level3/syr2k.cc', + 'blas_level3/syrk.cc', + # TODO: cannot enable until https://github.com/clMathLibraries/clBLAS/issues/341 is resolved + # 'blas_level3/trmm.cc', + 'blas_level3/trsm.cc', ) +runtime = '' +prefix = get_option('prefix') + cudadir = get_option('CUDA') cuda_lib_dirs = [cudadir + '/lib64', cudadir + '/lib64/stubs'] @@ -100,19 +107,21 @@ gpu_libs = [] gpu_inc = [] gpu_srcs = [] -if not get_option('opencl') - libcudart_dep = cc.find_library('cudart', dirs: cuda_lib_dirs, required: false) - libcuda_dep = cc.find_library('cuda', dirs: cuda_lib_dirs, required: false) - libcublas_dep = cc.find_library('cublas', dirs: cuda_lib_dirs, required: false) +# select the supported runtime +if get_option('runtime') == 'auto' or get_option('backend') == 'cuda' + libcudart_dep = cc.find_library('cudart', dirs: cuda_lib_dirs, required: get_option('backend') == 'cuda') + libcuda_dep = cc.find_library('cuda', dirs: cuda_lib_dirs, required: get_option('backend') == 'cuda') + libcublas_dep = cc.find_library('cublas', dirs: cuda_lib_dirs, required: get_option('backend') == 'cuda') endif -if not get_option('opencl') and (libcublas_dep.found() and libcuda_dep.found() and libcudart_dep.found()) +if get_option('runtime') != 'opencl' and (libcublas_dep.found() and libcuda_dep.found() and libcudart_dep.found()) # use CUDA cuda_inc = include_directories(cudadir + '/include') gpu_libs = [libcublas_dep, libcuda_dep, libcudart_dep] gpu_inc = [cuda_inc] c_args += '-DUSE_CUDA' link_args += ',-rpath='+cudadir+'/lib64' + runtime = 'CUDA' else #use OpenCL libcl_dep = dependency('OpenCL') @@ -128,6 +137,7 @@ else command: [find_program('./clblas_ext.py'), '@INPUT@']) gpu_srcs += [clext_h, clblas_ext_h] c_args += '-DUSE_OPENCL' + runtime = 'OpenCL' endif if not get_option('blas_opt') @@ -150,3 +160,19 @@ libgpublas = library('blas2cuda', gpu_srcs + sources + blas_level1_sources + ) subdir('tests/netlib') + +output = [ + '', + '', + ' libgpublas ' + meson.project_version(), + ' ===============', + '', + ' Prefix....................... ' + prefix, + ' Runtime...................... ' + runtime, + ' C/C++ flags.................. ' + ' '.join(c_args), + '', + ' Now type \'ninja -C ' + meson.build_root() + '\' to build ' + meson.project_name(), + '', + '', +] +message('\n'.join(output)) diff --git a/meson_options.txt b/meson_options.txt index 95cb1e1..8e94a59 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1,4 +1,4 @@ option('CUDA', type: 'string', value: '/opt/cuda', description: 'Path to CUDA libraries and headers') -option('opencl', type: 'boolean', value: false, description: 'Use OpenCL even if CUDA is present') -option('optflags', type: 'string', value: '-O3', description: 'Optimization flags') +option('runtime', type: 'combo', choices: ['auto', 'opencl', 'cuda'], value: 'auto', description: '') +option('optflags', type: 'string', value: '-O2', description: 'Optimization flags') option('blas_opt', type: 'boolean', value: true, description: 'Optimize when to use GPU for BLAS calls.') diff --git a/runtime-blas.h b/runtime-blas.h index 3add98d..3d20298 100644 --- a/runtime-blas.h +++ b/runtime-blas.h @@ -63,7 +63,7 @@ static inline const char *runtime_blas_error_msg(runtime_blas_error_t error) { } #else -#error "Only CUDA and OpenCL are supported" +#error "Only CUDA and OpenCL are supported. Define either USE_OPENCL or USE_CUDA" #endif #ifdef __cplusplus diff --git a/runtime-mem.hpp b/runtime-mem.hpp index 71d3e7b..3a9456a 100644 --- a/runtime-mem.hpp +++ b/runtime-mem.hpp @@ -6,14 +6,17 @@ #include #include +extern size_t b2c_hits, b2c_misses; + #if USE_OPENCL extern cl_command_queue opencl_cmd_queue; +extern cl_context opencl_ctx; #endif /** - * + * RAII for GPU buffers. */ -template +template ::value> class gpuptr { private: T *host_ptr; @@ -28,22 +31,11 @@ class gpuptr { const struct objinfo *o_info; #if USE_OPENCL - // if the type is const - template ::value, int> = 0> - cl_mem_flags get_mem_flags() { return CL_MEM_READ_ONLY; } - - // if the type is non-const - template ::value, int> = 0> - cl_mem_flags get_mem_flags() { return CL_MEM_READ_WRITE; } - + cl_mem_flags get_mem_flags() { return is_const ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE; } #endif gpuptr(T *host_ptr, size_t size) : host_ptr(host_ptr), size(size), gpu_ptr(0), grabbed(false), o_info(0) { runtime_error_t err; - extern size_t b2c_hits, b2c_misses; -#if USE_OPENCL - extern cl_context opencl_ctx; -#endif objtracker_guard guard; if (size == 0) { @@ -51,7 +43,7 @@ class gpuptr { #if USE_CUDA err = runtime_malloc((void **)&this->gpu_ptr, dummy_size); #elif USE_OPENCL - this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), dummy_size, NULL, &err); + this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), dummy_size, NULL, &err); #endif if (runtime_is_error(err)) { writef(STDERR_FILENO, "blas2cuda: failed to allocate %zu B on device: %s\n", @@ -66,7 +58,7 @@ class gpuptr { #if USE_CUDA err = runtime_malloc((void **)&this->gpu_ptr, size); #else - this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), size, NULL, &err); + this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), size, NULL, &err); #endif if (runtime_is_error(err)) { @@ -76,18 +68,12 @@ class gpuptr { } } else if ((this->o_info = obj_tracker_objinfo_subptr((void *)host_ptr))) { // host_ptr is already shared with GPU - assert(this->o_info->ptr == host_ptr); - err = runtime_svm_unmap((void *)host_ptr); - if (runtime_is_error(err)) { - writef(STDERR_FILENO, "blas2cuda: failed to unmap %p from host: %s\n", - host_ptr, runtime_error_string(err)); - abort(); - } #if USE_CUDA this->gpu_ptr = host_ptr; #else // create a buffer that is backed by SVM - this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_USE_HOST_PTR, size, (void *)host_ptr, &err); + assert(this->o_info->ptr == host_ptr && "cannot create CL buffer within CL buffer"); + this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_USE_HOST_PTR, size, (void *)host_ptr, &err); if (runtime_is_error(err)) { writef(STDERR_FILENO, "blas2cuda: failed to create a buffer backed by %p: %s\n", host_ptr, runtime_error_string(err)); @@ -100,7 +86,7 @@ class gpuptr { #if USE_CUDA err = runtime_malloc((void **)&this->gpu_ptr, size); #else - this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_COPY_HOST_PTR, size, (void *)host_ptr, &err); + this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_COPY_HOST_PTR, size, (void *)host_ptr, &err); #endif if (runtime_is_error(err)) { @@ -127,37 +113,35 @@ class gpuptr { } } - // if the type is const, do nothing - template ::value, int> = 0> - void cleanup_unmanaged() { } - - // if the type is non-const - template ::value, int> = 0> +private: void cleanup_unmanaged() { - runtime_error_t err; - - // copy the GPU buffer back to host - if (this->grabbed) { -#if USE_CUDA - err = runtime_memcpy_dtoh(this->host_ptr, this->gpu_ptr, this->size); -#else - err = clEnqueueReadBuffer(opencl_cmd_queue, this->gpu_ptr, CL_TRUE, 0, this->size, this->host_ptr, 0, NULL, NULL); -#endif - if (runtime_is_error(err)) { - writef(STDERR_FILENO, "blas2cuda: failed to copy %zu B from %p (GPU) ---> %p (CPU): %s\n", - this->size, this->gpu_ptr, this->host_ptr, runtime_error_string(err)); - abort(); + if (!is_const) { + runtime_error_t err; + + // copy the GPU buffer back to host + if (this->grabbed) { + #if USE_CUDA + err = runtime_memcpy_dtoh(this->host_ptr, this->gpu_ptr, this->size); + #else + err = clEnqueueReadBuffer(opencl_cmd_queue, this->gpu_ptr, CL_TRUE, 0, this->size, (void *) this->host_ptr, 0, NULL, NULL); + #endif + if (runtime_is_error(err)) { + writef(STDERR_FILENO, "blas2cuda: failed to copy %zu B from %p (GPU) ---> %p (CPU): %s\n", + this->size, this->gpu_ptr, this->host_ptr, runtime_error_string(err)); + abort(); + } } } } +public: ~gpuptr() { runtime_error_t err = RUNTIME_ERROR_SUCCESS; objtracker_guard guard; if (!this->o_info) { if (this->size > 0) - this->cleanup_unmanaged(); + this->cleanup_unmanaged(); // free the temporary GPU buffer #if USE_CUDA err = runtime_free((void *) this->gpu_ptr); @@ -180,6 +164,7 @@ class gpuptr { } } + // TODO: return const cl_mem if underlying buffer is constant #if USE_CUDA operator T*() { #else diff --git a/runtime.c b/runtime.c index c172231..6700156 100644 --- a/runtime.c +++ b/runtime.c @@ -14,6 +14,7 @@ struct opencl_device { char *name; cl_device_type type; cl_device_svm_capabilities svm_capabilities; + bool is_valid; }; struct opencl_platform { @@ -21,6 +22,7 @@ struct opencl_platform { char *name; cl_uint num_devices; struct opencl_device *devices; + bool is_valid; }; struct opencl_platform *opencl_platforms; @@ -39,58 +41,67 @@ struct opencl_platform *runtime_get_platforms(cl_int *err_in, cl_uint *nplatform if ((*err_in = clGetPlatformIDs(num_platforms, platform_ids, NULL)) != CL_SUCCESS) goto end; + // get platform info for (cl_uint p = 0; p < num_platforms; p++) { + struct opencl_platform *const curr_platform = &platforms[p]; size_t platform_name_sz; - platforms[p].id = platform_ids[p]; - if ((*err_in = clGetDeviceIDs(platforms[p].id, CL_DEVICE_TYPE_ALL, 0, NULL, &platforms[p].num_devices)) != CL_SUCCESS) - goto end; - platforms[p].devices = calloc(platforms[p].num_devices, sizeof *platforms[p].devices); - device_ids = realloc(device_ids, platforms[p].num_devices * sizeof *device_ids); - if ((*err_in = clGetDeviceIDs(platforms[p].id, CL_DEVICE_TYPE_ALL, platforms[p].num_devices, device_ids, NULL)) != CL_SUCCESS) - goto end; + curr_platform->id = platform_ids[p]; - if ((*err_in = clGetPlatformInfo(platforms[p].id, CL_PLATFORM_NAME, 0, NULL, &platform_name_sz)) != CL_SUCCESS) + if ((*err_in = clGetDeviceIDs(curr_platform->id, CL_DEVICE_TYPE_ALL, 0, NULL, &curr_platform->num_devices)) != CL_SUCCESS) goto end; - platforms[p].name = calloc(1, platform_name_sz); - if ((*err_in = clGetPlatformInfo(platforms[p].id, CL_PLATFORM_NAME, platform_name_sz, platforms[p].name, NULL)) != CL_SUCCESS) - goto end; - writef(STDOUT_FILENO, "Platform [%u] = %s\n", p, platforms[p].name); - - for (cl_uint d = 0; d < platforms[p].num_devices; ++d) { + curr_platform->devices = calloc(curr_platform->num_devices, sizeof *curr_platform->devices); + device_ids = realloc(device_ids, curr_platform->num_devices * sizeof *device_ids); + if ((*err_in = clGetDeviceIDs(curr_platform->id, CL_DEVICE_TYPE_ALL, curr_platform->num_devices, device_ids, NULL)) != CL_SUCCESS) + continue; + + if ((*err_in = clGetPlatformInfo(curr_platform->id, CL_PLATFORM_NAME, 0, NULL, &platform_name_sz)) != CL_SUCCESS) + continue; + curr_platform->name = calloc(1, platform_name_sz); + if ((*err_in = clGetPlatformInfo(curr_platform->id, CL_PLATFORM_NAME, platform_name_sz, curr_platform->name, NULL)) != CL_SUCCESS) + continue; + writef(STDOUT_FILENO, "Platform [%u] = %s\n", p, curr_platform->name); + + // get device info + for (cl_uint d = 0; d < curr_platform->num_devices; ++d) { + struct opencl_device *const curr_device = &curr_platform->devices[d]; size_t device_name_sz; - platforms[p].devices[d].id = device_ids[d]; + curr_device->id = device_ids[d]; if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, 0, NULL, &device_name_sz)) != CL_SUCCESS) - goto end; - platforms[p].devices[d].name = calloc(1, device_name_sz); + continue; + curr_device->name = calloc(1, device_name_sz); if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, device_name_sz, - platforms[p].devices[d].name, + curr_device->name, NULL)) != CL_SUCCESS) - goto end; + continue; if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_TYPE, - sizeof platforms[p].devices[d].type, - &platforms[p].devices[d].type, + sizeof curr_device->type, + &curr_device->type, NULL)) != CL_SUCCESS) continue; - writef(STDOUT_FILENO, " Device [%u] = %s (%s)\n", d, platforms[p].devices[d].name, clDeviceTypeGetString(platforms[p].devices[d].type)); - writef(STDOUT_FILENO, " SVM capabilities:\n"); + writef(STDOUT_FILENO, " Device [%u] = %s (%s)\n", d, curr_device->name, clDeviceTypeGetString(curr_device->type)); + writef(STDOUT_FILENO, " SVM capabilities:\n"); if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_SVM_CAPABILITIES, - sizeof platforms[p].devices[d].svm_capabilities, - &platforms[p].devices[d].svm_capabilities, + sizeof curr_device->svm_capabilities, + &curr_device->svm_capabilities, NULL)) != CL_SUCCESS) continue; + + curr_device->is_valid = true; - writef(STDOUT_FILENO, " Course-grained buffer?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "yes" : "no"); - writef(STDOUT_FILENO, " Fine-grained buffer?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "yes" : "no"); - writef(STDOUT_FILENO, " Fine-grained system?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "yes" : "no"); - writef(STDOUT_FILENO, " Atomics?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_ATOMICS ? "yes" : "no"); + writef(STDOUT_FILENO, " Course-grained buffer?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "yes" : "no"); + writef(STDOUT_FILENO, " Fine-grained buffer?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "yes" : "no"); + writef(STDOUT_FILENO, " Fine-grained system?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "yes" : "no"); + writef(STDOUT_FILENO, " Atomics?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_ATOMICS ? "yes" : "no"); } + + curr_platform->is_valid = true; } end: @@ -104,44 +115,78 @@ struct opencl_platform *runtime_get_platforms(cl_int *err_in, cl_uint *nplatform } void opencl_platform_cleanup(struct opencl_platform platform) { - for (cl_uint d = 0; d < platform.num_devices; d++) + for (cl_uint d = 0; platform.devices && d < platform.num_devices; d++) free(platform.devices[d].name); free(platform.devices); free(platform.name); } -#endif +#endif /* USE_OPENCL */ runtime_error_t runtime_init(runtime_init_info_t info) { #if USE_CUDA return cudaSuccess; #else runtime_error_t err; + bool have_platform = false; opencl_platforms = runtime_get_platforms(&err, &num_platforms); - if (runtime_is_error(err)) { - writef(STDERR_FILENO, "blas2cuda: %s: failed to get OpenCL runtimes - %s\n", __func__, runtime_error_string(err)); + for (cl_uint p = 0; p < num_platforms; ++p) + if (opencl_platforms[p].is_valid) { + have_platform = true; + break; + } + + if (!have_platform) { + writef(STDERR_FILENO, "blas2cuda: %s: failed to get OpenCL platforms - %s\n", __func__, runtime_error_string(err)); abort(); } // create a context for a platform and device - opencl_ctx = clCreateContext( - (cl_context_properties[]){ - CL_CONTEXT_PLATFORM, - (cl_context_properties) opencl_platforms[info.platform].id, - 0 - }, 1, - &opencl_platforms[info.platform].devices[info.device].id, NULL, - NULL, &err); - - if (runtime_is_error(err)) + struct opencl_platform *selected_platform = NULL; + struct opencl_device *selected_device = NULL; + for (cl_uint p = 0; p < num_platforms && !selected_device; p++) { + struct opencl_platform *const curr_platform = &opencl_platforms[p]; + if (!curr_platform->is_valid) + continue; + for (cl_uint d = 0; d < curr_platform->num_devices && !selected_device; d++) { + struct opencl_device *const curr_device = &curr_platform->devices[d]; + if (!curr_device->is_valid) + continue; + opencl_ctx = clCreateContext( + (cl_context_properties[]){ + CL_CONTEXT_PLATFORM, + (cl_context_properties) curr_platform->id, + 0 + }, 1, + &curr_device->id, NULL, + NULL, &err); + if (runtime_is_error(err)) + return err; + // we will break now that opencl_ctx has been initialized + selected_device = curr_device; + selected_platform = curr_platform; + if (p != info.platform) + writef(STDERR_FILENO, "blas2cuda: %s: WARNING: could not select platform #%d (%s)\n", __func__, info.platform, curr_platform->name); + if (d != info.device) + writef(STDERR_FILENO, "blas2cuda: %s: WARNING: could not select device #%d (%s)\n", __func__, info.device, curr_device->name); + } + } + + if (!selected_device) { + writef(STDERR_FILENO, "blas2cuda: %s: FATAL: could not select a device\n", __func__); return err; + } // create a command queue for the device opencl_cmd_queue = clCreateCommandQueueWithProperties( - opencl_ctx, opencl_platforms[info.platform].devices[info.device].id, + opencl_ctx, selected_device->id, (cl_queue_properties[]) { 0 }, &err); + + if (!runtime_is_error(err)) + writef(STDOUT_FILENO, "blas2cuda: %s: selected %s [%s]\n", + __func__, selected_platform->name, selected_device->name); return err; #endif diff --git a/tests/netlib/meson.build b/tests/netlib/meson.build index 939d621..e961a23 100644 --- a/tests/netlib/meson.build +++ b/tests/netlib/meson.build @@ -12,10 +12,10 @@ sh = find_program('./test.py') libm_dep = cc.find_library('m') libft_dep = cc.find_library('gfortran') -foreach prefix : exe_prefixes - exe = executable(prefix + '-blas', [prefix + '.f'], include_directories: incdir, - dependencies: [libblas_dep, libm_dep, libft_dep], c_args: ['-O0', '-g', '-ggdb3']) - test(prefix + '-' + 'correctness', sh, - args: [prefix, exe, files('input.'+prefix), libgpublas, '--nointeractive'], +foreach exe_prefix : exe_prefixes + exe = executable(exe_prefix + '-blas', [exe_prefix + '.f'], include_directories: incdir, + dependencies: [libblas_dep, libm_dep, libft_dep], c_args: ['-O0', '-g', '-ggdb3'], link_with: [libgpublas]) + test(exe_prefix + '-' + 'correctness', sh, + args: [exe_prefix, exe, files('input.'+exe_prefix), libgpublas, '--nointeractive'], timeout: 300) endforeach diff --git a/tests/netlib/test.py b/tests/netlib/test.py index 6c02f9d..9ddd381 100755 --- a/tests/netlib/test.py +++ b/tests/netlib/test.py @@ -12,6 +12,7 @@ parser.add_argument('input', help='input file to test binary') parser.add_argument('gpublas', help='location of libgpublas') parser.add_argument('--nointeractive', action='store_true') + parser.add_argument('--valgrind', action='store_true') args = parser.parse_args() args.input = os.path.abspath(args.input) @@ -38,11 +39,24 @@ os._exit(126) else: try: - subprocess.run(['gdb', args.binary, '-ex', f'set exec-wrapper env LD_PRELOAD={args.gpublas}', '-ex', f'run <{args.input}'], check=True) + proc = None + if args.valgrind: + proc = subprocess.Popen(f'valgrind --vgdb=yes --vgdb-error=0 {args.binary} <{args.input} &>/dev/null', shell=True) + try: + proc.wait(3) + except: + pass + subprocess.run(['gdb', args.binary, '-ex', 'set non-stop off', '-ex', 'target remote | vgdb'], check=True) + proc.wait() + else: + subprocess.run(['gdb', args.binary, '-ex', f'set exec-wrapper env LD_PRELOAD={args.gpublas}', '-ex', f'run <{args.input}'], check=True) subprocess.run(['edit', f'{args.test.upper()}.SUMM'], check=True) except KeyboardInterrupt: - pass + if proc: + proc.kill() except: + if proc: + proc.kill() os._exit(126) print(f'Done testing {args.test}') os._exit(retval)