Skip to content

Commit

Permalink
Fix kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
Prince781 committed Feb 22, 2020
1 parent e7e410d commit ebbf547
Show file tree
Hide file tree
Showing 15 changed files with 793 additions and 360 deletions.
15 changes: 14 additions & 1 deletion blas.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,6 @@ F77_symm(d, double);
F77_symm(c, float _Complex);
F77_symm(z, double _Complex);


#define F77_syrk(prefix, T) \
void prefix##syrk_(char *uplo, char *trans, \
int *n, int *k, \
Expand All @@ -283,6 +282,10 @@ void prefix##syr2k_(char *uplo, char *trans, \
T *beta, \
T *c, int *ldc)

/* Prototypes for the four syr2k_ entry points, one per element type.
 * The F77_syr2k macro pastes the prefix onto `syr2k_`, giving
 * ssyr2k_, dsyr2k_, csyr2k_ and zsyr2k_. */
F77_syr2k(s, float);
F77_syr2k(d, double);
F77_syr2k(c, float _Complex);
F77_syr2k(z, double _Complex);

#define F77_trmm(prefix, T) \
void prefix##trmm_(char *side, \
Expand All @@ -292,6 +295,11 @@ void prefix##trmm_(char *side, \
T *a, int *lda, \
T *b, int *ldb)

/* Prototypes for the four trmm_ entry points, one per element type.
 * The F77_trmm macro pastes the prefix onto `trmm_`, giving
 * strmm_, dtrmm_, ctrmm_ and ztrmm_. */
F77_trmm(s, float);
F77_trmm(d, double);
F77_trmm(c, float _Complex);
F77_trmm(z, double _Complex);

#define F77_trsm(prefix, T) \
void prefix##trsm_(char *side, char *uplo, \
char *transa, char *diag, \
Expand All @@ -300,6 +308,11 @@ void prefix##trsm_(char *side, char *uplo, \
T *a, int *lda, \
T *b, int *ldb)

/* Prototypes for the four trsm_ entry points, one per element type.
 * The F77_trsm macro pastes the prefix onto `trsm_`, giving
 * strsm_, dtrsm_, ctrsm_ and ztrsm_. */
F77_trsm(s, float);
F77_trsm(d, double);
F77_trsm(c, float _Complex);
F77_trsm(z, double _Complex);

/* Error-reporting hook; the definition is not visible here.
 * NOTE(review): presumably `routine` names the calling BLAS routine and
 * `arg_pos` is the position of the offending argument, in the style of the
 * reference BLAS xerbla -- confirm against the definition. */
void _b2c_xerbla(const char *routine, int arg_pos);

#ifdef __cplusplus
Expand Down
21 changes: 2 additions & 19 deletions blas_level3/hemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ void _b2c_hemm(const CBLAS_SIDE side,
gpuptr<const T> gpu_b(b, size(0, ldb, n, sizeof(*b)));
gpuptr<T> gpu_c(c, size(0, ldc, n, sizeof(*c)));


call_kernel(
#if USE_CUDA
hemm_func(b2c_cublas_handle,
Expand Down Expand Up @@ -122,48 +121,32 @@ F77_hemm(c, float _Complex) {
hemm_check();
_b2c_hemm(c_side(*side), c_uplo(*uplo),
*m, *n,
#if USE_CUDA
cu(*alpha),
#else
cu2(*alpha),
#endif
cmplx_ptr(a), *lda,
cmplx_ptr(b), *ldb,
#if USE_CUDA
cu(*beta),
#else
cu2(*beta),
#endif
cmplx_ptr(c), *ldc,
#if USE_CUDA
&cublasChemm
#else
&clblasChemm
#endif
);
);
}

// Double-precision complex hemm entry point, opened by the F77_hemm macro
// (the macro and hemm_check() are defined outside this view; presumably this
// is zhemm_ -- confirm). After hemm_check(), the dereferenced arguments are
// forwarded to _b2c_hemm together with the backend routine to call.
F77_hemm(z, double _Complex) {
hemm_check();
_b2c_hemm(c_side(*side), c_uplo(*uplo),
*m, *n,
// alpha is converted with cu() when USE_CUDA is set and with cu2() otherwise.
#if USE_CUDA
cu(*alpha),
#else
cu2(*alpha),
#endif
cmplx_ptr(a), *lda,
cmplx_ptr(b), *ldb,
// beta gets the same backend-dependent conversion as alpha.
#if USE_CUDA
cu(*beta),
#else
cu2(*beta),
#endif
cmplx_ptr(c), *ldc,
// Backend routine: cublasZhemm under USE_CUDA, clblasZhemm otherwise.
#if USE_CUDA
&cublasZhemm
#else
&clblasZhemm
#endif
);
// NOTE(review): this second `);` has no open call left to close. The text
// comes from a diff view, so one of the two `);` lines is most likely the
// removed/added twin of the other -- confirm against the repository.
);
}
214 changes: 146 additions & 68 deletions blas_level3/syr2k.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,136 +3,214 @@
#include "../blas.h"
#include "../conversions.h"
#include "level3.h"
#include "../blas2cuda.h"
#include "../runtime-blas.h"
#include "../runtime-mem.hpp"

// Backend object used by the kernels in this file; it is only declared here
// (extern), so the definition lives in another translation unit.
// With USE_CUDA set it is a cuBLAS handle, otherwise an OpenCL command queue.
#if USE_CUDA
extern cublasHandle_t b2c_cublas_handle;
#else
extern cl_command_queue opencl_cmd_queue;
#endif


// NOTE(review): this span interleaves two versions of the code, as rendered
// by the diff view the text was captured from, and does not parse as written:
//   * a `template <typename T>` signature and body for _b2c_syr2k that takes a
//     cuBLAS function pointer and places a/b/c on the GPU by hand, and
//   * a `template <typename T, typename S>` alias `syr2k_t` naming the
//     function-pointer type of the backend routine (cuBLAS when USE_CUDA is
//     set, clBLAS otherwise).
// Confirm against the repository which lines belong to the current file
// before editing anything here.
template <typename T>
void _b2c_syr2k(const CBLAS_UPLO uplo,
const CBLAS_TRANSPOSE trans,
const int n, const int k,
const T alpha,
const T *a, const int lda,
const T *b, const int ldb,
const T beta,
T *c, const int ldc,
cublasStatus_t syr2k_func(cublasHandle_t,
// The alias template starts here; the parameter-type lines that follow are
// shared textually with the function-pointer parameter opened just above.
template <typename T, typename S>
#if USE_CUDA
using syr2k_t = cublasStatus_t (*)(cublasHandle_t,
cublasFillMode_t,
cublasOperation_t,
int, int,
const T *,
const T *, int,
const T *, int,
const T *,
// `T *, int))` closes the function-pointer parameter and the parameter list of
// the one-parameter signature; the alias is closed by `T *, int);` further down.
T *, int))
{
const T *gpu_a, *gpu_b;
T *gpu_c;
int rows_a, cols_a,
rows_b, cols_b,
rows_c, cols_c;
int size_a, size_b, size_c;
cublasFillMode_t cuplo = cu(uplo);
cublasOperation_t ctrans = cu(trans);
const struct objinfo *a_info, *b_info, *c_info;

// C is sized ldc x n. A and B are sized ld x k without transposition and
// ld x n otherwise.
rows_c = ldc;
cols_c = n;
if (trans == CblasNoTrans) {
rows_a = lda;
cols_a = k;
rows_b = ldb;
cols_b = k;
} else {
rows_a = lda;
cols_a = n;
rows_b = ldb;
cols_b = n;
}

T *, int);
#else
// clBLAS flavour of the alias: scalars are passed by value as S, matrices as
// cl_mem plus an offset and leading dimension, followed by the command-queue
// and event arguments.
using syr2k_t = clblasStatus (*)(clblasOrder order,
clblasUplo uplo, clblasTranspose transAB,
size_t N, size_t K,
S alpha,
const cl_mem A, size_t offA, size_t lda,
const cl_mem B, size_t offB, size_t ldb,
S beta,
cl_mem C, size_t offC, size_t ldc,
cl_uint numCommandQueues, cl_command_queue *commandQueues,
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events);
#endif

// NOTE(review): the statements below use the size_*/gpu_*/*_info locals
// declared in the brace-delimited body above, so they appear to continue that
// body rather than follow the alias.
size_a = size(0, rows_a, cols_a, sizeof(*a));
size_b = size(0, rows_b, cols_b, sizeof(*b));
size_c = size(0, rows_c, cols_c, sizeof(*c));

// Each b2c_place_on_gpu call is also handed the previously placed pointers and
// their info records, terminated by NULL (helper semantics not visible here).
gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL);
gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info,
(void *) gpu_a, &a_info,
NULL);
gpu_c = (T *) b2c_place_on_gpu((void *) c, size_c, &c_info,
(void *) gpu_a, &a_info,
(void *) gpu_b, &b_info,
NULL);
// Forwards one syr2k call to the backend routine `syr2k_func`.
//
// T is the element type of the matrices and S the type of the scalars alpha
// and beta. a, b and c are wrapped in gpuptr objects (declared elsewhere;
// presumably they manage the device-side copy -- confirm) whose size argument
// is computed by size() from the leading dimension, a column count and the
// element size. The column count for a and b is k when trans is CblasNoTrans
// and n otherwise; for c it is always n.
template <typename T, typename S>
void _b2c_syr2k(const CBLAS_UPLO uplo,
const CBLAS_TRANSPOSE trans,
const int n, const int k,
const S alpha,
const T *a, const int lda,
const T *b, const int ldb,
const S beta,
T *c, const int ldc,
syr2k_t<T,S> syr2k_func)
{
gpuptr<const T> gpu_a(a, size(0, lda, trans == CblasNoTrans ? k : n, sizeof *a));
gpuptr<const T> gpu_b(b, size(0, ldb, trans == CblasNoTrans ? k : n, sizeof *b));
gpuptr<T> gpu_c(c, size(0, ldc, n, sizeof *c));

call_kernel(
// NOTE(review): the next two lines use b2c_handle, cuplo and ctrans, none of
// which is declared in this function, and they open a second syr2k_func( call
// ahead of the #if-selected one. They look like pre-change lines kept by the
// diff view -- confirm against the repository.
syr2k_func(b2c_handle,
cuplo, ctrans,
#if USE_CUDA
// cuBLAS form: handle first, alpha and beta passed by address.
syr2k_func(b2c_cublas_handle,
cu(uplo), cu(trans),
n, k,
&alpha,
gpu_a, lda,
gpu_b, ldb,
&beta,
gpu_c, ldc)
#else
// clBLAS form: column-major order, alpha and beta passed by value, zero
// offsets into each buffer, one command queue and no events.
syr2k_func(clblasColumnMajor,
clb(uplo), clb(trans),
n, k,
alpha,
gpu_a, 0, lda,
gpu_b, 0, ldb,
beta,
gpu_c, 0, ldc,
1, &opencl_cmd_queue, 0, NULL, NULL)
#endif
);
}

// Argument validation and no-kernel shortcuts for the syr2k entry points.
//
// func_name is handed to runtime_blas_xerbla when a check fails; the other
// parameters are the pointer-style arguments of the entry point itself.
// is_complex defaults to true when T is float _Complex or double _Complex.
//
// Returns true when the caller should go on to launch the backend kernel.
// Returns false when
//   * an argument failed validation (reported through runtime_blas_xerbla),
//   * the early-exit test on n, alpha, k and beta fires, or
//   * C was zeroed or scaled by beta directly in this function because alpha
//     or k is zero.
template <typename T, bool is_complex = std::is_same<T, float _Complex>::value || std::is_same<T, double _Complex>::value>
bool syr2k_check(const char *func_name,
char *uplo,
char *trans,
int *n, int *k,
T *alpha,
T *a, int *lda,
T *b, int *ldb,
T *beta,
T *c, int *ldc) {
// Lower bound used for lda and ldb: n when trans is "N", k otherwise.
int nrowa = runtime_blas_lsame(trans, "N") ? *n : *k;
int upper = runtime_blas_lsame(uplo, "U");
// 1-based position (counting from uplo) of the first bad argument; 0 if none.
int info = 0;


// NOTE(review): a CUDA error check in the middle of argument validation looks
// out of place; this is probably a pre-change line kept by the diff view --
// confirm against the repository.
runtime_fatal_errmsg(cudaGetLastError(), __func__);
if (!upper && !runtime_blas_lsame(uplo, "L"))
info = 1;
// trans must be "N" or "T"; "C" is accepted only when is_complex is false.
else if (!runtime_blas_lsame(trans, "N") &&
!runtime_blas_lsame(trans, "T") &&
(is_complex || !runtime_blas_lsame(trans, "C")))
info = 2;
else if (*n < 0)
info = 3;
else if (*k < 0)
info = 4;
else if (*lda < std::max(1, nrowa))
info = 7;
else if (*ldb < std::max(1, nrowa))
info = 9;
else if (*ldc < std::max(1, *n))
info = 12;

// NOTE(review): c_info, gpu_c and size_c are not declared in this function and
// the brace opened on the next line is never closed, so these two lines look
// like pre-change residue from the diff view -- confirm against the repository.
if (!c_info) {
b2c_copy_from_gpu(c, gpu_c, size_c);
if (info != 0) {
runtime_blas_xerbla(func_name, info);
return false;
}

// NOTE(review): gpu_a/gpu_b/gpu_c and a_info/b_info/c_info are likewise not
// declared here; same suspicion as above.
b2c_cleanup_gpu_ptr((void *) gpu_a, a_info);
b2c_cleanup_gpu_ptr((void *) gpu_b, b_info);
b2c_cleanup_gpu_ptr((void *) gpu_c, c_info);
// Nothing to do: n is zero, or the branch below would only multiply C by a
// beta equal to 1.
if (*n == 0 || ((*alpha == 0 || *k == 0) && *beta == 1))
return false;

// Only C changes: set the selected triangle to zero when beta is 0, otherwise
// multiply it by beta. (*n == 0 has already returned above, so that term is
// always false here.) IDX2F is defined elsewhere; presumably a 1-based
// column-major index -- confirm.
if (*alpha == 0 || *n == 0 || *k == 0) {
if (upper) {
// Upper triangle: rows 1..j of column j.
if (*beta == 0) {
for (int j=1; j<=*n; ++j)
for (int i=1; i<=j; ++i)
c[IDX2F(i,j,*ldc)] = 0;
} else {
for (int j=1; j<=*n; ++j)
for (int i=1; i<=j; ++i)
c[IDX2F(i,j,*ldc)] *= *beta;
}
} else {
// Lower triangle: rows j..n of column j.
if (*beta == 0) {
for (int j=1; j<=*n; ++j)
for (int i=j; i<=*n; ++i)
c[IDX2F(i,j,*ldc)] = 0;
} else {
for (int j=1; j<=*n; ++j)
for (int i=j; i<=*n; ++i)
c[IDX2F(i,j,*ldc)] *= *beta;
}
}
return false;
}

return true;
}

// float syr2k entry point opened by the F77_syr2k macro. Returns immediately
// when syr2k_check() yields false; otherwise forwards the dereferenced
// arguments to _b2c_syr2k with the backend routine selected by USE_CUDA.
F77_syr2k(s, float) {
if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
return;
_b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
*n, *k,
*alpha,
a, *lda,
b, *ldb,
*beta,
c, *ldc,
// NOTE(review): the next line already ends the call with `);`, yet an
// #if-selected function pointer and another `);` follow. It looks like the
// pre-change line kept by the diff view -- confirm against the repository.
&cublasSsyr2k);
#if USE_CUDA
&cublasSsyr2k
#else
&clblasSsyr2k
#endif
);
}

// double syr2k entry point opened by the F77_syr2k macro. Returns immediately
// when syr2k_check() yields false; otherwise forwards the dereferenced
// arguments to _b2c_syr2k with the backend routine selected by USE_CUDA.
F77_syr2k(d, double) {
if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
return;
_b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
*n, *k,
*alpha,
a, *lda,
b, *ldb,
*beta,
c, *ldc,
// NOTE(review): the next line already ends the call with `);`, yet an
// #if-selected function pointer and another `);` follow. It looks like the
// pre-change line kept by the diff view -- confirm against the repository.
&cublasDsyr2k);
#if USE_CUDA
&cublasDsyr2k
#else
&clblasDsyr2k
#endif
);
}

// float _Complex syr2k entry point opened by the F77_syr2k macro. Returns
// immediately when syr2k_check() yields false; otherwise forwards to
// _b2c_syr2k, converting alpha and beta with cu() and choosing the backend
// routine by USE_CUDA.
F77_syr2k(c, float _Complex) {
if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
return;
_b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
*n, *k,
cu(*alpha),
// NOTE(review): a and b are each passed twice below, once through a
// (cuComplex *) cast and once through cmplx_ptr(). The cast lines look like
// pre-change lines kept by the diff view -- confirm against the repository.
(cuComplex *) a, *lda,
(cuComplex *) b, *ldb,
cmplx_ptr(a), *lda,
cmplx_ptr(b), *ldb,
cu(*beta),
// NOTE(review): same pattern for c; in addition `&cublasCsyr2k);` already ends
// the call before the #if-selected function pointer that follows.
(cuComplex *) c, *ldc,
&cublasCsyr2k);
cmplx_ptr(c), *ldc,
#if USE_CUDA
&cublasCsyr2k
#else
&clblasCsyr2k
#endif
);
}

// double _Complex syr2k entry point opened by the F77_syr2k macro. Returns
// immediately when syr2k_check() yields false; otherwise forwards to
// _b2c_syr2k, converting alpha and beta with cu() and choosing the backend
// routine by USE_CUDA.
F77_syr2k(z, double _Complex) {
if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
return;
_b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
*n, *k,
cu(*alpha),
// NOTE(review): a and b are each passed twice below, once through a
// (cuDoubleComplex *) cast and once through cmplx_ptr(). The cast lines look
// like pre-change lines kept by the diff view -- confirm against the repository.
(cuDoubleComplex *) a, *lda,
(cuDoubleComplex *) b, *ldb,
cmplx_ptr(a), *lda,
cmplx_ptr(b), *ldb,
cu(*beta),
// NOTE(review): same pattern for c; in addition `&cublasZsyr2k);` already ends
// the call before the #if-selected function pointer that follows.
(cuDoubleComplex *) c, *ldc,
&cublasZsyr2k);
cmplx_ptr(c), *ldc,
#if USE_CUDA
&cublasZsyr2k
#else
&clblasZsyr2k
#endif
);
}
Loading

0 comments on commit ebbf547

Please sign in to comment.