diff --git a/blas.h b/blas.h
index 66dca62..40bde3e 100644
--- a/blas.h
+++ b/blas.h
@@ -265,7 +265,6 @@ F77_symm(d, double);
 F77_symm(c, float _Complex);
 F77_symm(z, double _Complex);
 
-
 #define F77_syrk(prefix, T)                         \
 void prefix##syrk_(char *uplo, char *trans,         \
         int *n, int *k,                             \
@@ -283,6 +282,10 @@ void prefix##syr2k_(char *uplo, char *trans,        \
         T *beta,                                    \
         T *c, int *ldc)
 
+F77_syr2k(s, float);            // prototype for ssyr2k_
+F77_syr2k(d, double);           // prototype for dsyr2k_
+F77_syr2k(c, float _Complex);   // prototype for csyr2k_
+F77_syr2k(z, double _Complex);  // prototype for zsyr2k_
 
 #define F77_trmm(prefix, T)                         \
 void prefix##trmm_(char *side,                      \
@@ -292,6 +295,11 @@ void prefix##trmm_(char *side,                      \
         T *a, int *lda,                             \
         T *b, int *ldb)
 
+F77_trmm(s, float);            // prototype for strmm_
+F77_trmm(d, double);           // prototype for dtrmm_
+F77_trmm(c, float _Complex);   // prototype for ctrmm_
+F77_trmm(z, double _Complex);  // prototype for ztrmm_
+
 #define F77_trsm(prefix, T)                         \
 void prefix##trsm_(char *side, char *uplo,          \
         char *transa, char *diag,                   \
@@ -300,6 +308,11 @@ void prefix##trsm_(char *side, char *uplo,          \
         T *a, int *lda,                             \
         T *b, int *ldb)
 
+F77_trsm(s, float);            // prototype for strsm_
+F77_trsm(d, double);           // prototype for dtrsm_
+F77_trsm(c, float _Complex);   // prototype for ctrsm_
+F77_trsm(z, double _Complex);  // prototype for ztrsm_
+
 void _b2c_xerbla(const char *routine, int arg_pos);
 
 #ifdef __cplusplus
diff --git a/blas_level3/hemm.cc b/blas_level3/hemm.cc
index 44b0b5e..4da76f1 100644
--- a/blas_level3/hemm.cc
+++ b/blas_level3/hemm.cc
@@ -54,7 +54,6 @@ void _b2c_hemm(const CBLAS_SIDE side,
     gpuptr<const T> gpu_b(b, size(0, ldb, n, sizeof(*b)));
     gpuptr<T> gpu_c(c, size(0, ldc, n, sizeof(*c)));
 
-
     call_kernel(
 #if USE_CUDA
         hemm_func(b2c_cublas_handle,
@@ -122,48 +121,32 @@ F77_hemm(c, float _Complex) {
     hemm_check();
     _b2c_hemm(c_side(*side), c_uplo(*uplo),
             *m, *n, 
-#if USE_CUDA
             cu(*alpha),
-#else
-            cu2(*alpha),
-#endif
             cmplx_ptr(a), *lda,
             cmplx_ptr(b), *ldb,
-#if USE_CUDA
             cu(*beta),
-#else
-            cu2(*beta),
-#endif
             cmplx_ptr(c), *ldc,
 #if USE_CUDA
             &cublasChemm
 #else
             &clblasChemm
 #endif
-            );
+    );
 }
 
 F77_hemm(z, double _Complex) {
     hemm_check();
     _b2c_hemm(c_side(*side), c_uplo(*uplo),
             *m, *n, 
-#if USE_CUDA
             cu(*alpha),
-#else
-            cu2(*alpha),
-#endif
             cmplx_ptr(a), *lda,
             cmplx_ptr(b), *ldb,
-#if USE_CUDA
             cu(*beta),
-#else
-            cu2(*beta),
-#endif
             cmplx_ptr(c), *ldc,
 #if USE_CUDA
             &cublasZhemm
 #else
             &clblasZhemm
 #endif
-            );
+    );
 }
diff --git a/blas_level3/syr2k.cc b/blas_level3/syr2k.cc
index a4dae96..6713c53 100644
--- a/blas_level3/syr2k.cc
+++ b/blas_level3/syr2k.cc
@@ -3,8 +3,8 @@
 #include "../blas.h"
 #include "../conversions.h"
 #include "level3.h"
-#include "../blas2cuda.h"
 #include "../runtime-blas.h"
+#include "../runtime-mem.hpp"
 
 #if USE_CUDA
 extern cublasHandle_t b2c_cublas_handle;
@@ -12,17 +12,9 @@ extern cublasHandle_t b2c_cublas_handle;
 extern cl_command_queue opencl_cmd_queue;
 #endif
 
-
-template <typename T>
-void _b2c_syr2k(const CBLAS_UPLO uplo,
-        const CBLAS_TRANSPOSE trans,
-        const int n, const int k,
-        const T alpha,
-        const T *a, const int lda,
-        const T *b, const int ldb,
-        const T beta,
-        T *c, const int ldc,
-        cublasStatus_t syr2k_func(cublasHandle_t,
+template <typename T, typename S>  // T: matrix element type; S: type of the by-value alpha/beta in the non-CUDA signature
+#if USE_CUDA
+using syr2k_t = cublasStatus_t (*)(cublasHandle_t,  // CUDA build: function-pointer type taking a cublasHandle_t first
             cublasFillMode_t,
             cublasOperation_t,
             int, int,
@@ -30,70 +22,130 @@ void _b2c_syr2k(const CBLAS_UPLO uplo,
             const T *, int,
             const T *, int,
             const T *,
-            T *, int))
-{
-    const T *gpu_a, *gpu_b;
-    T *gpu_c;
-    int rows_a, cols_a,
-        rows_b, cols_b,
-        rows_c, cols_c;
-    int size_a, size_b, size_c;
-    cublasFillMode_t cuplo = cu(uplo);
-    cublasOperation_t ctrans = cu(trans);
-    const struct objinfo *a_info, *b_info, *c_info;
-
-    rows_c = ldc;
-    cols_c = n;
-    if (trans == CblasNoTrans) {
-        rows_a = lda;
-        cols_a = k;
-        rows_b = ldb;
-        cols_b = k;
-    } else {
-        rows_a = lda;
-        cols_a = n;
-        rows_b = ldb;
-        cols_b = n;
-    }
-
+            T *, int);
+#else
+using syr2k_t = clblasStatus (*)(clblasOrder order,  // non-CUDA build: matrices are cl_mem handles with explicit offsets
+        clblasUplo uplo, clblasTranspose transAB, 
+        size_t N, size_t K, 
+        S alpha, 
+        const cl_mem A, size_t offA, size_t lda, 
+        const cl_mem B, size_t offB, size_t ldb, 
+        S beta, 
+        cl_mem C, size_t offC, size_t ldc, 
+        cl_uint numCommandQueues, cl_command_queue *commandQueues, 
+        cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events);
+#endif
 
-    size_a = size(0, rows_a, cols_a, sizeof(*a));
-    size_b = size(0, rows_b, cols_b, sizeof(*b));
-    size_c = size(0, rows_c, cols_c, sizeof(*c));
 
-    gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL);
-    gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, 
-            (void *) gpu_a, &a_info,
-            NULL);
-    gpu_c = (T *) b2c_place_on_gpu((void *) c, size_c, &c_info, 
-            (void *) gpu_a, &a_info,
-            (void *) gpu_b, &b_info,
-            NULL);
+template <typename T, typename S>  // Wraps a, b and c in gpuptr objects and invokes syr2k_func through call_kernel
+void _b2c_syr2k(const CBLAS_UPLO uplo,
+        const CBLAS_TRANSPOSE trans,
+        const int n, const int k,
+        const S alpha,
+        const T *a, const int lda,
+        const T *b, const int ldb,
+        const S beta,
+        T *c, const int ldc,
+        syr2k_t<T,S> syr2k_func)
+{
+    gpuptr<const T> gpu_a(a, size(0, lda, trans == CblasNoTrans ? k : n, sizeof *a));  // column count: k for NoTrans, n otherwise
+    gpuptr<const T> gpu_b(b, size(0, ldb, trans == CblasNoTrans ? k : n, sizeof *b));  // b is sized the same way as a
+    gpuptr<T> gpu_c(c, size(0, ldc, n, sizeof *c));  // c is sized from ldc and n
 
     call_kernel(
-        syr2k_func(b2c_handle,
-                cuplo, ctrans,
+#if USE_CUDA
+        syr2k_func(b2c_cublas_handle,  // CUDA path: alpha and beta are passed by address
+                cu(uplo), cu(trans),
                 n, k,
                 &alpha,
                 gpu_a, lda,
                 gpu_b, ldb,
                 &beta,
                 gpu_c, ldc)
+#else
+        syr2k_func(clblasColumnMajor,  // non-CUDA path: alpha and beta by value, every buffer at offset 0
+            clb(uplo), clb(trans),
+            n, k,
+            alpha,
+            gpu_a, 0, lda,
+            gpu_b, 0, ldb,
+            beta,
+            gpu_c, 0, ldc,
+            1, &opencl_cmd_queue, 0, NULL, NULL)
+#endif
     );
+}
+
+template <typename T, bool is_complex = std::is_same<T, float _Complex>::value || std::is_same<T, double _Complex>::value>
+bool syr2k_check(const char *func_name,  // returns true only when the caller should still run the GPU routine
+                 char *uplo, 
+                 char *trans,
+                 int *n, int *k,
+                 T *alpha,
+                 T *a, int *lda,
+                 T *b, int *ldb,
+                 T *beta,
+                 T *c, int *ldc) {
+    int nrowa = runtime_blas_lsame(trans, "N") ? *n : *k;  // row count used for the lda/ldb lower bounds below
+    int upper = runtime_blas_lsame(uplo, "U");
+    int info = 0;  // 1-based position of the first bad argument, 0 if none
 
-    
-    runtime_fatal_errmsg(cudaGetLastError(), __func__);
+    if (!upper && !runtime_blas_lsame(uplo, "L"))
+        info = 1;
+    else if (!runtime_blas_lsame(trans, "N") && 
+             !runtime_blas_lsame(trans, "T") && 
+             (is_complex || !runtime_blas_lsame(trans, "C")))  // "C" is accepted only for the non-complex instantiations
+        info = 2;
+    else if (*n < 0)
+        info = 3;
+    else if (*k < 0)
+        info = 4;
+    else if (*lda < std::max(1, nrowa))
+        info = 7;
+    else if (*ldb < std::max(1, nrowa))
+        info = 9;
+    else if (*ldc < std::max(1, *n))
+        info = 12;
 
-    if (!c_info) {
-        b2c_copy_from_gpu(c, gpu_c, size_c);
+    if (info != 0) {
+        runtime_blas_xerbla(func_name, info);  // report the offending argument position, then refuse the call
+        return false;
     }
 
-    b2c_cleanup_gpu_ptr((void *) gpu_a, a_info);
-    b2c_cleanup_gpu_ptr((void *) gpu_b, b_info);
-    b2c_cleanup_gpu_ptr((void *) gpu_c, c_info);
+    if (*n == 0 || ((*alpha == 0 || *k == 0) && *beta == 1))  // nothing would change in c
+        return false;
+
+    if (*alpha == 0 || *n == 0 || *k == 0) {  // only the beta term remains: update c here and skip the GPU call
+        if (upper) {
+            if (*beta == 0) {
+                for (int j=1; j<=*n; ++j)
+                    for (int i=1; i<=j; ++i)  // rows 1..j of column j
+                        c[IDX2F(i,j,*ldc)] = 0;  // IDX2F is presumably a 1-based column-major index macro - TODO confirm
+            } else {
+                for (int j=1; j<=*n; ++j)
+                    for (int i=1; i<=j; ++i)
+                        c[IDX2F(i,j,*ldc)] *= *beta;
+            }
+        } else {
+            if (*beta == 0) {
+                for (int j=1; j<=*n; ++j)
+                    for (int i=j; i<=*n; ++i)  // rows j..n of column j
+                        c[IDX2F(i,j,*ldc)] = 0;
+            } else {
+                for (int j=1; j<=*n; ++j)
+                    for (int i=j; i<=*n; ++i)
+                        c[IDX2F(i,j,*ldc)] *= *beta;
+            }
+        }
+        return false;
+    }
+
+    return true;
 }
 
 F77_syr2k(s, float) {
+    if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
+        return;  // syr2k_check returned false: skip the GPU call
     _b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             *alpha,
@@ -101,10 +153,17 @@ F77_syr2k(s, float) {
             b, *ldb,
             *beta,
             c, *ldc,
-            &cublasSsyr2k);
+#if USE_CUDA
+            &cublasSsyr2k
+#else
+            &clblasSsyr2k
+#endif
+            );
 }
 
 F77_syr2k(d, double) {
+    if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
+        return;  // syr2k_check returned false: skip the GPU call
     _b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             *alpha,
@@ -112,27 +171,46 @@ F77_syr2k(d, double) {
             b, *ldb,
             *beta,
             c, *ldc,
-            &cublasDsyr2k);
+#if USE_CUDA
+            &cublasDsyr2k
+#else
+            &clblasDsyr2k
+#endif
+            );
 }
 
 F77_syr2k(c, float _Complex) {
+    if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
+        return;  // syr2k_check returned false: skip the GPU call
     _b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             cu(*alpha),
-            (cuComplex *) a, *lda,
-            (cuComplex *) b, *ldb,
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
             cu(*beta),
-            (cuComplex *) c, *ldc,
-            &cublasCsyr2k);
+            cmplx_ptr(c), *ldc,
+#if USE_CUDA
+            &cublasCsyr2k
+#else
+            &clblasCsyr2k
+#endif
+            );
 }
 
 F77_syr2k(z, double _Complex) {
+    if (!syr2k_check(__func__, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc))
+        return;  // syr2k_check returned false: skip the GPU call
     _b2c_syr2k(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             cu(*alpha),
-            (cuDoubleComplex *) a, *lda,
-            (cuDoubleComplex *) b, *ldb,
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
             cu(*beta),
-            (cuDoubleComplex *) c, *ldc,
-            &cublasZsyr2k);
+            cmplx_ptr(c), *ldc,
+#if USE_CUDA
+            &cublasZsyr2k
+#else
+            &clblasZsyr2k
+#endif
+            );
 }
diff --git a/blas_level3/syrk.cc b/blas_level3/syrk.cc
index a17f9a4..0a001dc 100644
--- a/blas_level3/syrk.cc
+++ b/blas_level3/syrk.cc
@@ -3,8 +3,8 @@
 #include "../blas.h"
 #include "../conversions.h"
 #include "level3.h"
-#include "../blas2cuda.h"
 #include "../runtime-blas.h"
+#include "../runtime-mem.hpp"
 
 #if USE_CUDA
 extern cublasHandle_t b2c_cublas_handle;
@@ -12,52 +12,49 @@ extern cublasHandle_t b2c_cublas_handle;
 extern cl_command_queue opencl_cmd_queue;
 #endif
 
+#if USE_CUDA  // NOTE(review): this #if/#else/#endif block repeats the extern declarations immediately above it
+extern cublasHandle_t b2c_cublas_handle;
+#else
+extern cl_command_queue opencl_cmd_queue;
+#endif
 
 template <typename T>
-void _b2c_syrk(const CBLAS_UPLO uplo,
-        const CBLAS_TRANSPOSE trans,
-        const int n, const int k,
-        const T alpha,
-        const T *a, const int lda,
-        const T beta,
-        T *c, const int ldc,
-        cublasStatus_t syrk_func(cublasHandle_t,
+#if USE_CUDA
+using syrk_t = cublasStatus_t (*)(cublasHandle_t,  // must be named syrk_t: _b2c_syrk takes a syrk_t<S> in both builds
             cublasFillMode_t, cublasOperation_t,
             int, int,
             const T *,
             const T *, int,
             const T *,
-            T *, int))
-{
-    const T *gpu_a;
-    T *gpu_c;
-    int rows_a, cols_a,
-        rows_c, cols_c;
-    int size_a, size_c;
-    cublasFillMode_t cuplo = cu(uplo);
-    cublasOperation_t ctrans = cu(trans);
-    const struct objinfo *a_info, *c_info;
-
-    rows_c = ldc;
-    cols_c = n;
-    if (trans == CblasNoTrans) {
-        rows_a = lda;
-        cols_a = k;
-    } else {
-        rows_a = lda;
-        cols_a = n;
-    }
-
-
-    size_a = size(0, rows_a, cols_a, sizeof(*a));
-    size_c = size(0, rows_c, cols_c, sizeof(*c));
+            T *, int);
+#else
+using syrk_t = clblasStatus (*)(clblasOrder order,  // non-CUDA build: T is the by-value alpha/beta type, matrices are cl_mem
+            clblasUplo uplo,
+            clblasTranspose transAB,
+            size_t N, size_t K,
+            T alpha,
+            const cl_mem A, size_t offA, size_t lda,
+            T beta,
+            cl_mem C, size_t offC, size_t ldc,
+            cl_uint numCommandQueues, cl_command_queue *commandQueues, 
+            cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events);
+#endif
 
-    gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL);
-    gpu_c = (T *) b2c_place_on_gpu((void *) c, size_c, &c_info, 
-            (void *) gpu_a, &a_info,
-            NULL);
+template <typename S, typename T>  // Wraps a and c in gpuptr objects and invokes syrk_func through call_kernel
+void _b2c_syrk(const CBLAS_UPLO uplo,
+        const CBLAS_TRANSPOSE trans,
+        const int n, const int k,
+        const S alpha,
+        const T *a, const int lda,
+        const S beta,
+        T *c, const int ldc,
+        syrk_t<S> syrk_func)
 {
+    gpuptr<const T> gpu_a(a, size(0, lda, trans == CblasNoTrans ? k : n, sizeof *a));  // a has k columns for NoTrans, n otherwise
+    gpuptr<T> gpu_c(c, size(0, ldc, n, sizeof *c));  // c is sized from ldc and n
 
     call_kernel(
+#if USE_CUDA
-        syrk_func(b2c_handle,
-                cuplo, ctrans,
+        syrk_func(b2c_cublas_handle,  // the handle declared extern in this file
+                cu(uplo), cu(trans),  // convert inline: the cuplo/ctrans locals no longer exist
                 n, k,
@@ -65,56 +62,153 @@ void _b2c_syrk(const CBLAS_UPLO uplo,
                 gpu_a, lda,
                 &beta,
                 gpu_c, ldc)
+#else
+        syrk_func(clblasColumnMajor,  // non-CUDA path: alpha and beta by value, every buffer at offset 0
+            clb(uplo), clb(trans),
+            n, k,
+            alpha,
+            gpu_a, 0, lda,
+            beta,
+            gpu_c, 0, ldc,
+            1, &opencl_cmd_queue, 0, NULL, NULL)
+#endif
     );
+}
 
+template <typename T>  // Validates the syrk arguments; returns true only when the caller should still run the GPU routine
+bool syrk_check(const char *func_name,
+                 char *uplo,
+                 char *trans,
+                 int *n, int *k,
+                 T *alpha,
+                 T *a, int *lda,
+                 T *beta,
+                 T *c, int *ldc) {
+    int nrowa = -1;  // row count used for the lda lower bound
+    int info;        // 1-based position of the first bad argument, 0 if none
+    int upper;
+
+    if (runtime_blas_lsame(trans, "N"))
+        nrowa = *n;
+    else
+        nrowa = *k;
+    upper = runtime_blas_lsame(uplo, "U");
+    info = 0;
+
+    if (!upper && !runtime_blas_lsame(uplo, "L"))
+        info = 1;
+    else if (!runtime_blas_lsame(trans, "N") &&
+             !runtime_blas_lsame(trans, "T") &&
+             !runtime_blas_lsame(trans, "C"))  // NOTE(review): unlike syr2k_check, "C" is also accepted for complex T - confirm
+        info = 2;
+    else if (*n < 0)
+        info = 3;
+    else if (*k < 0)
+        info = 4;
+    else if (*lda < std::max(1, nrowa))
+        info = 7;
+    else if (*ldc < std::max(1, *n))
+        info = 10;
     
-    runtime_fatal_errmsg(cudaGetLastError(), __func__);
-
-    if (!c_info) {
-        b2c_copy_from_gpu(c, gpu_c, size_c);
+    if (info != 0) {
+        runtime_blas_xerbla(func_name, info);  // report the offending argument position, then refuse the call
+        return false;
     }
 
-    b2c_cleanup_gpu_ptr((void *) gpu_a, a_info);
-    b2c_cleanup_gpu_ptr((void *) gpu_c, c_info);
-
+    // quick return if possible
+    if (*n == 0 || ((*alpha == 0 || *k == 0) && *beta == 1))
+        return false;
+    if (*alpha == 0) {  // only the beta term remains: update the referenced triangle of c here
+        if (upper) {
+            if (*beta == 0) {
+                for (int j=1; j<=*n; j++)
+                    for (int i=1; i<=j; i++)
+                        c[IDX2F(i, j, *ldc)] = 0;
+            } else {
+                for (int j=1; j<=*n; j++)
+                    for (int i=1; i<=j; i++)  // rows 1..j of column j, the same triangle as the zeroing loop above
+                        c[IDX2F(i, j, *ldc)] *= *beta;
+            }
+        } else {
+            if (*beta == 0) {
+                for (int j=1; j<=*n; j++)
+                    for (int i=j; i<=*n; i++)
+                        c[IDX2F(i, j, *ldc)] = 0;
+            } else {
+                for (int j=1; j<=*n; j++)
+                    for (int i=j; i<=*n; i++)
+                        c[IDX2F(i, j, *ldc)] *= *beta;
+            }
+        }
+        return false;  // c is final; launching the kernel as well would apply beta a second time
+    }
+    return true;
 }
 
 F77_syrk(s, float) {
+    if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc))
+        return;  // syrk_check returned false: skip the GPU call
     _b2c_syrk(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             *alpha,
             a, *lda,
             *beta,
             c, *ldc,
-            &cublasSsyrk);
+#if USE_CUDA
+            &cublasSsyrk
+#else
+            &clblasSsyrk
+#endif
+    );
 }
 
 F77_syrk(d, double) {
+    if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc))
+        return;  // syrk_check returned false: skip the GPU call
     _b2c_syrk(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
             *alpha,
             a, *lda,
             *beta,
             c, *ldc,
-            &cublasDsyrk);
+#if USE_CUDA
+            &cublasDsyrk
+#else
+            &clblasDsyrk
+#endif
+    );
 }
 
 F77_syrk(c, float _Complex) {
+    if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc))
+        return;  // syrk_check returned false: skip the GPU call
     _b2c_syrk(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
-            cu(*alpha),
-            (cuComplex *) a, *lda,
-            cu(*beta),
-            (cuComplex *) c, *ldc,
-            &cublasCsyrk);
+            cu2(*alpha),
+            cmplx_ptr(a), *lda,
+            cu2(*beta),
+            cmplx_ptr(c), *ldc,
+#if USE_CUDA
+            &cublasCsyrk
+#else
+            &clblasCsyrk
+#endif
+    );
 }
 
 F77_syrk(z, double _Complex) {
+    if (!syrk_check(__func__, uplo, trans, n, k, alpha, a, lda, beta, c, ldc))
+        return;  // syrk_check returned false: skip the GPU call
     _b2c_syrk(c_uplo(*uplo), c_trans(*trans),
             *n, *k,
-            cu(*alpha),
-            (cuDoubleComplex *) a, *lda,
-            cu(*beta),
-            (cuDoubleComplex *) c, *ldc,
-            &cublasZsyrk);
+            cu2(*alpha),
+            cmplx_ptr(a), *lda,
+            cu2(*beta),
+            cmplx_ptr(c), *ldc,
+#if USE_CUDA
+            &cublasZsyrk
+#else
+            &clblasZsyrk
+#endif
+    );
 }
diff --git a/blas_level3/trmm.cc b/blas_level3/trmm.cc
index 74c3dcd..4c5eb75 100644
--- a/blas_level3/trmm.cc
+++ b/blas_level3/trmm.cc
@@ -3,26 +3,14 @@
 #include "../blas.h"
 #include "../conversions.h"
 #include "level3.h"
-#include "../blas2cuda.h"
 #include "../runtime-blas.h"
+#include "../runtime-mem.hpp"
 
 #if USE_CUDA
 extern cublasHandle_t b2c_cublas_handle;
-#else
-extern cl_command_queue opencl_cmd_queue;
-#endif
-
 
 template <typename T>
-void _b2c_trmm(const CBLAS_SIDE side,
-        const CBLAS_UPLO uplo,
-        const CBLAS_TRANSPOSE transa,
-        const CBLAS_DIAG diag,
-        const int m, const int n,
-        const T alpha,
-        const T *a, const int lda,
-        T *b, const int ldb,
-        cublasStatus_t trmm_func(cublasHandle_t,
+using trmm_t = cublasStatus_t (*)(cublasHandle_t,  // CUDA build: function-pointer type taking a cublasHandle_t first
             cublasSideMode_t,
             cublasFillMode_t,
             cublasOperation_t,
@@ -31,90 +19,195 @@ void _b2c_trmm(const CBLAS_SIDE side,
             const T *,
             const T *, int,
             const T *, int,
-            T *, int))
-{
-    const T *gpu_a;
-    T *gpu_b;
-    int rows_a, cols_a,
-        rows_b, cols_b;
-    int size_a, size_b;
-    cublasSideMode_t cside = cu(side);
-    cublasFillMode_t cuplo = cu(uplo);
-    cublasOperation_t ctransa = cu(transa);
-    cublasDiagType_t cdiag = cu(diag);
-    const struct objinfo *a_info, *b_info;
+            T *, int);
+#else
+extern cl_command_queue opencl_cmd_queue;
 
-    cols_a = lda;
-    rows_a = (side == CblasLeft) ? m : n;
-    size_a = size(0, rows_a, cols_a, sizeof(*a));
+template <typename T>  // non-CUDA build: T is the by-value alpha type, matrices are cl_mem handles with offsets
+using trmm_t = clblasStatus (*)(clblasOrder order, 
+                                clblasSide side,
+                                clblasUplo uplo, 
+                                clblasTranspose transA, 
+                                clblasDiag diag, 
+                                size_t M, size_t N,
+                                T alpha, 
+                                const cl_mem A, size_t offA, size_t lda, 
+                                cl_mem B, size_t offB, size_t ldb, 
+                                cl_uint numCommandQueues, cl_command_queue *commandQueues, 
+                                cl_uint numEventsInWaitList, const cl_event *eventWaitList, 
+                                cl_event *events);
+#endif
 
-    cols_b = ldb;
-    rows_b = (side == CblasLeft) ? m : n;
-    size_b = size(0, rows_b, cols_b, sizeof(*b));
 
-    gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL);
-    gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, 
-            (void *) gpu_a, &a_info,
-            NULL);
+template <typename S, typename T>  // Wraps a and b in gpuptr objects and invokes trmm_func through call_kernel
+void _b2c_trmm(const CBLAS_SIDE side,
+        const CBLAS_UPLO uplo,
+        const CBLAS_TRANSPOSE transa,
+        const CBLAS_DIAG diag,
+        const int m, const int n,
+        const S alpha,
+        const T *a, const int lda,
+        T *b, const int ldb,
+        trmm_t<S> trmm_func)
 {
+    gpuptr<const T> gpu_a(a, size(0, lda, side == CblasLeft ? m : n, sizeof *a));  // a is m x m on the left side, n x n on the right
+    gpuptr<T> gpu_b(b, size(0, ldb, n, sizeof *b));  // b is m x n, so it spans n columns of ldb rows
 
     call_kernel(
+#if USE_CUDA
-        trmm_func(b2c_handle,
-                cside, cuplo,
-                ctransa, cdiag,
+        trmm_func(b2c_cublas_handle,  // the handle declared extern in this file
+                cu(side), cu(uplo),
+                cu(transa), cu(diag),
                 m, n,
                 &alpha,
                 gpu_a, lda,
                 gpu_b, ldb,
                 gpu_b, ldb)
+#else
+        trmm_func(clblasColumnMajor,  // non-CUDA path: alpha by value, every buffer at offset 0
+                  clb(side), clb(uplo),
+                  clb(transa), clb(diag),
+                  m, n,
+                  alpha,
+                  gpu_a, 0, lda,
+                  gpu_b, 0, ldb,
+                  1, &opencl_cmd_queue,
+                  0, NULL,
+                  NULL)
+#endif
     );
+}
 
+template <typename T>  // Validates the trmm arguments; returns true only when the caller should still run the GPU routine
+bool trmm_check(const char *func_name,
+                char *side, char *uplo, char *transa, char *diag,
+                int *m, int *n,
+                T *alpha,
+                T *a, int *lda,
+                T *b, int *ldb) {
+    int lside = runtime_blas_lsame(side, "L");
+    int nrowa;  // row count used for the lda lower bound: m on the left side, n otherwise
+    int upper = runtime_blas_lsame(uplo, "U");
+    int info;   // 1-based position of the first bad argument, 0 if none
+
+    if (lside)
+        nrowa = *m;
+    else
+        nrowa = *n;
     
-    runtime_fatal_errmsg(cudaGetLastError(), __func__);
+    info = 0;
 
-    if (!b_info) {
-        b2c_copy_from_gpu(b, gpu_b, size_b);
+    if (!lside && !runtime_blas_lsame(side, "R"))
+        info = 1;
+    else if (!upper && !runtime_blas_lsame(uplo, "L"))
+        info = 2;
+    else if (!runtime_blas_lsame(transa, "N") &&
+             !runtime_blas_lsame(transa, "T") &&
+             !runtime_blas_lsame(transa, "C"))
+        info = 3;
+    else if (!runtime_blas_lsame(diag, "U") && !runtime_blas_lsame(diag, "N"))
+        info = 4;
+    else if (*m < 0)
+        info = 5;
+    else if (*n < 0)
+        info = 6;
+    else if (*lda < std::max(1, nrowa))
+        info = 9;
+    else if (*ldb < std::max(1, *m))
+        info = 11;
+    
+    if (info != 0) {
+        runtime_blas_xerbla(func_name, info);  // report the offending argument position, then refuse the call
+        return false;
     }
 
-    b2c_cleanup_gpu_ptr((void *) gpu_a, a_info);
-    b2c_cleanup_gpu_ptr((void *) gpu_b, b_info);
+    // quick return if possible
+    if (*m == 0 || *n == 0)
+        return false;
+    
+    // and when alpha == 0
+    if (*alpha == 0) {
+        for (int j=1; j<=*n; j++)
+            for (int i=1; i<=*m; i++)
+                b[IDX2F(i, j, *ldb)] = 0;  // IDX2F is presumably a 1-based column-major index macro - TODO confirm
+        return false;  // b has been zeroed here, so the GPU call is skipped
+    }
+
+    return true;
 }
 
 F77_trmm(s, float) {
+    if (!trmm_check(__func__, 
+                    side, uplo, transa, diag, 
+                    m, n, alpha, a, lda, b, ldb))
+        return;  // trmm_check returned false: skip the GPU call
     _b2c_trmm(c_side(*side), c_uplo(*uplo), 
             c_trans(*transa), c_diag(*diag),
             *m, *n,
             *alpha,
             a, *lda,
             b, *ldb,
-            &cublasStrmm);
+#if USE_CUDA
+            &cublasStrmm
+#else
+            &clblasStrmm
+#endif
+    );
 }
 
 F77_trmm(d, double) {
+    if (!trmm_check(__func__, 
+                    side, uplo, transa, diag, 
+                    m, n, alpha, a, lda, b, ldb))
+        return;  // trmm_check returned false: skip the GPU call
     _b2c_trmm(c_side(*side), c_uplo(*uplo), 
             c_trans(*transa), c_diag(*diag),
             *m, *n,
             *alpha,
             a, *lda,
             b, *ldb,
-            &cublasDtrmm);
+#if USE_CUDA
+            &cublasDtrmm
+#else
+            &clblasDtrmm
+#endif
+    );
 }
 
 F77_trmm(c, float _Complex) {
+    if (!trmm_check(__func__, 
+                    side, uplo, transa, diag, 
+                    m, n, alpha, a, lda, b, ldb))
+        return;  // trmm_check returned false: skip the GPU call
     _b2c_trmm(c_side(*side), c_uplo(*uplo), 
             c_trans(*transa), c_diag(*diag),
             *m, *n,
-            cu(*alpha),
-            (cuComplex *) a, *lda,
-            (cuComplex *) b, *ldb,
-            &cublasCtrmm);
+            cu2(*alpha),
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
+#if USE_CUDA
+            &cublasCtrmm
+#else
+            &clblasCtrmm
+#endif
+    );
 }
 
 F77_trmm(z, double _Complex) {
+    if (!trmm_check(__func__, 
+                    side, uplo, transa, diag, 
+                    m, n, alpha, a, lda, b, ldb))
+        return;  // trmm_check returned false: skip the GPU call
     _b2c_trmm(c_side(*side), c_uplo(*uplo), 
             c_trans(*transa), c_diag(*diag),
             *m, *n,
-            cu(*alpha),
-            (cuDoubleComplex *) a, *lda,
-            (cuDoubleComplex *) b, *ldb,
-            &cublasZtrmm);
+            cu2(*alpha),
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
+#if USE_CUDA
+            &cublasZtrmm
+#else
+            &clblasZtrmm
+#endif
+    );
 }
diff --git a/blas_level3/trsm.cc b/blas_level3/trsm.cc
index be4f343..6b49457 100644
--- a/blas_level3/trsm.cc
+++ b/blas_level3/trsm.cc
@@ -5,6 +5,7 @@
 #include "level3.h"
 #include "../blas2cuda.h"
 #include "../runtime-blas.h"
+#include "../runtime-mem.hpp"
 
 #if USE_CUDA
 extern cublasHandle_t b2c_cublas_handle;
@@ -12,106 +13,201 @@ extern cublasHandle_t b2c_cublas_handle;
 extern cl_command_queue opencl_cmd_queue;
 #endif
 
+template <typename S, typename T>  // S: alpha type; T: matrix element type (T appears only in the CUDA signature)
+#if USE_CUDA
+using trsm_t = cublasStatus_t (*)(cublasHandle_t,  // CUDA build: alpha is passed as const S *, matrices as T pointers
+                                  cublasSideMode_t, cublasFillMode_t,
+                                  cublasOperation_t, cublasDiagType_t,
+                                  int, int,
+                                  const S *,
+                                  const T *, int,
+                                  T *, int);
+#else
+using trsm_t = clblasStatus (*)(clblasOrder order, clblasSide side,  // non-CUDA build: alpha by value, matrices as cl_mem
+                                clblasUplo uplo, clblasTranspose transA,
+                                clblasDiag diag, size_t M, size_t N,
+                                S alpha, 
+                                const cl_mem A, size_t offA, size_t lda,
+                                cl_mem B, size_t offB, size_t ldb,
+                                cl_uint numCommandQueues,
+                                cl_command_queue *commandQueues,
+                                cl_uint numEventsInWaitList,
+                                const cl_event *eventWaitList,
+                                cl_event *events);
+#endif
 
-template <typename T>
+
+template <typename S, typename T>  // Wraps a and b in gpuptr objects and invokes trsm_func through call_kernel
 void _b2c_trsm(const CBLAS_SIDE side,
         const CBLAS_UPLO uplo,
         const CBLAS_TRANSPOSE transa,
         const CBLAS_DIAG diag,
         const int m, const int n,
-        const T alpha,
+        const S alpha,
         const T *a, const int lda,
         T *b, const int ldb,
-        cublasStatus_t trsm_func(cublasHandle_t,
-            cublasSideMode_t, cublasFillMode_t,
-            cublasOperation_t, cublasDiagType_t,
-            int, int,
-            const T *,
-            const T *, int,
-            T *, int))
+        trsm_t<S,T> trsm_func)
 {
-    const T *gpu_a;
-    T *gpu_b;
-    int rows_a, cols_a,
-        rows_b, cols_b;
-    int size_a, size_b;
-    cublasSideMode_t cside = cu(side);
-    cublasFillMode_t cuplo = cu(uplo);
-    cublasOperation_t ctransa = cu(transa);
-    cublasDiagType_t cdiag = cu(diag);
-    const struct objinfo *a_info, *b_info;
-
-    cols_a = lda;
-    rows_a = (side == CblasLeft) ? m : n;
-    size_a = size(0, rows_a, cols_a, sizeof(*a));
-
-    cols_b = ldb;
-    rows_b = (side == CblasLeft) ? m : n;
-    size_b = size(0, rows_b, cols_b, sizeof(*b));
-
-    gpu_a = (T *) b2c_place_on_gpu((void *) a, size_a, &a_info, NULL);
-    gpu_b = (T *) b2c_place_on_gpu((void *) b, size_b, &b_info, 
-            (void *) gpu_a, &a_info,
-            NULL);
-
+    gpuptr<const T> gpu_a(a, size(0, lda, side == CblasLeft ? m : n, sizeof *a));  // a has m columns on the left side, n otherwise
+    gpuptr<T> gpu_b(b, size(0, ldb, n, sizeof *b));  // b is sized from ldb and n
     call_kernel(
+#if USE_CUDA  // NOTE(review): the call below names b2c_handle, while this file declares b2c_cublas_handle - confirm b2c_handle still resolves
         trsm_func(b2c_handle,
-                cside, cuplo,
-                ctransa, cdiag,
+                cu(side), cu(uplo),
+                cu(transa), cu(diag),
                 m, n,
                 &alpha,
                 gpu_a, lda,
                 gpu_b, ldb)
+#else
+        trsm_func(clblasColumnMajor,  // non-CUDA path: alpha by value, every buffer at offset 0
+                  clb(side), clb(uplo),
+                  clb(transa), clb(diag),
+                  m, n,
+                  alpha,
+                  gpu_a, 0, lda,
+                  gpu_b, 0, ldb,
+                  1, &opencl_cmd_queue, 0, NULL, NULL)
+#endif
     );
+}
+
+
+template <typename T>  // Validates the trsm arguments; returns true only when the caller should still run the GPU routine
+bool trsm_check(const char *func_name,
+                char *side, char *uplo,
+                char *transa, char *diag,
+                int *m, int *n,
+                T *alpha,
+                T *a, int *lda,
+                T *b, int *ldb) {
+    int lside = runtime_blas_lsame(side, "L");
+    int nrowa;  // row count used for the lda lower bound: m on the left side, n otherwise
+    int upper = runtime_blas_lsame(uplo, "U");
+    int info;   // 1-based position of the first bad argument, 0 if none
+
+    if (lside)
+        nrowa = *m;
+    else
+        nrowa = *n;
+    
+    info = 0;
 
+    if (!lside && !runtime_blas_lsame(side, "R"))
+        info = 1;
+    else if (!upper && !runtime_blas_lsame(uplo, "L"))
+        info = 2;
+    else if (!runtime_blas_lsame(transa, "N") &&
+             !runtime_blas_lsame(transa, "T") &&
+             !runtime_blas_lsame(transa, "C"))
+        info = 3;
+    else if (!runtime_blas_lsame(diag, "U") && !runtime_blas_lsame(diag, "N"))
+        info = 4;
+    else if (*m < 0)
+        info = 5;
+    else if (*n < 0)
+        info = 6;
+    else if (*lda < std::max(1, nrowa))
+        info = 9;
+    else if (*ldb < std::max(1, *m))
+        info = 11;
     
-    runtime_fatal_errmsg(cudaGetLastError(), __func__);
+    if (info != 0) {
+        runtime_blas_xerbla(func_name, info);  // report the offending argument position, then refuse the call
+        return false;
+    }
 
-    if (!b_info) {
-        b2c_copy_from_gpu(b, gpu_b, size_b);
+    // quick return if possible
+    if (*m == 0 || *n == 0)
+        return false;
+    // and when alpha == 0
+    if (*alpha == 0) {
+        for (int j=1; j<=*n; j++)
+            for (int i=1; i<=*m; i++)
+                b[IDX2F(i,j, *ldb)] = 0;
+        return false;  // b is already the final (zero) result, so do not launch the solve
     }
 
-    b2c_cleanup_gpu_ptr((void *) gpu_a, a_info);
-    b2c_cleanup_gpu_ptr((void *) gpu_b, b_info);
+    return true;
 }
 
 F77_trsm(s, float) {
+    if (!trsm_check(__func__,
+                    side, uplo, transa, diag,
+                    m, n, alpha,
+                    a, lda, b, ldb))
+        return;  // trsm_check returned false: skip the GPU call
     _b2c_trsm(c_side(*side), c_uplo(*uplo),
             c_trans(*transa), c_diag(*diag),
             *m, *n, 
             *alpha,
             a, *lda,
             b, *ldb,
-            &cublasStrsm);
+#if USE_CUDA
+            &cublasStrsm
+#else
+            &clblasStrsm
+#endif
+    );
 }
 
 F77_trsm(d, double) {
+    if (!trsm_check(__func__,
+                    side, uplo, transa, diag,
+                    m, n, alpha,
+                    a, lda, b, ldb))
+        return;  // trsm_check returned false: skip the GPU call
     _b2c_trsm(c_side(*side), c_uplo(*uplo),
             c_trans(*transa), c_diag(*diag),
             *m, *n, 
             *alpha,
             a, *lda,
             b, *ldb,
-            &cublasDtrsm);
+#if USE_CUDA
+            &cublasDtrsm
+#else
+            &clblasDtrsm
+#endif
+    );
 }
 
 
 F77_trsm(c, float _Complex) {
+    if (!trsm_check(__func__,
+                    side, uplo, transa, diag,
+                    m, n, alpha,
+                    a, lda, b, ldb))
+        return;  // trsm_check returned false: skip the GPU call
     _b2c_trsm(c_side(*side), c_uplo(*uplo),
             c_trans(*transa), c_diag(*diag),
             *m, *n, 
             cu(*alpha),
-            (cuComplex *)a, *lda,
-            (cuComplex *)b, *ldb,
-            &cublasCtrsm);
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
+#if USE_CUDA
+            &cublasCtrsm
+#else
+            &clblasCtrsm
+#endif
+    );
 }
 
 F77_trsm(z, double _Complex) {
+    if (!trsm_check(__func__,
+                    side, uplo, transa, diag,
+                    m, n, alpha,
+                    a, lda, b, ldb))
+        return;  // trsm_check returned false: skip the GPU call
     _b2c_trsm(c_side(*side), c_uplo(*uplo),
             c_trans(*transa), c_diag(*diag),
             *m, *n, 
             cu(*alpha),
-            (cuDoubleComplex *)a, *lda,
-            (cuDoubleComplex *)b, *ldb,
-            &cublasZtrsm);
+            cmplx_ptr(a), *lda,
+            cmplx_ptr(b), *ldb,
+#if USE_CUDA
+            &cublasZtrsm
+#else
+            &clblasZtrsm
+#endif
+    );
 }
diff --git a/conversions.h b/conversions.h
index dc73419..b5b8b68 100644
--- a/conversions.h
+++ b/conversions.h
@@ -113,6 +113,9 @@ static inline cuDoubleComplex cu(double r, double i) {
     return (cuDoubleComplex) { .x = r, .y = i };
 }
 
+static inline cuComplex cu2(float _Complex f) { return cu(f); }
+static inline cuDoubleComplex cu2(double _Complex d) { return cu(d); }
+
 static inline cublasOperation_t cu(CBLAS_TRANSPOSE trans) {
     switch (trans) {
         case CblasNoTrans:
diff --git a/lib/obj_tracker.h b/lib/obj_tracker.h
index 24eea1d..1785432 100644
--- a/lib/obj_tracker.h
+++ b/lib/obj_tracker.h
@@ -195,6 +195,9 @@ void internal_free(void *ptr);
 #ifdef __cplusplus
 };
 
+/**
+ * RAII for the object tracker
+ */
 struct objtracker_guard {
     objtracker_guard() { obj_tracker_internal_enter(); }
     ~objtracker_guard() { obj_tracker_internal_leave(); }
diff --git a/meson.build b/meson.build
index 488d756..6b9ba1f 100644
--- a/meson.build
+++ b/meson.build
@@ -1,25 +1,28 @@
-project('gpublas', 'c', 'cpp', 'fortran',
+project('libgpublas', 'c', 'cpp', 'fortran',
     default_options: ['c_std=gnu11', 'cpp_std=gnu++17'],
     version: '0.1',
     license: 'GPL3+',
-    meson_version: '>= 0.42.0')
+    meson_version: '>= 0.43.0')
 
 cc = meson.get_compiler('c')
 root_inc = include_directories('.')
 
-c_args = [
+optflags = get_option('optflags')
+
+c_args = cc.get_supported_arguments([
   '-Wall', 
   '-Wextra',
   '-Wformat=2',
   '-Werror', 
   '-Wno-unused-parameter',
-  '-Winit-self',
-  get_option('optflags'),
+  '-Wnull-dereference',
+  '-D_GLIBCXX_ASSERTIONS',
+  optflags,
   '-g',
   '-ggdb3',
-]
+])
 
-link_args = '-Wl,-init,blas2cuda_init,-fini,blas2cuda_fini,-eentry'
+link_args = ['-Wl,-init,blas2cuda_init,-fini,blas2cuda_fini,-eentry']
 
 sources = files(
     'blas2cuda.c',
@@ -78,12 +81,16 @@ blas_level3_sources = files(
     'blas_level3/her2k.cc',
     'blas_level3/herk.cc',
     'blas_level3/symm.cc',
-    #    'blas_level3/syr2k.cc',
-    #    'blas_level3/syrk.cc',
-    #    'blas_level3/trmm.cc',
-    #    'blas_level3/trsm.cc',
+    'blas_level3/syr2k.cc',
+    'blas_level3/syrk.cc',
+    # TODO: cannot enable until https://github.com/clMathLibraries/clBLAS/issues/341 is resolved
+    # 'blas_level3/trmm.cc',
+    'blas_level3/trsm.cc',
 )
 
+runtime = ''
+prefix = get_option('prefix')
+
 cudadir = get_option('CUDA')
 cuda_lib_dirs = [cudadir + '/lib64', cudadir + '/lib64/stubs']
 
@@ -100,19 +107,21 @@ gpu_libs = []
 gpu_inc = []
 gpu_srcs = []
 
-if not get_option('opencl')
-  libcudart_dep = cc.find_library('cudart', dirs: cuda_lib_dirs, required: false)
-  libcuda_dep = cc.find_library('cuda', dirs: cuda_lib_dirs, required: false)
-  libcublas_dep = cc.find_library('cublas', dirs: cuda_lib_dirs, required: false)
+# Probe for the CUDA libraries unless OpenCL was requested; they are mandatory only for runtime=cuda.
+if get_option('runtime') != 'opencl'
+  libcudart_dep = cc.find_library('cudart', dirs: cuda_lib_dirs, required: get_option('runtime') == 'cuda')
+  libcuda_dep = cc.find_library('cuda', dirs: cuda_lib_dirs, required: get_option('runtime') == 'cuda')
+  libcublas_dep = cc.find_library('cublas', dirs: cuda_lib_dirs, required: get_option('runtime') == 'cuda')
 endif
 
-if not get_option('opencl') and (libcublas_dep.found() and libcuda_dep.found() and libcudart_dep.found())
+if get_option('runtime') != 'opencl' and (libcublas_dep.found() and libcuda_dep.found() and libcudart_dep.found())
   # use CUDA
   cuda_inc = include_directories(cudadir + '/include')
   gpu_libs = [libcublas_dep, libcuda_dep, libcudart_dep]
   gpu_inc = [cuda_inc]
   c_args += '-DUSE_CUDA'
   link_args += ',-rpath='+cudadir+'/lib64'
+  runtime = 'CUDA'
 else
   #use OpenCL
   libcl_dep = dependency('OpenCL')
@@ -128,6 +137,7 @@ else
                                 command: [find_program('./clblas_ext.py'), '@INPUT@'])
   gpu_srcs += [clext_h, clblas_ext_h]
   c_args += '-DUSE_OPENCL'
+  runtime = 'OpenCL'
 endif
 
 if not get_option('blas_opt')
@@ -150,3 +160,19 @@ libgpublas = library('blas2cuda', gpu_srcs + sources + blas_level1_sources +
 )
 
 subdir('tests/netlib')
+
+output = [
+  '',
+  '',
+  '   libgpublas ' + meson.project_version(),
+  '  ===============',
+  '',
+  '    Prefix....................... ' + prefix,
+  '    Runtime...................... ' + runtime,
+  '    C/C++ flags.................. ' + ' '.join(c_args),
+  '',
+  '  Now type \'ninja -C ' + meson.build_root() + '\' to build ' + meson.project_name(),
+  '',
+  '',
+]
+message('\n'.join(output))
diff --git a/meson_options.txt b/meson_options.txt
index 95cb1e1..8e94a59 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,4 +1,4 @@
 option('CUDA', type: 'string', value: '/opt/cuda', description: 'Path to CUDA libraries and headers')
-option('opencl', type: 'boolean', value: false, description: 'Use OpenCL even if CUDA is present')
-option('optflags', type: 'string', value: '-O3', description: 'Optimization flags')
+option('runtime', type: 'combo', choices: ['auto', 'opencl', 'cuda'], value: 'auto', description: 'GPU runtime to build against; auto uses CUDA when its libraries are found, otherwise OpenCL')
+option('optflags', type: 'string', value: '-O2', description: 'Optimization flags')
 option('blas_opt', type: 'boolean', value: true, description: 'Optimize when to use GPU for BLAS calls.')
diff --git a/runtime-blas.h b/runtime-blas.h
index 3add98d..3d20298 100644
--- a/runtime-blas.h
+++ b/runtime-blas.h
@@ -63,7 +63,7 @@ static inline const char *runtime_blas_error_msg(runtime_blas_error_t error) {
 }
 
 #else
-#error "Only CUDA and OpenCL are supported"
+#error "Only CUDA and OpenCL are supported. Define either USE_OPENCL or USE_CUDA"
 #endif
 
 #ifdef __cplusplus
diff --git a/runtime-mem.hpp b/runtime-mem.hpp
index 71d3e7b..3a9456a 100644
--- a/runtime-mem.hpp
+++ b/runtime-mem.hpp
@@ -6,14 +6,17 @@
 #include <type_traits>
 #include <iostream>
 
+extern size_t b2c_hits, b2c_misses;
+
 #if USE_OPENCL
 extern cl_command_queue opencl_cmd_queue;
+extern cl_context opencl_ctx;
 #endif
 
 /**
- * 
+ * RAII for GPU buffers.
  */
-template <class T>
+template <typename T, bool is_const = std::is_same<T, const T>::value>
 class gpuptr {
 private:
     T *host_ptr;
@@ -28,22 +31,11 @@ class gpuptr {
     const struct objinfo *o_info;
 
 #if USE_OPENCL
-    // if the type is const
-    template <class U, std::enable_if_t<std::is_same<U, const U>::value, int> = 0>
-    cl_mem_flags get_mem_flags() { return CL_MEM_READ_ONLY; }
-
-    // if the type is non-const
-    template <class U, std::enable_if_t<!std::is_same<U, const U>::value, int> = 0>
-    cl_mem_flags get_mem_flags() { return CL_MEM_READ_WRITE; }
-
+    cl_mem_flags get_mem_flags() { return is_const ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE; }
 #endif
 
     gpuptr(T *host_ptr, size_t size) : host_ptr(host_ptr), size(size), gpu_ptr(0), grabbed(false), o_info(0) {
         runtime_error_t err;
-        extern size_t b2c_hits, b2c_misses;
-#if USE_OPENCL
-        extern cl_context opencl_ctx;
-#endif
         objtracker_guard guard;
 
         if (size == 0) {
@@ -51,7 +43,7 @@ class gpuptr {
 #if USE_CUDA
             err = runtime_malloc((void **)&this->gpu_ptr, dummy_size);
 #elif USE_OPENCL
-            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags<T>(), dummy_size, NULL, &err);
+            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), dummy_size, NULL, &err);
 #endif
             if (runtime_is_error(err)) {
                 writef(STDERR_FILENO, "blas2cuda: failed to allocate %zu B on device: %s\n",
@@ -66,7 +58,7 @@ class gpuptr {
 #if USE_CUDA
             err = runtime_malloc((void **)&this->gpu_ptr, size);
 #else
-            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags<T>(), size, NULL, &err);
+            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags(), size, NULL, &err);
 #endif
 
             if (runtime_is_error(err)) {
@@ -76,18 +68,12 @@ class gpuptr {
             }
         } else if ((this->o_info = obj_tracker_objinfo_subptr((void *)host_ptr))) {
             // host_ptr is already shared with GPU
-            assert(this->o_info->ptr == host_ptr);
-            err = runtime_svm_unmap((void *)host_ptr);
-            if (runtime_is_error(err)) {
-                writef(STDERR_FILENO, "blas2cuda: failed to unmap %p from host: %s\n",
-                        host_ptr, runtime_error_string(err));
-                abort();
-            }
 #if USE_CUDA
             this->gpu_ptr = host_ptr;
 #else
             // create a buffer that is backed by SVM
-            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags<T>() | CL_MEM_USE_HOST_PTR, size, (void *)host_ptr, &err);
+            assert(this->o_info->ptr == host_ptr && "cannot create CL buffer within CL buffer");
+            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_USE_HOST_PTR, size, (void *)host_ptr, &err);
             if (runtime_is_error(err)) {
                 writef(STDERR_FILENO, "blas2cuda: failed to create a buffer backed by %p: %s\n",
                         host_ptr, runtime_error_string(err));
@@ -100,7 +86,7 @@ class gpuptr {
 #if USE_CUDA
             err = runtime_malloc((void **)&this->gpu_ptr, size);
 #else
-            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags<T>() | CL_MEM_COPY_HOST_PTR, size, (void *)host_ptr, &err);
+            this->gpu_ptr = clCreateBuffer(opencl_ctx, this->get_mem_flags() | CL_MEM_COPY_HOST_PTR, size, (void *)host_ptr, &err);
 #endif
 
             if (runtime_is_error(err)) {
@@ -127,37 +113,35 @@ class gpuptr {
         }
     }
 
-    // if the type is const, do nothing
-    template <class U, std::enable_if_t<std::is_same<U, const U>::value, int> = 0>
-    void cleanup_unmanaged() { }
-
-    // if the type is non-const
-    template <class U, std::enable_if_t<!std::is_same<U, const U>::value, int> = 0>
+private:
     void cleanup_unmanaged() {
-        runtime_error_t err;
-
-        // copy the GPU buffer back to host
-        if (this->grabbed) {
-#if USE_CUDA
-            err = runtime_memcpy_dtoh(this->host_ptr, this->gpu_ptr, this->size);
-#else
-            err = clEnqueueReadBuffer(opencl_cmd_queue, this->gpu_ptr, CL_TRUE, 0, this->size, this->host_ptr, 0, NULL, NULL);
-#endif
-            if (runtime_is_error(err)) {
-                writef(STDERR_FILENO, "blas2cuda: failed to copy %zu B from %p (GPU) ---> %p (CPU): %s\n", 
-                        this->size, this->gpu_ptr, this->host_ptr, runtime_error_string(err));
-                abort();
+        if (!is_const) {
+            runtime_error_t err;
+
+            // copy the GPU buffer back to host
+            if (this->grabbed) {
+    #if USE_CUDA
+                err = runtime_memcpy_dtoh((void *) this->host_ptr, this->gpu_ptr, this->size); // cast: this body is also instantiated for const T
+    #else
+                err = clEnqueueReadBuffer(opencl_cmd_queue, this->gpu_ptr, CL_TRUE, 0, this->size, (void *) this->host_ptr, 0, NULL, NULL);
+    #endif
+                if (runtime_is_error(err)) {
+                    writef(STDERR_FILENO, "blas2cuda: failed to copy %zu B from %p (GPU) ---> %p (CPU): %s\n", 
+                            this->size, this->gpu_ptr, this->host_ptr, runtime_error_string(err));
+                    abort();
+                }
             }
         }
     }
 
+public:
     ~gpuptr() {
         runtime_error_t err = RUNTIME_ERROR_SUCCESS;
         objtracker_guard guard;
 
         if (!this->o_info) {
             if (this->size > 0)
-                this->cleanup_unmanaged<T>();
+                this->cleanup_unmanaged();
             // free the temporary GPU buffer
 #if USE_CUDA
             err = runtime_free((void *) this->gpu_ptr);
@@ -180,6 +164,7 @@ class gpuptr {
         }
     }
 
+    // TODO: return const cl_mem if underlying buffer is constant
 #if USE_CUDA
     operator T*() {
 #else
diff --git a/runtime.c b/runtime.c
index c172231..6700156 100644
--- a/runtime.c
+++ b/runtime.c
@@ -14,6 +14,7 @@ struct opencl_device {
     char *name;
     cl_device_type type;
     cl_device_svm_capabilities svm_capabilities;
+    bool is_valid;
 };
 
 struct opencl_platform {
@@ -21,6 +22,7 @@ struct opencl_platform {
     char *name;
     cl_uint num_devices;
     struct opencl_device *devices;
+    bool is_valid;
 };
 
 struct opencl_platform *opencl_platforms;
@@ -39,58 +41,67 @@ struct opencl_platform *runtime_get_platforms(cl_int *err_in, cl_uint *nplatform
     if ((*err_in = clGetPlatformIDs(num_platforms, platform_ids, NULL)) != CL_SUCCESS)
         goto end;
 
+    // get platform info
     for (cl_uint p = 0; p < num_platforms; p++) {
+        struct opencl_platform *const curr_platform = &platforms[p];
         size_t platform_name_sz;
-        platforms[p].id = platform_ids[p];
 
-        if ((*err_in = clGetDeviceIDs(platforms[p].id, CL_DEVICE_TYPE_ALL, 0, NULL, &platforms[p].num_devices)) != CL_SUCCESS)
-            goto end;
-        platforms[p].devices = calloc(platforms[p].num_devices, sizeof *platforms[p].devices);
-        device_ids = realloc(device_ids, platforms[p].num_devices * sizeof *device_ids);
-        if ((*err_in = clGetDeviceIDs(platforms[p].id, CL_DEVICE_TYPE_ALL, platforms[p].num_devices, device_ids, NULL)) != CL_SUCCESS)
-            goto end;
+        curr_platform->id = platform_ids[p];
 
-        if ((*err_in = clGetPlatformInfo(platforms[p].id, CL_PLATFORM_NAME, 0, NULL, &platform_name_sz)) != CL_SUCCESS)
+        if ((*err_in = clGetDeviceIDs(curr_platform->id, CL_DEVICE_TYPE_ALL, 0, NULL, &curr_platform->num_devices)) != CL_SUCCESS)
             goto end;
-        platforms[p].name = calloc(1, platform_name_sz);
-        if ((*err_in = clGetPlatformInfo(platforms[p].id, CL_PLATFORM_NAME, platform_name_sz, platforms[p].name, NULL)) != CL_SUCCESS)
-            goto end;
-        writef(STDOUT_FILENO, "Platform [%u] = %s\n", p, platforms[p].name);
-
-        for (cl_uint d = 0; d < platforms[p].num_devices; ++d) {
+        curr_platform->devices = calloc(curr_platform->num_devices, sizeof *curr_platform->devices);
+        device_ids = realloc(device_ids, curr_platform->num_devices * sizeof *device_ids);
+        if ((*err_in = clGetDeviceIDs(curr_platform->id, CL_DEVICE_TYPE_ALL, curr_platform->num_devices, device_ids, NULL)) != CL_SUCCESS)
+            continue;
+
+        if ((*err_in = clGetPlatformInfo(curr_platform->id, CL_PLATFORM_NAME, 0, NULL, &platform_name_sz)) != CL_SUCCESS)
+            continue;
+        curr_platform->name = calloc(1, platform_name_sz);
+        if ((*err_in = clGetPlatformInfo(curr_platform->id, CL_PLATFORM_NAME, platform_name_sz, curr_platform->name, NULL)) != CL_SUCCESS)
+            continue;
+        writef(STDOUT_FILENO, "Platform [%u] = %s\n", p, curr_platform->name);
+
+        // get device info
+        for (cl_uint d = 0; d < curr_platform->num_devices; ++d) {
+            struct opencl_device *const curr_device = &curr_platform->devices[d];
             size_t device_name_sz;
 
-            platforms[p].devices[d].id = device_ids[d];
+            curr_device->id = device_ids[d];
 
             if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, 0, NULL, &device_name_sz)) != CL_SUCCESS)
-                goto end;
-            platforms[p].devices[d].name = calloc(1, device_name_sz);
+                continue;
+            curr_device->name = calloc(1, device_name_sz);
             if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, 
                             device_name_sz, 
-                            platforms[p].devices[d].name, 
+                            curr_device->name, 
                             NULL)) != CL_SUCCESS)
-                goto end;
+                continue;
 
             if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_TYPE, 
-                            sizeof platforms[p].devices[d].type, 
-                            &platforms[p].devices[d].type, 
+                            sizeof curr_device->type, 
+                            &curr_device->type, 
                             NULL)) != CL_SUCCESS)
                 continue;
 
-            writef(STDOUT_FILENO, "  Device [%u] = %s (%s)\n", d, platforms[p].devices[d].name, clDeviceTypeGetString(platforms[p].devices[d].type));
-            writef(STDOUT_FILENO, " SVM capabilities:\n");
+            writef(STDOUT_FILENO, "  Device [%u] = %s (%s)\n", d, curr_device->name, clDeviceTypeGetString(curr_device->type));
+            writef(STDOUT_FILENO, "   SVM capabilities:\n");
 
             if ((*err_in = clGetDeviceInfo(device_ids[d], CL_DEVICE_SVM_CAPABILITIES, 
-                            sizeof platforms[p].devices[d].svm_capabilities, 
-                            &platforms[p].devices[d].svm_capabilities, 
+                            sizeof curr_device->svm_capabilities, 
+                            &curr_device->svm_capabilities, 
                             NULL)) != CL_SUCCESS)
                 continue;
+            
+            curr_device->is_valid = true;
 
-            writef(STDOUT_FILENO, "    Course-grained buffer?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "yes" : "no");
-            writef(STDOUT_FILENO, "    Fine-grained buffer?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "yes" : "no");
-            writef(STDOUT_FILENO, "    Fine-grained system?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "yes" : "no");
-            writef(STDOUT_FILENO, "    Atomics?: %s\n", platforms[p].devices[d].svm_capabilities & CL_DEVICE_SVM_ATOMICS ? "yes" : "no");
+            writef(STDOUT_FILENO, "    Course-grained buffer?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "yes" : "no");
+            writef(STDOUT_FILENO, "    Fine-grained buffer?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "yes" : "no");
+            writef(STDOUT_FILENO, "    Fine-grained system?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "yes" : "no");
+            writef(STDOUT_FILENO, "    Atomics?: %s\n", curr_device->svm_capabilities & CL_DEVICE_SVM_ATOMICS ? "yes" : "no");
         }
+
+        curr_platform->is_valid = true;
     }
 
 end:
@@ -104,44 +115,78 @@ struct opencl_platform *runtime_get_platforms(cl_int *err_in, cl_uint *nplatform
 }
 
 void opencl_platform_cleanup(struct opencl_platform platform) {
-    for (cl_uint d = 0; d < platform.num_devices; d++)
+    for (cl_uint d = 0; platform.devices && d < platform.num_devices; d++)
         free(platform.devices[d].name);
     free(platform.devices);
     free(platform.name);
 }
 
-#endif
+#endif /* USE_OPENCL */
 
 runtime_error_t runtime_init(runtime_init_info_t info) {
 #if USE_CUDA
     return cudaSuccess;
 #else
     runtime_error_t err;
+    bool have_platform = false;
     opencl_platforms = runtime_get_platforms(&err, &num_platforms);
 
-    if (runtime_is_error(err)) {
-        writef(STDERR_FILENO, "blas2cuda: %s: failed to get OpenCL runtimes - %s\n", __func__, runtime_error_string(err));
+    for (cl_uint p = 0; p < num_platforms; ++p)
+        if (opencl_platforms[p].is_valid) {
+            have_platform = true;
+            break;
+        }
+
+    if (!have_platform) {
+        writef(STDERR_FILENO, "blas2cuda: %s: failed to get OpenCL platforms - %s\n", __func__, runtime_error_string(err));
         abort();
     }
 
     // create a context for a platform and device
-    opencl_ctx = clCreateContext(
-            (cl_context_properties[]){
-                CL_CONTEXT_PLATFORM,
-                (cl_context_properties) opencl_platforms[info.platform].id,
-                0
-            }, 1,
-            &opencl_platforms[info.platform].devices[info.device].id, NULL,
-            NULL, &err);
-
-    if (runtime_is_error(err))
+    struct opencl_platform *selected_platform = NULL;
+    struct opencl_device *selected_device = NULL;
+    for (cl_uint p = 0; p < num_platforms && !selected_device; p++) {
+        struct opencl_platform *const curr_platform = &opencl_platforms[p];
+        if (!curr_platform->is_valid)
+            continue;
+        for (cl_uint d = 0; d < curr_platform->num_devices && !selected_device; d++) {
+            struct opencl_device *const curr_device = &curr_platform->devices[d];
+            if (!curr_device->is_valid)
+                continue;
+            opencl_ctx = clCreateContext(
+                    (cl_context_properties[]){
+                        CL_CONTEXT_PLATFORM,
+                        (cl_context_properties) curr_platform->id,
+                        0
+                    }, 1,
+                    &curr_device->id, NULL,
+                    NULL, &err);
+            if (runtime_is_error(err))
+                return err;
+            // we will break now that opencl_ctx has been initialized
+            selected_device = curr_device;
+            selected_platform = curr_platform;
+            if (p != info.platform)
+                writef(STDERR_FILENO, "blas2cuda: %s: WARNING: could not select platform #%d (%s)\n", __func__, info.platform, curr_platform->name);
+            if (d != info.device)
+                writef(STDERR_FILENO, "blas2cuda: %s: WARNING: could not select device #%d (%s)\n", __func__, info.device, curr_device->name);
+        }
+    }
+
+    if (!selected_device) {
+        writef(STDERR_FILENO, "blas2cuda: %s: FATAL: could not select a device\n", __func__);
         return err;
+    }
 
     // create a command queue for the device
     opencl_cmd_queue = clCreateCommandQueueWithProperties(
-            opencl_ctx, opencl_platforms[info.platform].devices[info.device].id,
+            opencl_ctx, selected_device->id,
             (cl_queue_properties[]) { 0 },
             &err);
+    
+    if (!runtime_is_error(err))
+        writef(STDOUT_FILENO, "blas2cuda: %s: selected %s [%s]\n", 
+               __func__, selected_platform->name, selected_device->name);
 
     return err;
 #endif
diff --git a/tests/netlib/meson.build b/tests/netlib/meson.build
index 939d621..e961a23 100644
--- a/tests/netlib/meson.build
+++ b/tests/netlib/meson.build
@@ -12,10 +12,10 @@ sh = find_program('./test.py')
 libm_dep = cc.find_library('m')
 libft_dep = cc.find_library('gfortran')
 
-foreach prefix : exe_prefixes
-  exe = executable(prefix + '-blas', [prefix + '.f'], include_directories: incdir, 
-    dependencies: [libblas_dep, libm_dep, libft_dep], c_args: ['-O0', '-g', '-ggdb3'])
-  test(prefix + '-' + 'correctness', sh, 
-    args: [prefix, exe, files('input.'+prefix), libgpublas, '--nointeractive'],
+foreach exe_prefix : exe_prefixes
+  exe = executable(exe_prefix + '-blas', [exe_prefix + '.f'], include_directories: incdir, 
+    dependencies: [libblas_dep, libm_dep, libft_dep], c_args: ['-O0', '-g', '-ggdb3'], link_with: [libgpublas])
+  test(exe_prefix + '-' + 'correctness', sh, 
+    args: [exe_prefix, exe, files('input.'+exe_prefix), libgpublas, '--nointeractive'],
     timeout: 300)
 endforeach
diff --git a/tests/netlib/test.py b/tests/netlib/test.py
index 6c02f9d..9ddd381 100755
--- a/tests/netlib/test.py
+++ b/tests/netlib/test.py
@@ -12,6 +12,7 @@
     parser.add_argument('input', help='input file to test binary')
     parser.add_argument('gpublas', help='location of libgpublas')
     parser.add_argument('--nointeractive', action='store_true')
+    parser.add_argument('--valgrind', action='store_true')
 
     args = parser.parse_args()
     args.input = os.path.abspath(args.input)
@@ -38,11 +39,24 @@
             os._exit(126)
     else:
         try:
-            subprocess.run(['gdb', args.binary, '-ex', f'set exec-wrapper env LD_PRELOAD={args.gpublas}', '-ex', f'run <{args.input}'], check=True)
+            proc = None
+            if args.valgrind:  # start the binary under valgrind's gdbserver, then attach gdb to it
+                proc = subprocess.Popen(['valgrind', '--vgdb=yes', '--vgdb-error=0', args.binary], stdin=open(args.input, 'rb'), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)  # no shell: '&>' is a bashism
+                try:
+                    proc.wait(3)  # give valgrind a moment to start (or fail early)
+                except subprocess.TimeoutExpired:  # expected: valgrind is blocked waiting for gdb
+                    pass
+                subprocess.run(['gdb', args.binary, '-ex', 'set non-stop off', '-ex', 'target remote | vgdb'], check=True)
+                proc.wait()
+            else:
+                subprocess.run(['gdb', args.binary, '-ex', f'set exec-wrapper env LD_PRELOAD={args.gpublas}', '-ex', f'run <{args.input}'], check=True)
             subprocess.run(['edit', f'{args.test.upper()}.SUMM'], check=True)
         except KeyboardInterrupt:
-            pass
+            if proc:
+                proc.kill()
         except:
+            if proc:
+                proc.kill()
             os._exit(126)
     print(f'Done testing {args.test}')
     os._exit(retval)