diff --git a/.Rbuildignore b/.Rbuildignore
index 7ce4f93..60e7f1c 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -14,3 +14,4 @@ docs/
 extradata/
 revdep/
 ^CRAN-SUBMISSION$
+bench/
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
deleted file mode 100644
index 50a78d1..0000000
--- a/.github/FUNDING.yml
+++ /dev/null
@@ -1,2 +0,0 @@
-# These are supported funding model platforms
-custom: ['www.rexy.ai']
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index 3588d0f..526ee8e 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -1,28 +1,45 @@
-# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
-# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches:
-      - master
+    branches: [master]
   pull_request:
-    branches:
-      - master
+    branches: [master]
 
 name: R-CMD-check
 
 jobs:
   R-CMD-check:
-    runs-on: macOS-latest
+    runs-on: ubuntu-latest
+
+    name: (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {r: 'devel'}
+          # minimal required R version
+          - {r: '3.6.0'}
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: "yes"
+
     steps:
-      - uses: actions/checkout@v2
-      - uses: r-lib/actions/setup-r@v1
-      - name: Install dependencies
-        run: |
-          install.packages(c("remotes", "rcmdcheck", "Matrix"))
-          remotes::install_deps(dependencies = TRUE)
-        shell: Rscript {0}
-      - name: Check
-        run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error")
-        shell: Rscript {0}
+      - uses: actions/checkout@v3
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck, any::Matrix
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index 200516b..a2171a3 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -1,47 +1,31 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches:
-      - master
+    branches: [master]
   pull_request:
-    branches:
-      - master
+    branches: [master]
 
 name: test-coverage
 
 jobs:
   test-coverage:
-    runs-on: macOS-latest
+    runs-on: ubuntu-latest
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-
-      - uses: actions/checkout@v2
-
-      - uses: r-lib/actions/setup-r@master
-
-      - uses: r-lib/actions/setup-pandoc@master
 
-      - name: Query dependencies
-        run: |
-          install.packages('remotes')
-          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
-          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
-        shell: Rscript {0}
+    steps:
+      - uses: actions/checkout@v3
 
-      - name: Cache R packages
-        uses: actions/cache@v1
+      - uses: r-lib/actions/setup-r@v2
         with:
-          path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+          use-public-rspm: true
 
-      - name: Install dependencies
-        run: |
-          install.packages(c("remotes", "Matrix"))
-          remotes::install_deps(dependencies = TRUE)
-          remotes::install_cran("covr")
-        shell: Rscript {0}
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::covr, any::Matrix
+          needs: coverage
 
       - name: Test coverage
-        run: covr::codecov()
+        run: covr::codecov(quiet = FALSE)
         shell: Rscript {0}
diff --git a/.gitignore b/.gitignore
index 2eec536..1beb33f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ autom4te.cache
 src/Makevars
 revdep
 .Rprofile
+bench/
diff --git a/NEWS.md b/NEWS.md
index e5cb2a5..13399a4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# rsparse dev
+- faster WRMF solver, see #72, #75
+- updated GitHub Actions workflows
+
 # rsparse 0.5.1 (2022-09-11)
 - update `configure` script, thanks to @david-cortes, see #73
 - minor fixes in WRMF
diff --git a/R/model_WRMF.R b/R/model_WRMF.R
index 2e1dda1..1ead70c 100644
--- a/R/model_WRMF.R
+++ b/R/model_WRMF.R
@@ -180,10 +180,15 @@ WRMF = R6::R6Class(
           RhpcBLASctl::blas_set_num_threads(blas_threads_keep)
         })
       }
-
+      logger$debug("converting input user-item matrix")
       c_ui = MatrixExtra::as.csc.matrix(x)
+      # c_ui = as(x, "CsparseMatrix")
+      logger$debug("pre-processing input")
       c_ui = private$preprocess(c_ui)
-      c_iu = MatrixExtra::t_shallow(MatrixExtra::as.csr.matrix(x))
+      logger$debug("creating item-user matrix")
+      c_iu = MatrixExtra::t_shallow(MatrixExtra::as.csr.matrix(c_ui))
+      # c_iu = t(c_ui)
+      logger$debug("created item-user matrix")
       # store item_ids in order to use them in predict method
       private$item_ids = colnames(c_ui)
 
@@ -195,7 +200,7 @@ WRMF = R6::R6Class(
       n_user = nrow(c_ui)
       n_item = ncol(c_ui)
 
-      logger$trace("initializing U")
+      logger$debug("initializing U")
       if (private$precision == "double") {
         private$U = large_rand_matrix(private$rank, n_user)
         # for item biases
@@ -210,7 +215,7 @@ WRMF = R6::R6Class(
       }
 
       if (is.null(self$components)) {
-
+        logger$debug("initializing components")
         if (private$solver_code == 1L) { ### <- cholesky
           if (private$precision == "double") {
             self$components = matrix(0, private$rank, n_item)
@@ -331,6 +336,7 @@ WRMF = R6::R6Class(
 
         loss_prev_iter = loss
       }
+      logger$debug("solver finished")
 
       if (private$precision == "double")
         data.table::setattr(self$components, "dimnames", list(NULL, colnames(x)))
@@ -341,12 +347,16 @@ WRMF = R6::R6Class(
       rank_ = ifelse(private$with_user_item_bias, private$rank - 1L, private$rank)
       ridge = fl(diag(x = private$lambda, nrow = rank_, ncol = rank_))
       XX = if (private$with_user_item_bias) self$components[-1L, , drop = FALSE] else self$components
+
+      RhpcBLASctl::blas_set_num_threads(RhpcBLASctl::get_num_cores())
       private$XtX = tcrossprod(XX) + ridge
+      RhpcBLASctl::blas_set_num_threads(1)
 
       # call extra transform to ensure results from transform() and fit_transform()
       # are the same (due to avoid_cg, etc)
       # this adds some extra computation, but not a big deal though
-      self$transform(x)
+      # (previously `self$transform(x)`; the private helper below reuses the preprocessed c_iu)
+      private$transform_(c_iu, ...)
     },
     # project new users into latent user space - just make ALS step given fixed items matrix
     #' @description create user embeddings for new input
@@ -366,6 +376,41 @@ WRMF = R6::R6Class(
         x = MatrixExtra::t_shallow(x)
       }
 
+      x = private$preprocess(x)
+
+      if (self$global_bias != 0. && private$feedback == "explicit")
+        x@x = x@x - self$global_bias
+
+      private$transform_(x, ...)
+    }
+  ),
+  #### private -----
+  private = list(
+    solver_code = NULL,
+    cg_steps = NULL,
+    scorers = NULL,
+    lambda = NULL,
+    dynamic_lambda = FALSE,
+    rank = NULL,
+    non_negative = NULL,
+    cnt_u = NULL,
+    # user factor matrix = rank * n_users
+    U = NULL,
+    # item factor matrix = rank * n_items
+    I = NULL,
+    # preprocess - transformation of input matrix before passing it to ALS
+    # for example we can scale each row or apply log() to values
+    # this is essentially "confidence" transformation from WRMF article
+    preprocess = NULL,
+    feedback = NULL,
+    precision = NULL,
+    XtX = NULL,
+    solver = NULL,
+    with_user_item_bias = NULL,
+    with_global_bias = NULL,
+    init_user_item_bias = NULL,
+    transform_ = function(x, ...) {
+      logger$debug('starting transform')
       if (private$feedback == "implicit" ) {
         logger$trace("WRMF$transform(): calling `RhpcBLASctl::blas_set_num_threads(1)` (to avoid thread contention)")
         blas_threads_keep = RhpcBLASctl::blas_get_num_procs()
@@ -375,11 +420,6 @@ WRMF = R6::R6Class(
           RhpcBLASctl::blas_set_num_threads(blas_threads_keep)
         })
       }
-
-      x = private$preprocess(x)
-      if (self$global_bias != 0. && private$feedback == "explicit")
-        x@x = x@x - self$global_bias
-
       if (private$precision == "double") {
         res = matrix(0, nrow = private$rank, ncol = ncol(x))
       } else {
@@ -389,7 +429,7 @@ WRMF = R6::R6Class(
       if (private$with_user_item_bias) {
         res[1, ] = if(private$precision == "double") 1.0 else float::fl(1.0)
       }
-
+      logger$debug('starting transform solver')
       loss = private$solver(
         x,
         self$components,
@@ -399,6 +439,7 @@ WRMF = R6::R6Class(
         cnt_X = private$cnt_u,
         avoid_cg = TRUE
       )
+      logger$debug('finished transform solver')
 
       res = t(res)
 
@@ -406,35 +447,9 @@ WRMF = R6::R6Class(
         setattr(res, "dimnames", list(colnames(x), NULL))
       else
         setattr(res@Data, "dimnames", list(colnames(x), NULL))
-
+      logger$debug('finished transform')
       res
     }
-  ),
-  #### private -----
-  private = list(
-    solver_code = NULL,
-    cg_steps = NULL,
-    scorers = NULL,
-    lambda = NULL,
-    dynamic_lambda = FALSE,
-    rank = NULL,
-    non_negative = NULL,
-    cnt_u = NULL,
-    # user factor matrix = rank * n_users
-    U = NULL,
-    # item factor matrix = rank * n_items
-    I = NULL,
-    # preprocess - transformation of input matrix before passing it to ALS
-    # for example we can scale each row or apply log() to values
-    # this is essentially "confidence" transformation from WRMF article
-    preprocess = NULL,
-    feedback = NULL,
-    precision = NULL,
-    XtX = NULL,
-    solver = NULL,
-    with_user_item_bias = NULL,
-    with_global_bias = NULL,
-    init_user_item_bias = NULL
   )
 )
 
@@ -465,7 +480,9 @@ als_implicit = function(
     } else {
       XX = X
     }
+    RhpcBLASctl::blas_set_num_threads(RhpcBLASctl::get_num_cores())
     XtX = tcrossprod(XX) + ridge
+    RhpcBLASctl::blas_set_num_threads(1)
   }
   if (is.null(global_bias_base)) {
     global_bias_base = numeric()
diff --git a/README.md b/README.md
index eb50aa1..f2aea63 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,6 @@
 
 We've paid some attention to the implementation details - we try to avoid data copies, utilize multiple threads via OpenMP and use SIMD where appropriate. Package **allows to work on datasets with millions of rows and millions of columns**.
 
-
-### Support 
-
-Please reach us if you need **commercial support** - [hello@rexy.ai](mailto:hello@rexy.ai).
-
-
-
 # Features
 
 ### Classification/Regression
diff --git a/inst/include/wrmf_implicit.hpp b/inst/include/wrmf_implicit.hpp
index f2841b0..d8fe1c7 100644
--- a/inst/include/wrmf_implicit.hpp
+++ b/inst/include/wrmf_implicit.hpp
@@ -149,14 +149,10 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
       // C = 1 (so we omit multiplication on eye matrix)
       // rhs = X * eye * (0 - x_biases) = -X * x_biases
       rhs_init *= -x_biases;
-    }
-
-    else {
+    } else {
       rhs_init = - (drop_row<T>(X, is_x_bias_last_row) * (x_biases + global_bias));
     }
-  }
-
-  else if (global_bias) {
+  } else if (global_bias) {
     rhs_init = arma::Mat<T>(&global_bias_base[0], rank - (int)with_biases, 1, false, true);
   }
 
@@ -164,7 +160,17 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
   double loss = 0;
   size_t nc = Conf.n_cols;
 #ifdef _OPENMP
-#pragma omp parallel for num_threads(n_threads) schedule(dynamic, GRAIN_SIZE) reduction(+:loss)
+#pragma omp parallel num_threads(n_threads)
+#endif
+{
+  arma::Mat<T> X_nnz;
+  arma::Mat<T> X_nnz_t;
+  arma::Col<T> init;
+  arma::Col<T> Y_new;
+  arma::Mat<T> rhs;
+
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic) reduction(+:loss)
 #endif
   for (size_t i = 0; i < nc; i++) {
     arma::uword p1 = Conf.col_ptrs[i];
@@ -175,8 +181,8 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
       const arma::uvec idx = arma::uvec(&Conf.row_indices[p1], p2 - p1, false, true);
       arma::Col<T> confidence =
           arma::conv_to<arma::Col<T> >::from(arma::vec(&Conf.values[p1], p2 - p1));
-      arma::Mat<T> X_nnz = X.cols(idx);
-      arma::Col<T> init = Y.col(i);
+      X_nnz = X.cols(idx);
+      init = Y.col(i);
       // if is_x_bias_last_row == true
       // X_nnz = [1, ...]
       // if is_x_bias_last_row == false
@@ -185,22 +191,22 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
         X_nnz = drop_row<T>(X_nnz, is_x_bias_last_row);
         init = drop_row<T>(init, !is_x_bias_last_row);
       }
-      arma::Col<T> Y_new;
 
       if (solver == CONJUGATE_GRADIENT) {
         if (!with_biases && !global_bias)
           Y_new = cg_solver_implicit<T>(X_nnz, confidence, init, cg_steps, XtX);
-        else if (with_biases)
+        else if (with_biases) {
+          init = drop_row<T>(init, !is_x_bias_last_row);
           Y_new = cg_solver_implicit_user_item_bias<T>(X_nnz, confidence, init, cg_steps, XtX,
                                                        rhs_init, x_biases(idx), global_bias);
-        else
+        } else {
           Y_new = cg_solver_implicit_global_bias<T>(X_nnz, confidence, init, cg_steps, XtX,
                                                     rhs_init, global_bias);
-
+        }
       } else {
         const arma::Mat<T> lhs =
-            XtX + X_nnz.each_row() % (confidence.t() - 1) * X_nnz.t();
-        arma::Mat<T> rhs;
+          XtX + X_nnz.each_row() % (confidence.t() - 1) * X_nnz.t();
+
         if (with_biases) {
           // now we need to update rhs with rhs_init and take into account
           // items with interactions (p=1)
@@ -227,7 +233,7 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
         if (solver == SEQ_COORDINATE_WISE_NNLS) {
           Y_new = c_nnls<T>(lhs, rhs, init, SCD_MAX_ITER, SCD_TOL);
         } else {  // CHOLESKY
-          Y_new = solve(lhs, rhs, arma::solve_opts::fast);
+          Y_new = solve(lhs, rhs, arma::solve_opts::fast + arma::solve_opts::likely_sympd);
         }
       }
 
@@ -276,7 +282,7 @@ T als_implicit(const dMappedCSC& Conf, arma::Mat<T>& X, arma::Mat<T>& Y,
       }
     }
   }
-
+}
   if (lambda > 0) {
     if (with_biases) {
       // lambda applied to all learned parameters: