diff --git a/README.md b/README.md
index c890df2..ebf6f70 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ PyExaFMM
 [![Anaconda-Server Badge](https://img.shields.io/conda/v/skailasa/pyexafmm.svg)](https://anaconda.org/skailasa/pyexafmm) [![Anaconda-Server Badge](https://anaconda.org/skailasa/pyexafmm/badges/latest_release_date.svg)](https://anaconda.org/skailasa/pyexafmm) [![Anaconda-Server Badge](https://anaconda.org/skailasa/pyexafmm/badges/platforms.svg)](https://anaconda.org/skailasa/pyexafmm)
 
-PyExaFMM is an adaptive particle kernel-independent FMM based on [1], written in pure Python with some extensions. Representing a compromise between portability, ease of use, and performance. Optimisations are currently implemented using Numba, Numpy, and CUDA acceleration.
+PyExaFMM is an adaptive particle kernel-independent FMM based on [1], written in pure Python with some extensions, representing a compromise between portability, ease of use, and performance. Optimisations are currently implemented using Numba.
 
-The goal of PyExaFMM is to develop a highly performant implementation of the adaptive particle FMM written in Python, as the utility of FMM algorithms are hindered by their relatively complex implementation, especially for achieving high-performance.
+The goal of PyExaFMM is to develop a highly performant implementation of the adaptive particle FMM written in Python, as the utility of FMM algorithms is hindered by their relatively complex implementation, especially when high performance is required.
 
@@ -14,13 +14,8 @@ PyExaFMM compares favourably on realistic problem sizes with naive direct evaluation
 
 Laplace
 
 Here we use an order 5 multipole expansion, an order 6 local expansion, and constrain leaf nodes to contain a maximum of 100 particles. The target rank in our M2L operator compression is kept at 1. The benchmark was run on an Intel i7-9750H processor.
 
-## System Requirements
-
-An NVidia GPU is required, as PyExaFMM is accellerated with CUDA.
-
 ## Install
 
 Download from Anaconda cloud into a conda/mini-conda environment:
 
@@ -60,19 +55,19 @@ This is done via a `config.json` file, PyExaFMM will look for this in your **current working directory**,
 
 ```json
 {
-    "experiment": "fmm",
-    "npoints": 100000,
+    "experiment": "test",
+    "npoints": 10000,
     "data_type": "random",
     "order_equivalent": 5,
-    "order_check": 6,
+    "order_check": 5,
     "kernel": "laplace",
     "backend": "numba",
     "alpha_inner": 1.05,
-    "alpha_outer": 2.95,
+    "alpha_outer": 2.9,
     "max_level": 10,
-    "max_points": 100,
-    "target_rank": 1,
-    "cond": 1e-16
+    "max_points": 150,
+    "target_rank": 10,
+    "tol": 1e-4
 }
 ```
 
@@ -89,7 +84,7 @@ This is done via a `config.json` file, PyExaFMM will look for this in your **current working directory**,
 | `alpha_outer` | Relative size of outer surface's radius. |
 | `max_level` | Depth of octree to use in simulations. |
 | `target_rank` | Target rank in low-rank compression of M2L matrix. |
-| `cond` | Threshold under which to ignore singular values in randomised SVD. |
+| `tol` | Threshold below which singular values are ignored in the SVD used to invert the check-to-equivalent Gram matrices. |
 
-PyExaFMM provides some simple test-data generation functions, which can be configured for. However, to use your own data, simply create a HDF5 file, with the same name as `experiment` in your configuration file, with the following group hierarchy,
+PyExaFMM provides some simple test-data generation functions, which can be configured from `config.json`. However, to use your own data, simply create an HDF5 file with the same name as `experiment` in your configuration file, with the following group hierarchy,
 
diff --git a/fmm/backend/api.py b/fmm/backend/api.py
index d50549f..c056d02 100644
--- a/fmm/backend/api.py
+++ b/fmm/backend/api.py
@@ -2,7 +2,7 @@
 Interface for compute backends.
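+The active backend is selected via the "backend" field of config.json; only
+the Numba backend is provided after this change.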
""" import fmm.backend.numba as numba_backend -import fmm.backend.openmp as openmp_backend BACKEND = { "numba": { @@ -15,16 +14,5 @@ "m2t": numba_backend.m2t, "near_field_u_list": numba_backend.near_field_u_list, "near_field_node": numba_backend.near_field_node, - }, - "openmp": { - "p2m": openmp_backend.p2m, - "m2m": openmp_backend.m2m, - "m2l": openmp_backend.m2l, - "l2l": openmp_backend.l2l, - "s2l": openmp_backend.s2l, - "l2t": openmp_backend.l2t, - "m2t": openmp_backend.m2t, - "near_field_u_list": openmp_backend.near_field, - "near_field_node": openmp_backend.near_field, } } diff --git a/fmm/backend/numba.py b/fmm/backend/numba.py index f026884..c576127 100644 --- a/fmm/backend/numba.py +++ b/fmm/backend/numba.py @@ -319,7 +319,6 @@ def m2l_core( scale : np.float32 Precomputed kernel scale for this level. """ - nv_list = len(v_list) # Index of local expansion @@ -328,16 +327,16 @@ def m2l_core( for i in range(nv_list): source = v_list[i] - midx = key_to_index[source]*nequivalent_points transfer_vector = morton.find_transfer_vector(target, source) - u_idx = hash_to_index[transfer_vector] - u_lidx = u_idx*ncheck_points - - u_sub = u[u_lidx:u_lidx+ncheck_points] + v_idx = hash_to_index[transfer_vector] + v_lidx = v_idx*ncheck_points + v_ridx = v_lidx+ncheck_points + vt_sub = np.copy(vt[:, v_lidx:v_ridx]) - multipole_expansion = multipole_expansions[midx:midx+nequivalent_points] - local_expansions[lidx:lidx+nequivalent_points] += scale*(dc2e_inv @ (u_sub @ (s @ ( vt @ multipole_expansion)))) + m_lidx = key_to_index[source]*nequivalent_points + m_ridx = m_lidx+nequivalent_points + local_expansions[lidx:lidx+nequivalent_points] += (scale*dc2e_inv) @ (u @ (s @ (vt_sub @ multipole_expansions[m_lidx:m_ridx]))) @numba.njit(cache=True, parallel=True) @@ -367,7 +366,7 @@ def m2l( u : np.array(np.float32) Compressed left singular vectors of SVD of M2L Gram matrix for nodes at this level. s : np.array(np.float32) - Compressed singular values of SVD of M2L Gram matrix for nodes at this level. + Compressed singular values of SVD of M2L Gram matrix for nodes at this level.` vt : np.array(np.float32) Compressed right singular vectors of SVD of M2L Gram matrix for nodes at this level. dc2e_inv : np.array(shape=(n_equivalent, n_check), dtype=np.float64) @@ -443,13 +442,13 @@ def l2l( parent = morton.find_parent(key) parent_idx = key_to_index[parent] parent_lidx = parent_idx*nequivalent_points - parent_ridx = (parent_idx+1)*nequivalent_points + parent_ridx = parent_lidx+nequivalent_points parent_equivalent_density = local_expansions[parent_lidx:parent_ridx] # Compute expansion index child_idx = key_to_index[key] child_lidx = child_idx*nequivalent_points - child_ridx = (child_idx+1)*nequivalent_points + child_ridx = child_lidx+nequivalent_points # Compute operator index operator_idx = key == morton.find_siblings(key) @@ -515,9 +514,10 @@ def s2l( p2p_function : function Function handle for kernel P2P. 
""" - level = np.int32(morton.find_level(key)) - scale = np.int32(scale_function(level)) - center = morton.find_physical_center_from_key(key, x0, r0).astype(np.float32) + + level = morton.find_level(key) + scale = scale_function(level) + center = morton.find_physical_center_from_key(key, x0, r0) key_idx = key_to_index[key] key_lidx = key_idx*nequivalent_points @@ -601,10 +601,10 @@ def m2t( source_idx = key_to_index[source] source_lidx = source_idx*nequivalent_points - source_ridx = (source_idx+1)*nequivalent_points + source_ridx = source_lidx+nequivalent_points - source_level = np.int32(morton.find_level(source)) - source_center = morton.find_physical_center_from_key(source, x0, r0).astype(np.float32) + source_level = morton.find_level(source) + source_center = morton.find_physical_center_from_key(source, x0, r0) upward_equivalent_surface = surface.scale_surface( surf=equivalent_surface, @@ -673,8 +673,8 @@ def l2t( source_lidx = (source_idx)*nequivalent_points source_ridx = source_lidx+nequivalent_points - level = np.int32(morton.find_level(key)) - center = morton.find_physical_center_from_key(key, x0, r0).astype(np.float32) + level = morton.find_level(key) + center = morton.find_physical_center_from_key(key, x0, r0) downward_equivalent_surface = surface.scale_surface( equivalent_surface, @@ -878,6 +878,8 @@ def near_field_node( p2p_function ): """ + Evaluate all near field particles for source particles within a given + target node Parameters: ----------- diff --git a/fmm/backend/openmp.py b/fmm/backend/openmp.py deleted file mode 100644 index 81217aa..0000000 --- a/fmm/backend/openmp.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Compute operators accelerated with Numba and OpenMP. -""" -import numpy as np -import numba - -from adaptoctree import morton - -import fmm.surface as surface - - - -def p2m(): - pass - -def m2m(): - pass - -def l2l(): - pass - -def m2l(): - pass - -def s2l(): - pass - -def l2t(): - pass - -def m2t(): - pass - -def near_field(): - pass \ No newline at end of file diff --git a/fmm/fmm.py b/fmm/fmm.py index c373f18..37cb8b1 100644 --- a/fmm/fmm.py +++ b/fmm/fmm.py @@ -180,7 +180,7 @@ def downward_pass(self): # Keys at this level keys = self.complete[self.complete_levels == level] - scale = self.scale_function(level) + scale = np.float32(self.scale_function(level)) # V List interactions # M2L operator stored in terms of its SVD components for each level @@ -218,8 +218,6 @@ def downward_pass(self): for key in keys: - idx = self.key_to_index[key] - # Translate local expansion from the node's parent self.backend['l2l']( key=key, diff --git a/fmm/kernel.py b/fmm/kernel.py index 69295bd..08dce50 100644 --- a/fmm/kernel.py +++ b/fmm/kernel.py @@ -1,15 +1,9 @@ """ Kernels accelerated with CUDA and Numba. """ -import math - import numba -from numba import cuda import numpy as np -# GPU Kernel parameters -BLOCK_WIDTH = 32 -BLOCK_HEIGHT = 1024 M_INV_4PI = 1.0 / (4*np.pi) TOL = 1e-6 @@ -19,7 +13,10 @@ ############################################################################### -@numba.njit(cache=True) +@numba.njit( + [numba.float32(numba.int32)], + cache=True +) def laplace_scale(level): """ Level scale for the Laplace kernel. @@ -32,7 +29,7 @@ def laplace_scale(level): -------- np.float32 """ - return np.float32(1/(2**level)) + return numba.float32(1/(2**level)) @numba.njit(cache=True) @@ -57,242 +54,6 @@ def laplace_cpu(x, y): return res -@cuda.jit(device=True) -def laplace_cuda(ax, ay, az, bx, by, bz): - """ - Numba-Cuda Laplace device kernel. 
- - Parameters: - ----------- - ax : np.float32 - 'x' coordinate of a point. - ay : np.float32 - 'y' coordinate of a point. - az : np.float32 - 'z' coordinate of a point. - bx : np.float32 - 'x' coordinate of a point. - by : np.float32 - 'y' coordinate of a point. - bz : np.float32 - 'z' coordinate of a point. - - Returns: - -------- - nb.cuda.float32 - """ - rx = ax-bx - ry = ay-by - rz = az-bz - - dist = rx**2+ry**2+rz**2 - dist_sqr = math.sqrt(dist) - inv_dist_sqr = 1./(4*math.pi*dist_sqr) - - if math.isinf(inv_dist_sqr): - return 0. - - return numba.float32(inv_dist_sqr) - - -@cuda.jit -def laplace_implicit_gram_matrix( - sources, - targets, - rhs, - result, - height, - width, - idx - ): - """ - Implicitly apply the Gram matrix to a vector, computing Green's function on - the fly on the device, and multiplying by random matrices to approximate - the basis. Multiple RHS are computed in a loop, specified by the index. - - Parameters: - ----------- - sources : np.array(shape=(nsources, 3)) - nsources rows of gram matrix - targets : np.array(shape=(ntargets, 3)) - ntargets columns of gram matrix - rhs : np.array(shape=(ntargets, nrhs)) - Multiple right hand sides, indexed by 'idx' - result : np.array(shape=(height, nrhs)) - Dimensions of result matrix defined by matrix height - and number of right hand sides - height : int - 'height' of implicit gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - width : int - 'width' of implicit gram matrix. - idx: int - RHS index. - """ - - blockWidth = cuda.shared.array(1, numba.int32) - blockxInd = cuda.shared.array(1, numba.int32) - blockyInd = cuda.shared.array(1, numba.int32) - - # Calculate once, and share amongs thread-block - if cuda.threadIdx.x == 0: - if (cuda.blockIdx.x + 1)*BLOCK_WIDTH <= width: - blockWidth[0] = numba.int32(BLOCK_WIDTH) - else: - blockWidth[0] = numba.int32(width % BLOCK_WIDTH) - - blockxInd[0] = numba.int32(cuda.blockIdx.x*BLOCK_WIDTH) - blockyInd[0] = numba.int32(cuda.blockIdx.y*BLOCK_HEIGHT) - - cuda.syncthreads() - - # Extract tile of RHS, of size up to BLOCK_WIDTH - tmp = cuda.shared.array(BLOCK_WIDTH, numba.float32) - - if cuda.threadIdx.x < blockWidth[0]: - tmp[cuda.threadIdx.x] = rhs[idx][blockxInd[0]+cuda.threadIdx.x] - - cuda.syncthreads() - - # Accumulate matvec of tile into variable - row_sum = 0 - - threadyInd = blockyInd[0]+cuda.threadIdx.x - - if threadyInd < height: - for i in range(blockWidth[0]): - col_idx = blockxInd[0] + i - row_idx = threadyInd - source = sources[row_idx] - target = targets[col_idx] - - sx = source[0] - sy = source[1] - sz = source[2] - - tx = target[0] - ty = target[1] - tz = target[2] - - row_sum += laplace_cuda(sx, sy, sz, tx, ty, tz)*tmp[i] - - cuda.atomic.add(result, (threadyInd, idx), row_sum) - - -@cuda.jit -def laplace_implicit_gram_matrix_blocked( - sources, - targets, - rhs, - result, - height, - width, - sub_height, - sub_width, - idx - ): - """ - Implicitly apply the Gram matrix to a vector, computing Green's function on - the fly on the device, and multiplying by random matrices to approximate - the basis. Multiple RHS are computed in a loop, specified by the index. - This kernel assumes that the sources/targets specified by the user - specify multiple Gram matrices, i.e. the global Gram matrix is 'blocked', - - e.g. gram = ((g_1) | (g_2) | ... | (g_{n-1}) | g_n) - - therefore, one additionally needs to specify the dimensions of the - 'sub' gram matrices in order to pick out the correct matrix elements - for application to a given RHS. 
- - Parameters: - ----------- - sources : np.array(shape=(nsources, 3), dtype=np.float32) - nsources rows of gram matrix - targets : np.array(shape=(ntargets, 3), dtype=np.float32) - ntargets columns of gram matrix - rhs : np.array(shape=(ntargets, nrhs), dtype=np.float32) - Multiple right hand sides, indexed by 'idx' - result : np.array(shape=(height, nrhs), dtype=np.float32) - Dimensions of result matrix defined by matrix height - and number of right hand sides - height : np.int32 - 'height' of global Gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - width : np.int32 - 'width' of Global gram matrix. - sub_height : np.int32 - height of sub-Gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - sub_width : np.int32 - 'width' of sub-Gram matrix. - idx: np.int32 - RHS index. - """ - - blockWidth = cuda.shared.array(1, numba.int32) - blockxInd = cuda.shared.array(1, numba.int32) - blockyInd = cuda.shared.array(1, numba.int32) - - # Calculate once, and share amongs thread-block - if cuda.threadIdx.x == 0: - if (cuda.blockIdx.x + 1)*BLOCK_WIDTH <= width: - blockWidth[0] = numba.int32(BLOCK_WIDTH) - else: - blockWidth[0] = numba.int32(width % BLOCK_WIDTH) - - blockxInd[0] = numba.int32(cuda.blockIdx.x*BLOCK_WIDTH) - blockyInd[0] = numba.int32(cuda.blockIdx.y*BLOCK_HEIGHT) - - cuda.syncthreads() - - # Extract tile of RHS, of size up to BLOCK_WIDTH - tmp = cuda.shared.array(BLOCK_WIDTH, numba.float32) - - if cuda.threadIdx.x < blockWidth[0]: - tmp[cuda.threadIdx.x] = rhs[idx][blockxInd[0]+cuda.threadIdx.x] - - cuda.syncthreads() - - # Accumulate matvec of tile into variable - row_sum = 0 - - threadyInd = blockyInd[0]+cuda.threadIdx.x - threadxInd = blockxInd[0]+cuda.threadIdx.x - - # Find the index of the sub-Gram matrix, dependent on orientation. - if sub_width == width: - submatrixInd = numba.int32(math.floor(threadyInd/sub_height))*sub_width - - if sub_height == height: - submatrixInd = numba.int32(math.floor(threadxInd/sub_width))*sub_height - - if threadyInd < height: - for i in range(blockWidth[0]): - col_idx = (blockxInd[0] + i) - row_idx = threadyInd - - # Pick out matrix element for sub-Gram matrix - if sub_width == width: - source = sources[col_idx+submatrixInd] - target = targets[row_idx] - - elif sub_height == height: - source = sources[col_idx] - target = targets[row_idx+submatrixInd] - - sx = source[0] - sy = source[1] - sz = source[2] - - tx = target[0] - ty = target[1] - tz = target[2] - - row_sum += laplace_cuda(sx, sy, sz, tx, ty, tz)*tmp[i] - - cuda.atomic.add(result, (threadyInd, idx), row_sum) - - @numba.njit(cache=True, parallel=False, fastmath=True, error_model="numpy") def laplace_p2p_serial(sources, targets, source_densities): """ @@ -439,11 +200,8 @@ def laplace_gram_matrix_parallel(sources, targets): 'laplace': { 'eval': laplace_cpu, 'scale': laplace_scale, - 'cuda': laplace_cuda, 'dense_gram': laplace_gram_matrix_serial, 'dense_gram_parallel': laplace_gram_matrix_parallel, - 'implicit_gram': laplace_implicit_gram_matrix, - 'implicit_gram_blocked': laplace_implicit_gram_matrix_blocked, 'p2p': laplace_p2p_serial, 'p2p_parallel': laplace_p2p_parallel }, diff --git a/fmm/linalg.py b/fmm/linalg.py new file mode 100644 index 0000000..7a4b95b --- /dev/null +++ b/fmm/linalg.py @@ -0,0 +1,30 @@ +""" +Linear algebra utilities +""" +import numba +import numpy as np + + +@numba.njit(cache=True) +def pinv(a, tol=1e-4): + """ + Moore-Penrose Pseudo-Inverse calculation via SVD. 
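+    Uses np.linalg.svd, which Numba supports in nopython mode, so this
+    helper can itself be jitted and called from other jitted code.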
+
+    Parameters:
+    -----------
+    a : np.array(shape=(m, n), dtype=np.float32)
+    tol : np.float32
+        Singular values below 4*max(s)*tol are cut off; the default of 1e-4
+        works well in practice.
+    """
+    u, s, vt = np.linalg.svd(a, full_matrices=False)
+
+    max_s = max(s)
+
+    for i in range(len(s)):
+        s[i] = 1./s[i] if s[i] > 4*max_s*tol else 0.
+
+    v = vt.T
+    ut = u.T
+
+    return v @ np.diag(s) @ ut
\ No newline at end of file
diff --git a/fmm/parameters.py b/fmm/parameters.py
deleted file mode 100644
index 79769e2..0000000
--- a/fmm/parameters.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""
-Global parameters
-"""
-
-DIGEST_SIZE = 5
diff --git a/fmm/surface.py b/fmm/surface.py
index c8d4639..f4ec092 100644
--- a/fmm/surface.py
+++ b/fmm/surface.py
@@ -1,7 +1,6 @@
 """
 Operator helper methods.
 """
-
 import numba
 import numpy as np
 
@@ -86,7 +85,7 @@
     scaled_radius = (0.5)**level * radius
     dilated_radius = alpha*scaled_radius
 
-    scaled_surf = np.copy(surf)
+    scaled_surf = np.zeros_like(surf, np.float32)
 
     for i in range(n_coeffs):
         scaled_surf[i] = surf[i]*dilated_radius + center
diff --git a/fmm/test/test_fmm.py b/fmm/test/test_fmm.py
index 6cc3ed2..b51d705 100644
--- a/fmm/test/test_fmm.py
+++ b/fmm/test/test_fmm.py
@@ -13,9 +13,6 @@
 from fmm.kernel import KERNELS
 
 
-RTOL = 1e-1
-
-
 def test_upward_pass():
     """
-    Test that multipole expansion of root node is the same as a direct
+    Test that the multipole expansion of each leaf node is the same as a direct
@@ -25,36 +22,46 @@
     experiment = Fmm('test_config')
     experiment.upward_pass()
 
-    upward_equivalent_surface = surface.scale_surface(
-        surf=experiment.equivalent_surface,
-        radius=experiment.r0,
-        level=np.int32(0),
-        center=experiment.x0,
-        alpha=experiment.alpha_inner
-    )
+    for key in experiment.complete[experiment.complete_levels == experiment.depth]:
 
-    distant_point = np.array([[1e4, 0, 0]])
+        center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0)
+        radius = experiment.r0 / (1 << experiment.depth)
 
-    kernel = experiment.config['kernel']
-    p2p = KERNELS[kernel]['p2p']
+        upward_equivalent_surface = surface.scale_surface(
+            surf=experiment.equivalent_surface,
+            radius=experiment.r0,
+            level=experiment.depth,
+            center=center,
+            alpha=experiment.alpha_inner
+        )
 
-    direct = p2p(
-        sources=experiment.sources,
-        targets=distant_point,
-        source_densities=experiment.source_densities
-    )
+        distant_point = (center + radius*3).reshape(1, 3)
 
-    idx = experiment.key_to_index[0]
-    lidx = idx*experiment.nequivalent_points
-    ridx = (idx+1)*experiment.nequivalent_points
+        kernel = experiment.config['kernel']
+        p2p = KERNELS[kernel]['p2p']
 
-    equivalent = p2p(
-        sources=upward_equivalent_surface,
-        targets=distant_point,
-        source_densities=experiment.multipole_expansions[lidx:ridx]
-    )
+        leaf_idx = experiment.key_to_leaf_index[key]
+        global_idx = experiment.key_to_index[key]
 
-    assert np.allclose(direct, equivalent, rtol=RTOL)
+        lidx = global_idx*experiment.nequivalent_points
+        ridx = lidx+experiment.nequivalent_points
+
+        source_coordinates = experiment.sources[experiment.source_index_pointer[leaf_idx]:experiment.source_index_pointer[leaf_idx+1]]
+        source_densities = experiment.source_densities[experiment.source_index_pointer[leaf_idx]:experiment.source_index_pointer[leaf_idx+1]]
+
+        direct = p2p(
+            sources=source_coordinates,
+            targets=distant_point,
+            source_densities=source_densities
+        )
+
+        equivalent = p2p(
+            sources=upward_equivalent_surface,
+            targets=distant_point,
+            source_densities=experiment.multipole_expansions[lidx:ridx]
+        )
+
+        assert np.allclose(direct, equivalent,
atol=1e-2, rtol=0) def test_m2l(): @@ -70,62 +77,70 @@ def test_m2l(): p2p = experiment.p2p_function level_2_idxs = experiment.complete_levels == 2 - key = experiment.complete[level_2_idxs][0] - idx = experiment.key_to_index[key] - target_lidx = idx*experiment.nequivalent_points - target_ridx = target_lidx+experiment.nequivalent_points - v_list = experiment.v_lists[idx] - v_list = v_list[v_list != -1] - - target_coordinates = experiment.targets[experiment.target_index_pointer[idx]:experiment.target_index_pointer[idx+1]] - - level = np.int32(morton.find_level(key)) - center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0).astype(np.float32) - - local_expansion = experiment.local_expansions[target_lidx:target_ridx] - - downward_equivalent_surface = surface.scale_surface( - surf=experiment.equivalent_surface, - radius=experiment.r0, - level=level, - center=center, - alpha=experiment.alpha_outer - ) - equivalent = p2p( - sources=downward_equivalent_surface, - targets=target_coordinates, - source_densities=local_expansion - ) + for i, key in enumerate(experiment.complete[level_2_idxs]): + idx = experiment.key_to_index[key] + target_lidx = idx*experiment.nequivalent_points + target_ridx = target_lidx+experiment.nequivalent_points + v_list = experiment.v_lists[idx] + v_list = v_list[v_list != -1] - direct = np.zeros_like(equivalent) - for source in v_list: + level = np.int32(morton.find_level(key)) + center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0).astype(np.float32) - source_level = np.int32(morton.find_level(source)) - source_center = morton.find_physical_center_from_key(source, experiment.x0, experiment.r0).astype(np.float32) + local_expansion = experiment.local_expansions[target_lidx:target_ridx] - upward_equivalent_surface = surface.scale_surface( + downward_check_surface = surface.scale_surface( surf=experiment.equivalent_surface, radius=experiment.r0, - level=source_level, - center=source_center, + level=level, + center=center, alpha=experiment.alpha_inner ) - source_idx = experiment.key_to_index[source] - source_lidx = source_idx*experiment.nequivalent_points - source_ridx = source_lidx+experiment.nequivalent_points + downward_equivalent_surface = surface.scale_surface( + surf=experiment.equivalent_surface, + radius=experiment.r0, + level=level, + center=center, + alpha=experiment.alpha_outer + ) - tmp = p2p( - sources=upward_equivalent_surface, - targets=target_coordinates, - source_densities=experiment.multipole_expansions[source_lidx:source_ridx] + equivalent = p2p( + sources=downward_equivalent_surface, + targets=downward_check_surface, + source_densities=local_expansion ) - direct += tmp + direct = np.zeros_like(equivalent) + + for source in v_list: + + source_level = np.int32(morton.find_level(source)) + source_center = morton.find_physical_center_from_key(source, experiment.x0, experiment.r0).astype(np.float32) + + upward_equivalent_surface = surface.scale_surface( + surf=experiment.equivalent_surface, + radius=experiment.r0, + level=source_level, + center=source_center, + alpha=experiment.alpha_inner + ) + + source_idx = experiment.key_to_index[source] + source_lidx = source_idx*experiment.nequivalent_points + source_ridx = source_lidx+experiment.nequivalent_points + + tmp = p2p( + sources=upward_equivalent_surface, + targets=downward_check_surface, + source_densities=experiment.multipole_expansions[source_lidx:source_ridx] + ) - assert np.allclose(direct, equivalent, rtol=RTOL) + direct += tmp + + assert np.allclose(direct, 
equivalent, rtol=0.01, atol=0)
 
 
 def test_fmm():
@@ -139,28 +154,7 @@
     kernel = experiment.config['kernel']
     p2p = KERNELS[kernel]['p2p']
 
-    for leaf in experiment.leaves:
-
-
-        leaf_idx = experiment.key_to_leaf_index[leaf]
-
-        res = experiment.target_potentials[
-            experiment.target_index_pointer[leaf_idx]:experiment.target_index_pointer[leaf_idx+1]
-        ]
-
-        targets = experiment.targets[
-            experiment.target_index_pointer[leaf_idx]:experiment.target_index_pointer[leaf_idx+1]
-        ]
-
-        direct = p2p(
-            targets=targets,
-            sources=experiment.sources,
-            source_densities=experiment.source_densities
-        )
-
-        diff = res-direct
-
-        err = 100*abs(diff)/direct
-        mean_err = np.mean(err)
+    direct = p2p(experiment.sources, experiment.targets, experiment.source_densities)
+    equivalent = experiment.target_potentials
 
-    assert mean_err < 4
+    assert np.allclose(direct, equivalent, rtol=1e-2, atol=0)
diff --git a/scripts/precompute_operators.py b/scripts/precompute_operators.py
index 66cf646..ad47639 100644
--- a/scripts/precompute_operators.py
+++ b/scripts/precompute_operators.py
@@ -6,32 +6,29 @@
 import sys
 import time
 
-import cupy as cp
 import h5py
 import numba
 import numpy as np
-import scipy.linalg as linalg
+from sklearn.utils.extmath import randomized_svd
 
 import adaptoctree.morton as morton
 import adaptoctree.tree as tree
 
-from fmm.kernel import KERNELS, BLOCK_WIDTH, BLOCK_HEIGHT
+from fmm.kernel import KERNELS
+import fmm.linalg as linalg
 import fmm.surface as surface
 
 import utils.data as data
 import utils.time
 
-
 HERE = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
 PARENT = HERE.parent
 WORKING_DIR = pathlib.Path(os.getcwd())
 
-TPB = BLOCK_HEIGHT
-
 
 def compress_m2l_gram_matrix(
     level, x0, r0, depth, alpha_inner, check_surface,
-    equivalent_surface, k, implicit_gram_matrix
+    equivalent_surface, k
 ):
     """
     Compute compressed representation of unique Gram matrices for targets and
@@ -78,11 +75,9 @@
     n_targets_per_node = len(check_surface)
     n_sources_per_node = len(equivalent_surface)
 
-    n_targets = len(targets)
     n_sources = len(sources)
 
-    dsources = cp.zeros((n_sources*n_sources_per_node, 3)).astype(np.float32)
-    dtargets = cp.zeros((n_targets*n_targets_per_node, 3)).astype(np.float32)
+    se2tc = np.zeros((n_targets_per_node, n_sources*n_sources_per_node), np.float32)
 
     for idx in range(len(targets)):
 
@@ -101,13 +96,10 @@
             r0=r0
         )
 
-        lidx_targets = (idx)*n_targets_per_node
-        ridx_targets = (idx+1)*n_targets_per_node
-
         lidx_sources = (idx)*n_sources_per_node
-        ridx_sources = (idx+1)*n_sources_per_node
+        ridx_sources = lidx_sources+n_sources_per_node
 
-        target_coordinates = surface.scale_surface(
+        target_check_surface = surface.scale_surface(
             surf=check_surface,
             radius=np.float32(r0),
             level=np.int32(level),
             alpha=np.float32(alpha_inner)
         )
 
-        source_coordinates = surface.scale_surface(
+        source_equivalent_surface = surface.scale_surface(
             surf=equivalent_surface,
             radius=np.float32(r0),
             level=np.int32(level),
             alpha=np.float32(alpha_inner)
         )
 
-        # Compress the Gram matrix between these sources/targets
-        dsources[lidx_sources:ridx_sources, :] = cp.asarray(source_coordinates)
-        dtargets[lidx_targets:ridx_targets, :] = cp.asarray(target_coordinates)
-
-
-    # Height and width of Gram matrix
-    height = n_targets_per_node*n_targets
-    sub_height = n_targets_per_node
+        tmp = KERNELS[config['kernel']]['dense_gram'](
+            sources=source_equivalent_surface,
targets=target_check_surface + ) - width = n_sources_per_node - sub_width = n_sources_per_node + se2tc[:, lidx_sources:ridx_sources] = tmp - bpg = (int(np.ceil(width/BLOCK_WIDTH)), int(np.ceil(height/BLOCK_HEIGHT))) - - # Result data, in the language of RSVD - dY = cp.zeros((height, k)).astype(np.float32) - - # Random matrix, for compressed basis - dOmega = cp.random.rand(n_sources_per_node, k).astype(np.float32) - dOmegaT = dOmega.T - - # Perform implicit matrix matrix product between Gram matrix and random - # matrix Omega - for idx in range(k): - implicit_gram_matrix[bpg, TPB]( - dsources, dtargets, dOmegaT, dY, height, width, sub_height, sub_width, idx - ) + u, s, vt = randomized_svd(se2tc, k) - # Perform QR decomposition on the GPU - dQ, _ = cp.linalg.qr(dY) - dQT = dQ.T - - # Perform transposed matrix-matrix multiplication implicitly - height = n_sources_per_node - sub_height = n_sources_per_node - width = n_targets_per_node*n_targets - sub_width = n_targets_per_node - - dBT = cp.zeros((n_sources_per_node, k)).astype(np.float32) - - # Blocking is transposed - bpg = (int(np.ceil(width/BLOCK_WIDTH)), int(np.ceil(height/BLOCK_HEIGHT))) - - for idx in range(k): - implicit_gram_matrix[bpg, TPB]( - dtargets, dsources, dQT, dBT, height, width, sub_height, sub_width, idx - ) - - # Perform SVD on reduced matrix - du, dS, dVT = cp.linalg.svd(dBT.T, full_matrices=False) - dU = cp.matmul(dQ, du) - - # Return compressed SVD components - return dU.get(), dS.get(), dVT.get(), hashes + return u.astype(np.float32), s.astype(np.float32), vt.astype(np.float32), hashes def compute_surfaces(config, db): @@ -279,7 +226,6 @@ def compute_octree(config, db): start_level = 1 sources = db['particle_data']['sources'][...] - source_densities = db['particle_data']['source_densities'][...] targets = db['particle_data']['targets'][...] 
 points = np.vstack((sources, targets))
@@ -387,6 +333,7 @@
 
     print("Computing Inverse of Check To Equivalent Gram Matrix")
 
+    equivalent_surface = surface.compute_surface(config['order_equivalent'])
     upward_equivalent_surface = surface.scale_surface(
         surf=equivalent_surface,
         radius=np.float32(r0),
@@ -395,6 +342,7 @@
         alpha=np.float32(config['alpha_inner'])
     )
 
+    check_surface = surface.compute_surface(config['order_check'])
     upward_check_surface = surface.scale_surface(
         surf=check_surface,
         radius=np.float32(r0),
@@ -403,6 +351,7 @@
         alpha=np.float32(config['alpha_outer'])
     )
 
+    equivalent_surface = surface.compute_surface(config['order_equivalent'])
     downward_equivalent_surface = surface.scale_surface(
         surf=equivalent_surface,
         radius=np.float32(r0),
@@ -411,6 +360,7 @@
         alpha=np.float32(config['alpha_outer'])
     )
 
+    check_surface = surface.compute_surface(config['order_check'])
     downward_check_surface = surface.scale_surface(
         surf=check_surface,
         radius=np.float32(r0),
@@ -429,14 +379,17 @@
         sources=downward_equivalent_surface,
     )
 
-    uc2e_inv = linalg.pinv2(uc2e, cond=config['cond'])
-    dc2e_inv = linalg.pinv2(dc2e, cond=config['cond'])
+    uc2e_inv = linalg.pinv(uc2e, config['tol'])
+    dc2e_inv = linalg.pinv(dc2e, config['tol'])
 
     if 'uc2e_inv' in db.keys() and 'dc2e_inv' in db.keys():
-
+        del db['uc2e']
+        del db['dc2e']
         del db['uc2e_inv']
         del db['dc2e_inv']
 
+    db['uc2e'] = uc2e
+    db['dc2e'] = dc2e
     db['uc2e_inv']= uc2e_inv
     db['dc2e_inv']= dc2e_inv
 
@@ -552,7 +505,7 @@ def compute_m2m_and_l2l(
 
 
 def compute_m2l(
-    config, db, implicit_gram_matrix, k, equivalent_surface, check_surface,
+    config, db, k, equivalent_surface, check_surface,
     x0, r0, depth
 ):
     """
@@ -602,7 +555,7 @@
 
         u, s, vt, hashes = compress_m2l_gram_matrix(
             level, x0, r0, depth, config['alpha_inner'], check_surface,
-            equivalent_surface, k, implicit_gram_matrix
+            equivalent_surface, k
         )
 
         db['m2l'][str_level]['u'] = u
@@ -629,7 +582,6 @@ def main(**config):
 
     kernel = config['kernel']
     k = config['target_rank']
 
-    implicit_gram_matrix = KERNELS[kernel]['implicit_gram_blocked']
     dense_gram_matrix = KERNELS[kernel]['dense_gram']
     kernel_scale = KERNELS[kernel]['scale']
 
@@ -651,7 +604,7 @@
 
     # Step 4: Compute M2L operators for each level, and their transfer vectors
     compute_m2l(
-        config, db, implicit_gram_matrix, k, equivalent_surface, check_surface,
+        config, db, k, equivalent_surface, check_surface,
        x0, r0, depth
     )
 
diff --git a/scripts/test/test_precompute_operators.py b/scripts/test/test_precompute_operators.py
index aec8c40..67372cc 100644
--- a/scripts/test/test_precompute_operators.py
+++ b/scripts/test/test_precompute_operators.py
@@ -9,7 +9,6 @@
 import pytest
 
 import adaptoctree.morton as morton
-import adaptoctree.utils as utils
 
 from fmm.kernel import KERNELS
 import fmm.surface as surface
@@ -21,17 +20,20 @@
 CONFIG_FILEPATH = HERE.parent.parent / "test_config.json"
 CONFIG = data.load_json(CONFIG_FILEPATH)
 
-RTOL = 1e-1
-
 
 @pytest.fixture
 def db():
     experiment = CONFIG["experiment"]
     return h5py.File(ROOT / f"{experiment}.hdf5", "r")
 
-
 def test_m2m(db):
-
+    """
+    Test the convergence of the multipole expansion of the root node. If this
+    converges as expected, then the M2M translation operators, as well as the
+    P2M step, work.
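+    The parent's translated expansion and the child's expansion are each
+    evaluated directly at a point outside both boxes and compared.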
+ """ parent_key = 0 child_key = morton.find_children(parent_key)[0] @@ -58,7 +58,7 @@ def test_m2m(db): db["m2m"][operator_idx], child_equivalent_density ) - distant_point = np.array([[1e2, 0, 0]]) + distant_point = parent_center+r0*CONFIG['alpha_outer']*1.1 child_equivalent_surface = surface.scale_surface( surf=equivalent_surface, @@ -88,11 +88,14 @@ def test_m2m(db): source_densities=child_equivalent_density, ) - assert np.isclose(parent_direct, child_direct, rtol=RTOL) + assert np.isclose(parent_direct, child_direct, atol=1e-3, rtol=0) def test_l2l(db): - + """ + Test the convergence of the local expansion after L2L has been performed + on a given parent/child pair. + """ parent_key = 1 child_key = morton.find_children(parent_key)[-1] @@ -144,15 +147,20 @@ def test_l2l(db): source_densities=child_equivalent_density, ) - assert np.isclose(parent_direct, child_direct, rtol=RTOL) + assert np.isclose(parent_direct, child_direct, atol=1e-4, rtol=0) def test_m2l(db): - + """ + Test convergence of local expansion after M2L translation has been applied, + as well as SVD compression by computing result from all multipole + expansions of source nodes for a given target node in its V list, and + comparing with local expansion result computed with compressed M2L + translation operator. + """ x0 = db["octree"]["x0"][...] r0 = db["octree"]["r0"][...][0] dc2e_inv = db['dc2e_inv'][...] - depth = db['octree']['depth'][0] equivalent_surface = db["surface"]["equivalent"][...] npoints_equivalent = len(equivalent_surface) check_surface = db['surface']['check'][...] @@ -195,12 +203,11 @@ def test_m2l(db): ) lidx = idx*npoints_equivalent - ridx = (idx+1)*npoints_equivalent + ridx = lidx+npoints_equivalent sources[lidx:ridx] = source_equivalent_surface - # place unit densities on source boxes - source_equivalent_density = np.ones(len(v_list)*npoints_equivalent) + # place densities on source boxes source_equivalent_density = np.random.rand(len(v_list)*npoints_equivalent) u = db['m2l'][str(target_level)]['u'] @@ -216,13 +223,12 @@ def test_m2l(db): transfer_vec = morton.find_transfer_vector(target_key, source_key) m2l_idx = np.where(transfer_vec == hashes)[0][0] m2l_lidx = (m2l_idx)*npoints_check - m2l_ridx = (m2l_idx+1)*npoints_check - u_sub = u[m2l_lidx:m2l_ridx] - m2l_matrix = (scale*dc2e_inv) @ (u_sub @ np.diag(s) @ vt) + m2l_ridx = m2l_lidx+npoints_check + vt_sub = vt[:, m2l_lidx:m2l_ridx] lidx = i*npoints_equivalent - ridx = (i+1)*npoints_equivalent - target_equivalent_density_sub = m2l_matrix @ source_equivalent_density[lidx:ridx] + ridx = lidx+npoints_equivalent + target_equivalent_density_sub = (scale*dc2e_inv) @ (u @ np.diag(s) @ vt_sub) @ source_equivalent_density[lidx:ridx] target_equivalent_density += target_equivalent_density_sub targets = surface.scale_surface( @@ -247,4 +253,4 @@ def test_m2l(db): source_densities=source_equivalent_density ) - assert np.isclose(target_direct, source_direct, rtol=RTOL) + assert np.isclose(target_direct, source_direct, atol=1e-1, rtol=0) diff --git a/setup.py b/setup.py index 9758aff..f3a3d03 100644 --- a/setup.py +++ b/setup.py @@ -11,10 +11,10 @@ exec(f.read(), ABOUT) requirements = [ - "adaptoctree==1.2.0", + "adaptoctree==1.2.1", "scipy==1.6.1", "h5py==2.10.0", - "cupy==8.3.0", + "scikit-learn==0.24.2", "click==7.1.2", ] diff --git a/test.hdf5 b/test.hdf5 index 46ee238..dd0d479 100644 Binary files a/test.hdf5 and b/test.hdf5 differ diff --git a/test_config.json b/test_config.json index 8bc49cb..4c16403 100644 --- a/test_config.json +++ b/test_config.json @@ 
 "experiment": "test",
     "npoints": 10000,
     "data_type": "random",
-    "order_equivalent": 2,
-    "order_check": 3,
+    "order_equivalent": 5,
+    "order_check": 5,
     "kernel": "laplace",
     "backend": "numba",
     "alpha_inner": 1.05,
-    "alpha_outer": 2.95,
+    "alpha_outer": 2.9,
     "max_level": 10,
-    "max_points": 100,
+    "max_points": 150,
     "target_rank": 10,
-    "cond": 1e-1
+    "tol": 1e-4
 }
diff --git a/utils/time.py b/utils/time.py
index d9f919d..a67228f 100644
--- a/utils/time.py
+++ b/utils/time.py
@@ -24,11 +24,9 @@ def __exit__(self, *args):
 
 def seconds_to_minutes(seconds):
     """
     Convert seconds to minutes and seconds.
-
     Parameters:
     -----------
     seconds : float
-
     Returns:
     --------
     (float, float)
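For readers following the M2L changes above, the randomised-SVD compression that replaces the CUDA path condenses to a few lines of NumPy. The following is a minimal illustrative sketch, not the library's API: the sizes, and the random stand-ins for the Gram matrix block `se2tc`, `dc2e_inv`, and the multipole expansion are assumptions for demonstration only; the factorisation and the order of operations mirror `compress_m2l_gram_matrix` and `m2l_core`.

```python
import numpy as np
from sklearn.utils.extmath import randomized_svd

# Illustrative sizes; in the shipped configs order_equivalent == order_check,
# so the check and equivalent surfaces have the same number of points.
ncheck, nequiv, ntransfer, k = 26, 26, 316, 10

# Stand-in for the row-block of source-equivalent-to-target-check Gram
# matrices assembled per level in compress_m2l_gram_matrix.
se2tc = np.random.rand(ncheck, ntransfer * nequiv).astype(np.float32)

# A single randomised SVD compresses the M2L operators for every transfer
# vector at this level to rank k; u, s and vt are what get stored in db['m2l'].
u, s, vt = randomized_svd(se2tc, n_components=k)

# Stand-ins for the inverse check-to-equivalent operator, one multipole
# expansion, and the Laplace kernel scale at level 2.
dc2e_inv = np.random.rand(nequiv, ncheck).astype(np.float32)
multipole = np.random.rand(nequiv).astype(np.float32)
scale = np.float32(1 / 2**2)

# Applying the compressed operator for one transfer vector, as in m2l_core:
# only the matching column block of vt is needed per source node.
v_idx = 42
vt_sub = vt[:, v_idx * nequiv:(v_idx + 1) * nequiv]
local = (scale * dc2e_inv) @ (u @ (np.diag(s) @ (vt_sub @ multipole)))
```

Storing only `u`, `s`, and the column blocks of `vt` is what lets one factorisation per level serve every transfer vector at that level, rather than one GPU-built operator per interaction.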