diff --git a/README.md b/README.md
index c890df2..ebf6f70 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ PyExaFMM
 [![Anaconda-Server Badge](https://img.shields.io/conda/v/skailasa/pyexafmm.svg)](https://anaconda.org/skailasa/pyexafmm) [![Anaconda-Server Badge](https://anaconda.org/skailasa/pyexafmm/badges/latest_release_date.svg)](https://anaconda.org/skailasa/pyexafmm) [![Anaconda-Server Badge](https://anaconda.org/skailasa/pyexafmm/badges/platforms.svg)](https://anaconda.org/skailasa/pyexafmm)
 
-PyExaFMM is an adaptive particle kernel-independent FMM based on [1], written in pure Python with some extensions. Representing a compromise between portability, ease of use, and performance. Optimisations are currently implemented using Numba, Numpy, and CUDA acceleration.
+PyExaFMM is an adaptive particle kernel-independent FMM based on [1], written in pure Python with some extensions, representing a compromise between portability, ease of use, and performance. Optimisations are currently implemented using Numba.
 
-The goal of PyExaFMM is to develop a highly performant implementation of the adaptive particle FMM written in Python, as the utility of FMM algorithms are hindered by their relatively complex implementation, especially for achieving high-performance.
+The goal of PyExaFMM is to develop a highly performant implementation of the adaptive particle FMM written in Python, as the utility of FMM algorithms is hindered by their relatively complex implementation, especially when high performance is required.
 
@@ -14,13 +14,8 @@ PyExaFMM compares favourably on realistic problem sizes with naive direct evaluation
 
 Laplace
 
 Here we use an order 5 multipole expansion, an order 6 local expansion, and constrain leaf nodes to contain a maximum of 100 particles. The target rank in our M2L operator compression is kept at 1. The benchmark was run on an Intel i7-9750H processor.
 
-## System Requirements
-
-An NVidia GPU is required, as PyExaFMM is accellerated with CUDA.
-
 ## Install
 
 Download from Anaconda cloud into a conda/mini-conda environment:
 
@@ -60,19 +55,19 @@ This is done via a `config.json` file, PyExaFMM will look for this in your **current working directory**,
 
 ```json
 {
-    "experiment": "fmm",
-    "npoints": 100000,
+    "experiment": "test",
+    "npoints": 10000,
     "data_type": "random",
     "order_equivalent": 5,
-    "order_check": 6,
+    "order_check": 5,
     "kernel": "laplace",
     "backend": "numba",
     "alpha_inner": 1.05,
-    "alpha_outer": 2.95,
+    "alpha_outer": 2.9,
     "max_level": 10,
-    "max_points": 100,
-    "target_rank": 1,
-    "cond": 1e-16
+    "max_points": 150,
+    "target_rank": 10,
+    "tol": 1e-4
 }
 ```
 
@@ -89,7 +84,7 @@ This is done via a `config.json` file, PyExaFMM will look for this in your **current working directory**,
 | `alpha_outer` | Relative size of outer surface's radius. |
 | `max_level` | Depth of octree to use in simulations. |
 | `target_rank` | Target rank in low-rank compression of M2L matrix. |
-| `cond` | Threshold under which to ignore singular values in randomised SVD. |
+| `tol` | Threshold below which singular values are ignored in the SVD used to invert the check-to-equivalent Gram matrices. |
 
-PyExaFMM provides some simple test-data generation functions, which can be configured for. However, to use your own data, simply create a HDF5 file, with the same name as `experiment` in your configuration file, with the following group hierarchy,
+PyExaFMM provides some simple test-data generation functions, which can be configured from `config.json`. However, to use your own data, simply create an HDF5 file with the same name as `experiment` in your configuration file, with the following group hierarchy,
 
diff --git a/fmm/backend/api.py b/fmm/backend/api.py
index d50549f..c056d02 100644
--- a/fmm/backend/api.py
+++ b/fmm/backend/api.py
@@ -2,7 +2,7 @@
 Interface for compute backends.
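+The active backend is selected via the "backend" field of config.json; only
+the Numba backend is provided after this change.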
""" import fmm.backend.numba as numba_backend -import fmm.backend.openmp as openmp_backend BACKEND = { "numba": { @@ -15,16 +14,5 @@ "m2t": numba_backend.m2t, "near_field_u_list": numba_backend.near_field_u_list, "near_field_node": numba_backend.near_field_node, - }, - "openmp": { - "p2m": openmp_backend.p2m, - "m2m": openmp_backend.m2m, - "m2l": openmp_backend.m2l, - "l2l": openmp_backend.l2l, - "s2l": openmp_backend.s2l, - "l2t": openmp_backend.l2t, - "m2t": openmp_backend.m2t, - "near_field_u_list": openmp_backend.near_field, - "near_field_node": openmp_backend.near_field, } } diff --git a/fmm/backend/numba.py b/fmm/backend/numba.py index f026884..c576127 100644 --- a/fmm/backend/numba.py +++ b/fmm/backend/numba.py @@ -319,7 +319,6 @@ def m2l_core( scale : np.float32 Precomputed kernel scale for this level. """ - nv_list = len(v_list) # Index of local expansion @@ -328,16 +327,16 @@ def m2l_core( for i in range(nv_list): source = v_list[i] - midx = key_to_index[source]*nequivalent_points transfer_vector = morton.find_transfer_vector(target, source) - u_idx = hash_to_index[transfer_vector] - u_lidx = u_idx*ncheck_points - - u_sub = u[u_lidx:u_lidx+ncheck_points] + v_idx = hash_to_index[transfer_vector] + v_lidx = v_idx*ncheck_points + v_ridx = v_lidx+ncheck_points + vt_sub = np.copy(vt[:, v_lidx:v_ridx]) - multipole_expansion = multipole_expansions[midx:midx+nequivalent_points] - local_expansions[lidx:lidx+nequivalent_points] += scale*(dc2e_inv @ (u_sub @ (s @ ( vt @ multipole_expansion)))) + m_lidx = key_to_index[source]*nequivalent_points + m_ridx = m_lidx+nequivalent_points + local_expansions[lidx:lidx+nequivalent_points] += (scale*dc2e_inv) @ (u @ (s @ (vt_sub @ multipole_expansions[m_lidx:m_ridx]))) @numba.njit(cache=True, parallel=True) @@ -367,7 +366,7 @@ def m2l( u : np.array(np.float32) Compressed left singular vectors of SVD of M2L Gram matrix for nodes at this level. s : np.array(np.float32) - Compressed singular values of SVD of M2L Gram matrix for nodes at this level. + Compressed singular values of SVD of M2L Gram matrix for nodes at this level.` vt : np.array(np.float32) Compressed right singular vectors of SVD of M2L Gram matrix for nodes at this level. dc2e_inv : np.array(shape=(n_equivalent, n_check), dtype=np.float64) @@ -443,13 +442,13 @@ def l2l( parent = morton.find_parent(key) parent_idx = key_to_index[parent] parent_lidx = parent_idx*nequivalent_points - parent_ridx = (parent_idx+1)*nequivalent_points + parent_ridx = parent_lidx+nequivalent_points parent_equivalent_density = local_expansions[parent_lidx:parent_ridx] # Compute expansion index child_idx = key_to_index[key] child_lidx = child_idx*nequivalent_points - child_ridx = (child_idx+1)*nequivalent_points + child_ridx = child_lidx+nequivalent_points # Compute operator index operator_idx = key == morton.find_siblings(key) @@ -515,9 +514,10 @@ def s2l( p2p_function : function Function handle for kernel P2P. 
""" - level = np.int32(morton.find_level(key)) - scale = np.int32(scale_function(level)) - center = morton.find_physical_center_from_key(key, x0, r0).astype(np.float32) + + level = morton.find_level(key) + scale = scale_function(level) + center = morton.find_physical_center_from_key(key, x0, r0) key_idx = key_to_index[key] key_lidx = key_idx*nequivalent_points @@ -601,10 +601,10 @@ def m2t( source_idx = key_to_index[source] source_lidx = source_idx*nequivalent_points - source_ridx = (source_idx+1)*nequivalent_points + source_ridx = source_lidx+nequivalent_points - source_level = np.int32(morton.find_level(source)) - source_center = morton.find_physical_center_from_key(source, x0, r0).astype(np.float32) + source_level = morton.find_level(source) + source_center = morton.find_physical_center_from_key(source, x0, r0) upward_equivalent_surface = surface.scale_surface( surf=equivalent_surface, @@ -673,8 +673,8 @@ def l2t( source_lidx = (source_idx)*nequivalent_points source_ridx = source_lidx+nequivalent_points - level = np.int32(morton.find_level(key)) - center = morton.find_physical_center_from_key(key, x0, r0).astype(np.float32) + level = morton.find_level(key) + center = morton.find_physical_center_from_key(key, x0, r0) downward_equivalent_surface = surface.scale_surface( equivalent_surface, @@ -878,6 +878,8 @@ def near_field_node( p2p_function ): """ + Evaluate all near field particles for source particles within a given + target node Parameters: ----------- diff --git a/fmm/backend/openmp.py b/fmm/backend/openmp.py deleted file mode 100644 index 81217aa..0000000 --- a/fmm/backend/openmp.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Compute operators accelerated with Numba and OpenMP. -""" -import numpy as np -import numba - -from adaptoctree import morton - -import fmm.surface as surface - - - -def p2m(): - pass - -def m2m(): - pass - -def l2l(): - pass - -def m2l(): - pass - -def s2l(): - pass - -def l2t(): - pass - -def m2t(): - pass - -def near_field(): - pass \ No newline at end of file diff --git a/fmm/fmm.py b/fmm/fmm.py index c373f18..37cb8b1 100644 --- a/fmm/fmm.py +++ b/fmm/fmm.py @@ -180,7 +180,7 @@ def downward_pass(self): # Keys at this level keys = self.complete[self.complete_levels == level] - scale = self.scale_function(level) + scale = np.float32(self.scale_function(level)) # V List interactions # M2L operator stored in terms of its SVD components for each level @@ -218,8 +218,6 @@ def downward_pass(self): for key in keys: - idx = self.key_to_index[key] - # Translate local expansion from the node's parent self.backend['l2l']( key=key, diff --git a/fmm/kernel.py b/fmm/kernel.py index 69295bd..08dce50 100644 --- a/fmm/kernel.py +++ b/fmm/kernel.py @@ -1,15 +1,9 @@ """ Kernels accelerated with CUDA and Numba. """ -import math - import numba -from numba import cuda import numpy as np -# GPU Kernel parameters -BLOCK_WIDTH = 32 -BLOCK_HEIGHT = 1024 M_INV_4PI = 1.0 / (4*np.pi) TOL = 1e-6 @@ -19,7 +13,10 @@ ############################################################################### -@numba.njit(cache=True) +@numba.njit( + [numba.float32(numba.int32)], + cache=True +) def laplace_scale(level): """ Level scale for the Laplace kernel. @@ -32,7 +29,7 @@ def laplace_scale(level): -------- np.float32 """ - return np.float32(1/(2**level)) + return numba.float32(1/(2**level)) @numba.njit(cache=True) @@ -57,242 +54,6 @@ def laplace_cpu(x, y): return res -@cuda.jit(device=True) -def laplace_cuda(ax, ay, az, bx, by, bz): - """ - Numba-Cuda Laplace device kernel. 
- - Parameters: - ----------- - ax : np.float32 - 'x' coordinate of a point. - ay : np.float32 - 'y' coordinate of a point. - az : np.float32 - 'z' coordinate of a point. - bx : np.float32 - 'x' coordinate of a point. - by : np.float32 - 'y' coordinate of a point. - bz : np.float32 - 'z' coordinate of a point. - - Returns: - -------- - nb.cuda.float32 - """ - rx = ax-bx - ry = ay-by - rz = az-bz - - dist = rx**2+ry**2+rz**2 - dist_sqr = math.sqrt(dist) - inv_dist_sqr = 1./(4*math.pi*dist_sqr) - - if math.isinf(inv_dist_sqr): - return 0. - - return numba.float32(inv_dist_sqr) - - -@cuda.jit -def laplace_implicit_gram_matrix( - sources, - targets, - rhs, - result, - height, - width, - idx - ): - """ - Implicitly apply the Gram matrix to a vector, computing Green's function on - the fly on the device, and multiplying by random matrices to approximate - the basis. Multiple RHS are computed in a loop, specified by the index. - - Parameters: - ----------- - sources : np.array(shape=(nsources, 3)) - nsources rows of gram matrix - targets : np.array(shape=(ntargets, 3)) - ntargets columns of gram matrix - rhs : np.array(shape=(ntargets, nrhs)) - Multiple right hand sides, indexed by 'idx' - result : np.array(shape=(height, nrhs)) - Dimensions of result matrix defined by matrix height - and number of right hand sides - height : int - 'height' of implicit gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - width : int - 'width' of implicit gram matrix. - idx: int - RHS index. - """ - - blockWidth = cuda.shared.array(1, numba.int32) - blockxInd = cuda.shared.array(1, numba.int32) - blockyInd = cuda.shared.array(1, numba.int32) - - # Calculate once, and share amongs thread-block - if cuda.threadIdx.x == 0: - if (cuda.blockIdx.x + 1)*BLOCK_WIDTH <= width: - blockWidth[0] = numba.int32(BLOCK_WIDTH) - else: - blockWidth[0] = numba.int32(width % BLOCK_WIDTH) - - blockxInd[0] = numba.int32(cuda.blockIdx.x*BLOCK_WIDTH) - blockyInd[0] = numba.int32(cuda.blockIdx.y*BLOCK_HEIGHT) - - cuda.syncthreads() - - # Extract tile of RHS, of size up to BLOCK_WIDTH - tmp = cuda.shared.array(BLOCK_WIDTH, numba.float32) - - if cuda.threadIdx.x < blockWidth[0]: - tmp[cuda.threadIdx.x] = rhs[idx][blockxInd[0]+cuda.threadIdx.x] - - cuda.syncthreads() - - # Accumulate matvec of tile into variable - row_sum = 0 - - threadyInd = blockyInd[0]+cuda.threadIdx.x - - if threadyInd < height: - for i in range(blockWidth[0]): - col_idx = blockxInd[0] + i - row_idx = threadyInd - source = sources[row_idx] - target = targets[col_idx] - - sx = source[0] - sy = source[1] - sz = source[2] - - tx = target[0] - ty = target[1] - tz = target[2] - - row_sum += laplace_cuda(sx, sy, sz, tx, ty, tz)*tmp[i] - - cuda.atomic.add(result, (threadyInd, idx), row_sum) - - -@cuda.jit -def laplace_implicit_gram_matrix_blocked( - sources, - targets, - rhs, - result, - height, - width, - sub_height, - sub_width, - idx - ): - """ - Implicitly apply the Gram matrix to a vector, computing Green's function on - the fly on the device, and multiplying by random matrices to approximate - the basis. Multiple RHS are computed in a loop, specified by the index. - This kernel assumes that the sources/targets specified by the user - specify multiple Gram matrices, i.e. the global Gram matrix is 'blocked', - - e.g. gram = ((g_1) | (g_2) | ... | (g_{n-1}) | g_n) - - therefore, one additionally needs to specify the dimensions of the - 'sub' gram matrices in order to pick out the correct matrix elements - for application to a given RHS. 
- - Parameters: - ----------- - sources : np.array(shape=(nsources, 3), dtype=np.float32) - nsources rows of gram matrix - targets : np.array(shape=(ntargets, 3), dtype=np.float32) - ntargets columns of gram matrix - rhs : np.array(shape=(ntargets, nrhs), dtype=np.float32) - Multiple right hand sides, indexed by 'idx' - result : np.array(shape=(height, nrhs), dtype=np.float32) - Dimensions of result matrix defined by matrix height - and number of right hand sides - height : np.int32 - 'height' of global Gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - width : np.int32 - 'width' of Global gram matrix. - sub_height : np.int32 - height of sub-Gram matrix. Defined by m, where - m > n, and m is either ntargets or nsources. - sub_width : np.int32 - 'width' of sub-Gram matrix. - idx: np.int32 - RHS index. - """ - - blockWidth = cuda.shared.array(1, numba.int32) - blockxInd = cuda.shared.array(1, numba.int32) - blockyInd = cuda.shared.array(1, numba.int32) - - # Calculate once, and share amongs thread-block - if cuda.threadIdx.x == 0: - if (cuda.blockIdx.x + 1)*BLOCK_WIDTH <= width: - blockWidth[0] = numba.int32(BLOCK_WIDTH) - else: - blockWidth[0] = numba.int32(width % BLOCK_WIDTH) - - blockxInd[0] = numba.int32(cuda.blockIdx.x*BLOCK_WIDTH) - blockyInd[0] = numba.int32(cuda.blockIdx.y*BLOCK_HEIGHT) - - cuda.syncthreads() - - # Extract tile of RHS, of size up to BLOCK_WIDTH - tmp = cuda.shared.array(BLOCK_WIDTH, numba.float32) - - if cuda.threadIdx.x < blockWidth[0]: - tmp[cuda.threadIdx.x] = rhs[idx][blockxInd[0]+cuda.threadIdx.x] - - cuda.syncthreads() - - # Accumulate matvec of tile into variable - row_sum = 0 - - threadyInd = blockyInd[0]+cuda.threadIdx.x - threadxInd = blockxInd[0]+cuda.threadIdx.x - - # Find the index of the sub-Gram matrix, dependent on orientation. - if sub_width == width: - submatrixInd = numba.int32(math.floor(threadyInd/sub_height))*sub_width - - if sub_height == height: - submatrixInd = numba.int32(math.floor(threadxInd/sub_width))*sub_height - - if threadyInd < height: - for i in range(blockWidth[0]): - col_idx = (blockxInd[0] + i) - row_idx = threadyInd - - # Pick out matrix element for sub-Gram matrix - if sub_width == width: - source = sources[col_idx+submatrixInd] - target = targets[row_idx] - - elif sub_height == height: - source = sources[col_idx] - target = targets[row_idx+submatrixInd] - - sx = source[0] - sy = source[1] - sz = source[2] - - tx = target[0] - ty = target[1] - tz = target[2] - - row_sum += laplace_cuda(sx, sy, sz, tx, ty, tz)*tmp[i] - - cuda.atomic.add(result, (threadyInd, idx), row_sum) - - @numba.njit(cache=True, parallel=False, fastmath=True, error_model="numpy") def laplace_p2p_serial(sources, targets, source_densities): """ @@ -439,11 +200,8 @@ def laplace_gram_matrix_parallel(sources, targets): 'laplace': { 'eval': laplace_cpu, 'scale': laplace_scale, - 'cuda': laplace_cuda, 'dense_gram': laplace_gram_matrix_serial, 'dense_gram_parallel': laplace_gram_matrix_parallel, - 'implicit_gram': laplace_implicit_gram_matrix, - 'implicit_gram_blocked': laplace_implicit_gram_matrix_blocked, 'p2p': laplace_p2p_serial, 'p2p_parallel': laplace_p2p_parallel }, diff --git a/fmm/linalg.py b/fmm/linalg.py new file mode 100644 index 0000000..7a4b95b --- /dev/null +++ b/fmm/linalg.py @@ -0,0 +1,30 @@ +""" +Linear algebra utilities +""" +import numba +import numpy as np + + +@numba.njit(cache=True) +def pinv(a, tol=1e-4): + """ + Moore-Penrose Pseudo-Inverse calculation via SVD. 
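+    Uses np.linalg.svd, which Numba supports in nopython mode, so this
+    helper can itself be jitted and called from other jitted code.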
+
+    Parameters:
+    -----------
+    a : np.array(shape=(m, n), dtype=np.float32)
+    tol : np.float32
+        Singular values below 4*max(s)*tol are cut off; the default of 1e-4
+        works well in practice.
+    """
+    u, s, vt = np.linalg.svd(a, full_matrices=False)
+
+    max_s = max(s)
+
+    for i in range(len(s)):
+        s[i] = 1./s[i] if s[i] > 4*max_s*tol else 0.
+
+    v = vt.T
+    ut = u.T
+
+    return v @ np.diag(s) @ ut
\ No newline at end of file
diff --git a/fmm/parameters.py b/fmm/parameters.py
deleted file mode 100644
index 79769e2..0000000
--- a/fmm/parameters.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""
-Global parameters
-"""
-
-DIGEST_SIZE = 5
diff --git a/fmm/surface.py b/fmm/surface.py
index c8d4639..f4ec092 100644
--- a/fmm/surface.py
+++ b/fmm/surface.py
@@ -1,7 +1,6 @@
 """
 Operator helper methods.
 """
-
 import numba
 import numpy as np
 
@@ -86,7 +85,7 @@
     scaled_radius = (0.5)**level * radius
     dilated_radius = alpha*scaled_radius
 
-    scaled_surf = np.copy(surf)
+    scaled_surf = np.zeros_like(surf, np.float32)
 
     for i in range(n_coeffs):
         scaled_surf[i] = surf[i]*dilated_radius + center
diff --git a/fmm/test/test_fmm.py b/fmm/test/test_fmm.py
index 6cc3ed2..b51d705 100644
--- a/fmm/test/test_fmm.py
+++ b/fmm/test/test_fmm.py
@@ -13,9 +13,6 @@
 from fmm.kernel import KERNELS
 
 
-RTOL = 1e-1
-
-
 def test_upward_pass():
     """
-    Test that multipole expansion of root node is the same as a direct
+    Test that the multipole expansion of each leaf node is the same as a direct
@@ -25,36 +22,46 @@
     experiment = Fmm('test_config')
     experiment.upward_pass()
 
-    upward_equivalent_surface = surface.scale_surface(
-        surf=experiment.equivalent_surface,
-        radius=experiment.r0,
-        level=np.int32(0),
-        center=experiment.x0,
-        alpha=experiment.alpha_inner
-    )
+    for key in experiment.complete[experiment.complete_levels == experiment.depth]:
 
-    distant_point = np.array([[1e4, 0, 0]])
+        center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0)
+        radius = experiment.r0 / (1 << experiment.depth)
 
-    kernel = experiment.config['kernel']
-    p2p = KERNELS[kernel]['p2p']
+        upward_equivalent_surface = surface.scale_surface(
+            surf=experiment.equivalent_surface,
+            radius=experiment.r0,
+            level=experiment.depth,
+            center=center,
+            alpha=experiment.alpha_inner
+        )
 
-    direct = p2p(
-        sources=experiment.sources,
-        targets=distant_point,
-        source_densities=experiment.source_densities
-    )
+        distant_point = (center + radius*3).reshape(1, 3)
 
-    idx = experiment.key_to_index[0]
-    lidx = idx*experiment.nequivalent_points
-    ridx = (idx+1)*experiment.nequivalent_points
+        kernel = experiment.config['kernel']
+        p2p = KERNELS[kernel]['p2p']
 
-    equivalent = p2p(
-        sources=upward_equivalent_surface,
-        targets=distant_point,
-        source_densities=experiment.multipole_expansions[lidx:ridx]
-    )
+        leaf_idx = experiment.key_to_leaf_index[key]
+        global_idx = experiment.key_to_index[key]
 
-    assert np.allclose(direct, equivalent, rtol=RTOL)
+        lidx = global_idx*experiment.nequivalent_points
+        ridx = lidx+experiment.nequivalent_points
+
+        source_coordinates = experiment.sources[experiment.source_index_pointer[leaf_idx]:experiment.source_index_pointer[leaf_idx+1]]
+        source_densities = experiment.source_densities[experiment.source_index_pointer[leaf_idx]:experiment.source_index_pointer[leaf_idx+1]]
+
+        direct = p2p(
+            sources=source_coordinates,
+            targets=distant_point,
+            source_densities=source_densities
+        )
+
+        equivalent = p2p(
+            sources=upward_equivalent_surface,
+            targets=distant_point,
+            source_densities=experiment.multipole_expansions[lidx:ridx]
+        )
+
+        assert np.allclose(direct, equivalent,
atol=1e-2, rtol=0) def test_m2l(): @@ -70,62 +77,70 @@ def test_m2l(): p2p = experiment.p2p_function level_2_idxs = experiment.complete_levels == 2 - key = experiment.complete[level_2_idxs][0] - idx = experiment.key_to_index[key] - target_lidx = idx*experiment.nequivalent_points - target_ridx = target_lidx+experiment.nequivalent_points - v_list = experiment.v_lists[idx] - v_list = v_list[v_list != -1] - - target_coordinates = experiment.targets[experiment.target_index_pointer[idx]:experiment.target_index_pointer[idx+1]] - - level = np.int32(morton.find_level(key)) - center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0).astype(np.float32) - - local_expansion = experiment.local_expansions[target_lidx:target_ridx] - - downward_equivalent_surface = surface.scale_surface( - surf=experiment.equivalent_surface, - radius=experiment.r0, - level=level, - center=center, - alpha=experiment.alpha_outer - ) - equivalent = p2p( - sources=downward_equivalent_surface, - targets=target_coordinates, - source_densities=local_expansion - ) + for i, key in enumerate(experiment.complete[level_2_idxs]): + idx = experiment.key_to_index[key] + target_lidx = idx*experiment.nequivalent_points + target_ridx = target_lidx+experiment.nequivalent_points + v_list = experiment.v_lists[idx] + v_list = v_list[v_list != -1] - direct = np.zeros_like(equivalent) - for source in v_list: + level = np.int32(morton.find_level(key)) + center = morton.find_physical_center_from_key(key, experiment.x0, experiment.r0).astype(np.float32) - source_level = np.int32(morton.find_level(source)) - source_center = morton.find_physical_center_from_key(source, experiment.x0, experiment.r0).astype(np.float32) + local_expansion = experiment.local_expansions[target_lidx:target_ridx] - upward_equivalent_surface = surface.scale_surface( + downward_check_surface = surface.scale_surface( surf=experiment.equivalent_surface, radius=experiment.r0, - level=source_level, - center=source_center, + level=level, + center=center, alpha=experiment.alpha_inner ) - source_idx = experiment.key_to_index[source] - source_lidx = source_idx*experiment.nequivalent_points - source_ridx = source_lidx+experiment.nequivalent_points + downward_equivalent_surface = surface.scale_surface( + surf=experiment.equivalent_surface, + radius=experiment.r0, + level=level, + center=center, + alpha=experiment.alpha_outer + ) - tmp = p2p( - sources=upward_equivalent_surface, - targets=target_coordinates, - source_densities=experiment.multipole_expansions[source_lidx:source_ridx] + equivalent = p2p( + sources=downward_equivalent_surface, + targets=downward_check_surface, + source_densities=local_expansion ) - direct += tmp + direct = np.zeros_like(equivalent) + + for source in v_list: + + source_level = np.int32(morton.find_level(source)) + source_center = morton.find_physical_center_from_key(source, experiment.x0, experiment.r0).astype(np.float32) + + upward_equivalent_surface = surface.scale_surface( + surf=experiment.equivalent_surface, + radius=experiment.r0, + level=source_level, + center=source_center, + alpha=experiment.alpha_inner + ) + + source_idx = experiment.key_to_index[source] + source_lidx = source_idx*experiment.nequivalent_points + source_ridx = source_lidx+experiment.nequivalent_points + + tmp = p2p( + sources=upward_equivalent_surface, + targets=downward_check_surface, + source_densities=experiment.multipole_expansions[source_lidx:source_ridx] + ) - assert np.allclose(direct, equivalent, rtol=RTOL) + direct += tmp + + assert np.allclose(direct, 
equivalent, rtol=0.01, atol=0)
 
 
 def test_fmm():
@@ -139,28 +154,7 @@
     kernel = experiment.config['kernel']
     p2p = KERNELS[kernel]['p2p']
 
-    for leaf in experiment.leaves:
-
-
-        leaf_idx = experiment.key_to_leaf_index[leaf]
-
-        res = experiment.target_potentials[
-            experiment.target_index_pointer[leaf_idx]:experiment.target_index_pointer[leaf_idx+1]
-        ]
-
-        targets = experiment.targets[
-            experiment.target_index_pointer[leaf_idx]:experiment.target_index_pointer[leaf_idx+1]
-        ]
-
-        direct = p2p(
-            targets=targets,
-            sources=experiment.sources,
-            source_densities=experiment.source_densities
-        )
-
-        diff = res-direct
-
-        err = 100*abs(diff)/direct
-        mean_err = np.mean(err)
+    direct = p2p(experiment.sources, experiment.targets, experiment.source_densities)
+    equivalent = experiment.target_potentials
 
-    assert mean_err < 4
+    assert np.allclose(direct, equivalent, rtol=1e-2, atol=0)
diff --git a/scripts/precompute_operators.py b/scripts/precompute_operators.py
index 66cf646..ad47639 100644
--- a/scripts/precompute_operators.py
+++ b/scripts/precompute_operators.py
@@ -6,32 +6,29 @@
 import sys
 import time
 
-import cupy as cp
 import h5py
 import numba
 import numpy as np
-import scipy.linalg as linalg
+from sklearn.utils.extmath import randomized_svd
 
 import adaptoctree.morton as morton
 import adaptoctree.tree as tree
 
-from fmm.kernel import KERNELS, BLOCK_WIDTH, BLOCK_HEIGHT
+from fmm.kernel import KERNELS
+import fmm.linalg as linalg
 import fmm.surface as surface
 
 import utils.data as data
 import utils.time
 
-
 HERE = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
 PARENT = HERE.parent
 WORKING_DIR = pathlib.Path(os.getcwd())
 
-TPB = BLOCK_HEIGHT
-
 
 def compress_m2l_gram_matrix(
     level, x0, r0, depth, alpha_inner, check_surface,
-    equivalent_surface, k, implicit_gram_matrix
+    equivalent_surface, k
 ):
     """
     Compute compressed representation of unique Gram matrices for targets and
@@ -78,11 +75,9 @@
     n_targets_per_node = len(check_surface)
     n_sources_per_node = len(equivalent_surface)
 
-    n_targets = len(targets)
     n_sources = len(sources)
 
-    dsources = cp.zeros((n_sources*n_sources_per_node, 3)).astype(np.float32)
-    dtargets = cp.zeros((n_targets*n_targets_per_node, 3)).astype(np.float32)
+    se2tc = np.zeros((n_targets_per_node, n_sources*n_sources_per_node), np.float32)
 
     for idx in range(len(targets)):
 
@@ -101,13 +96,10 @@
             r0=r0
         )
 
-        lidx_targets = (idx)*n_targets_per_node
-        ridx_targets = (idx+1)*n_targets_per_node
-
         lidx_sources = (idx)*n_sources_per_node
-        ridx_sources = (idx+1)*n_sources_per_node
+        ridx_sources = lidx_sources+n_sources_per_node
 
-        target_coordinates = surface.scale_surface(
+        target_check_surface = surface.scale_surface(
             surf=check_surface,
             radius=np.float32(r0),
             level=np.int32(level),
             alpha=np.float32(alpha_inner)
         )
 
-        source_coordinates = surface.scale_surface(
+        source_equivalent_surface = surface.scale_surface(
             surf=equivalent_surface,
             radius=np.float32(r0),
             level=np.int32(level),
             alpha=np.float32(alpha_inner)
         )
 
-        # Compress the Gram matrix between these sources/targets
-        dsources[lidx_sources:ridx_sources, :] = cp.asarray(source_coordinates)
-        dtargets[lidx_targets:ridx_targets, :] = cp.asarray(target_coordinates)
-
-
-    # Height and width of Gram matrix
-    height = n_targets_per_node*n_targets
-    sub_height = n_targets_per_node
+        tmp = KERNELS[config['kernel']]['dense_gram'](
+            sources=source_equivalent_surface,
targets=target_check_surface + ) - width = n_sources_per_node - sub_width = n_sources_per_node + se2tc[:, lidx_sources:ridx_sources] = tmp - bpg = (int(np.ceil(width/BLOCK_WIDTH)), int(np.ceil(height/BLOCK_HEIGHT))) - - # Result data, in the language of RSVD - dY = cp.zeros((height, k)).astype(np.float32) - - # Random matrix, for compressed basis - dOmega = cp.random.rand(n_sources_per_node, k).astype(np.float32) - dOmegaT = dOmega.T - - # Perform implicit matrix matrix product between Gram matrix and random - # matrix Omega - for idx in range(k): - implicit_gram_matrix[bpg, TPB]( - dsources, dtargets, dOmegaT, dY, height, width, sub_height, sub_width, idx - ) + u, s, vt = randomized_svd(se2tc, k) - # Perform QR decomposition on the GPU - dQ, _ = cp.linalg.qr(dY) - dQT = dQ.T - - # Perform transposed matrix-matrix multiplication implicitly - height = n_sources_per_node - sub_height = n_sources_per_node - width = n_targets_per_node*n_targets - sub_width = n_targets_per_node - - dBT = cp.zeros((n_sources_per_node, k)).astype(np.float32) - - # Blocking is transposed - bpg = (int(np.ceil(width/BLOCK_WIDTH)), int(np.ceil(height/BLOCK_HEIGHT))) - - for idx in range(k): - implicit_gram_matrix[bpg, TPB]( - dtargets, dsources, dQT, dBT, height, width, sub_height, sub_width, idx - ) - - # Perform SVD on reduced matrix - du, dS, dVT = cp.linalg.svd(dBT.T, full_matrices=False) - dU = cp.matmul(dQ, du) - - # Return compressed SVD components - return dU.get(), dS.get(), dVT.get(), hashes + return u.astype(np.float32), s.astype(np.float32), vt.astype(np.float32), hashes def compute_surfaces(config, db): @@ -279,7 +226,6 @@ def compute_octree(config, db): start_level = 1 sources = db['particle_data']['sources'][...] - source_densities = db['particle_data']['source_densities'][...] targets = db['particle_data']['targets'][...] 
 points = np.vstack((sources, targets))
@@ -387,6 +333,7 @@
 
     print("Computing Inverse of Check To Equivalent Gram Matrix")
 
+    equivalent_surface = surface.compute_surface(config['order_equivalent'])
     upward_equivalent_surface = surface.scale_surface(
         surf=equivalent_surface,
         radius=np.float32(r0),
@@ -395,6 +342,7 @@
         alpha=np.float32(config['alpha_inner'])
     )
 
+    check_surface = surface.compute_surface(config['order_check'])
     upward_check_surface = surface.scale_surface(
         surf=check_surface,
         radius=np.float32(r0),
@@ -403,6 +351,7 @@
         alpha=np.float32(config['alpha_outer'])
     )
 
+    equivalent_surface = surface.compute_surface(config['order_equivalent'])
     downward_equivalent_surface = surface.scale_surface(
         surf=equivalent_surface,
         radius=np.float32(r0),
@@ -411,6 +360,7 @@
         alpha=np.float32(config['alpha_outer'])
     )
 
+    check_surface = surface.compute_surface(config['order_check'])
     downward_check_surface = surface.scale_surface(
         surf=check_surface,
         radius=np.float32(r0),
@@ -429,14 +379,17 @@
         sources=downward_equivalent_surface,
     )
 
-    uc2e_inv = linalg.pinv2(uc2e, cond=config['cond'])
-    dc2e_inv = linalg.pinv2(dc2e, cond=config['cond'])
+    uc2e_inv = linalg.pinv(uc2e, config['tol'])
+    dc2e_inv = linalg.pinv(dc2e, config['tol'])
 
     if 'uc2e_inv' in db.keys() and 'dc2e_inv' in db.keys():
-
+        del db['uc2e']
+        del db['dc2e']
         del db['uc2e_inv']
         del db['dc2e_inv']
 
+    db['uc2e'] = uc2e
+    db['dc2e'] = dc2e
     db['uc2e_inv']= uc2e_inv
     db['dc2e_inv']= dc2e_inv
 
@@ -552,7 +505,7 @@ def compute_m2m_and_l2l(
 
 
 def compute_m2l(
-    config, db, implicit_gram_matrix, k, equivalent_surface, check_surface,
+    config, db, k, equivalent_surface, check_surface,
     x0, r0, depth
 ):
     """
@@ -602,7 +555,7 @@
 
         u, s, vt, hashes = compress_m2l_gram_matrix(
             level, x0, r0, depth, config['alpha_inner'], check_surface,
-            equivalent_surface, k, implicit_gram_matrix
+            equivalent_surface, k
         )
 
         db['m2l'][str_level]['u'] = u
@@ -629,7 +582,6 @@ def main(**config):
 
     kernel = config['kernel']
     k = config['target_rank']
 
-    implicit_gram_matrix = KERNELS[kernel]['implicit_gram_blocked']
     dense_gram_matrix = KERNELS[kernel]['dense_gram']
     kernel_scale = KERNELS[kernel]['scale']
 
@@ -651,7 +604,7 @@
 
     # Step 4: Compute M2L operators for each level, and their transfer vectors
     compute_m2l(
-        config, db, implicit_gram_matrix, k, equivalent_surface, check_surface,
+        config, db, k, equivalent_surface, check_surface,
        x0, r0, depth
     )
 
diff --git a/scripts/test/test_precompute_operators.py b/scripts/test/test_precompute_operators.py
index aec8c40..67372cc 100644
--- a/scripts/test/test_precompute_operators.py
+++ b/scripts/test/test_precompute_operators.py
@@ -9,7 +9,6 @@
 import pytest
 
 import adaptoctree.morton as morton
-import adaptoctree.utils as utils
 
 from fmm.kernel import KERNELS
 import fmm.surface as surface
@@ -21,17 +20,20 @@
 CONFIG_FILEPATH = HERE.parent.parent / "test_config.json"
 CONFIG = data.load_json(CONFIG_FILEPATH)
 
-RTOL = 1e-1
-
 
 @pytest.fixture
 def db():
     experiment = CONFIG["experiment"]
     return h5py.File(ROOT / f"{experiment}.hdf5", "r")
 
-
 def test_m2m(db):
-
+    """
+    Test the convergence of the multipole expansion of the root node. If this
+    converges as expected, then the M2M translation operators, as well as the
+    P2M step, work.
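+    The parent's translated expansion and the child's expansion are each
+    evaluated directly at a point outside both boxes and compared.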
+ """ parent_key = 0 child_key = morton.find_children(parent_key)[0] @@ -58,7 +58,7 @@ def test_m2m(db): db["m2m"][operator_idx], child_equivalent_density ) - distant_point = np.array([[1e2, 0, 0]]) + distant_point = parent_center+r0*CONFIG['alpha_outer']*1.1 child_equivalent_surface = surface.scale_surface( surf=equivalent_surface, @@ -88,11 +88,14 @@ def test_m2m(db): source_densities=child_equivalent_density, ) - assert np.isclose(parent_direct, child_direct, rtol=RTOL) + assert np.isclose(parent_direct, child_direct, atol=1e-3, rtol=0) def test_l2l(db): - + """ + Test the convergence of the local expansion after L2L has been performed + on a given parent/child pair. + """ parent_key = 1 child_key = morton.find_children(parent_key)[-1] @@ -144,15 +147,20 @@ def test_l2l(db): source_densities=child_equivalent_density, ) - assert np.isclose(parent_direct, child_direct, rtol=RTOL) + assert np.isclose(parent_direct, child_direct, atol=1e-4, rtol=0) def test_m2l(db): - + """ + Test convergence of local expansion after M2L translation has been applied, + as well as SVD compression by computing result from all multipole + expansions of source nodes for a given target node in its V list, and + comparing with local expansion result computed with compressed M2L + translation operator. + """ x0 = db["octree"]["x0"][...] r0 = db["octree"]["r0"][...][0] dc2e_inv = db['dc2e_inv'][...] - depth = db['octree']['depth'][0] equivalent_surface = db["surface"]["equivalent"][...] npoints_equivalent = len(equivalent_surface) check_surface = db['surface']['check'][...] @@ -195,12 +203,11 @@ def test_m2l(db): ) lidx = idx*npoints_equivalent - ridx = (idx+1)*npoints_equivalent + ridx = lidx+npoints_equivalent sources[lidx:ridx] = source_equivalent_surface - # place unit densities on source boxes - source_equivalent_density = np.ones(len(v_list)*npoints_equivalent) + # place densities on source boxes source_equivalent_density = np.random.rand(len(v_list)*npoints_equivalent) u = db['m2l'][str(target_level)]['u'] @@ -216,13 +223,12 @@ def test_m2l(db): transfer_vec = morton.find_transfer_vector(target_key, source_key) m2l_idx = np.where(transfer_vec == hashes)[0][0] m2l_lidx = (m2l_idx)*npoints_check - m2l_ridx = (m2l_idx+1)*npoints_check - u_sub = u[m2l_lidx:m2l_ridx] - m2l_matrix = (scale*dc2e_inv) @ (u_sub @ np.diag(s) @ vt) + m2l_ridx = m2l_lidx+npoints_check + vt_sub = vt[:, m2l_lidx:m2l_ridx] lidx = i*npoints_equivalent - ridx = (i+1)*npoints_equivalent - target_equivalent_density_sub = m2l_matrix @ source_equivalent_density[lidx:ridx] + ridx = lidx+npoints_equivalent + target_equivalent_density_sub = (scale*dc2e_inv) @ (u @ np.diag(s) @ vt_sub) @ source_equivalent_density[lidx:ridx] target_equivalent_density += target_equivalent_density_sub targets = surface.scale_surface( @@ -247,4 +253,4 @@ def test_m2l(db): source_densities=source_equivalent_density ) - assert np.isclose(target_direct, source_direct, rtol=RTOL) + assert np.isclose(target_direct, source_direct, atol=1e-1, rtol=0) diff --git a/setup.py b/setup.py index 9758aff..f3a3d03 100644 --- a/setup.py +++ b/setup.py @@ -11,10 +11,10 @@ exec(f.read(), ABOUT) requirements = [ - "adaptoctree==1.2.0", + "adaptoctree==1.2.1", "scipy==1.6.1", "h5py==2.10.0", - "cupy==8.3.0", + "scikit-learn==0.24.2", "click==7.1.2", ] diff --git a/test.hdf5 b/test.hdf5 index 46ee238..dd0d479 100644 Binary files a/test.hdf5 and b/test.hdf5 differ diff --git a/test_config.json b/test_config.json index 8bc49cb..4c16403 100644 --- a/test_config.json +++ b/test_config.json @@ 
 "experiment": "test",
     "npoints": 10000,
     "data_type": "random",
-    "order_equivalent": 2,
-    "order_check": 3,
+    "order_equivalent": 5,
+    "order_check": 5,
     "kernel": "laplace",
     "backend": "numba",
     "alpha_inner": 1.05,
-    "alpha_outer": 2.95,
+    "alpha_outer": 2.9,
     "max_level": 10,
-    "max_points": 100,
+    "max_points": 150,
     "target_rank": 10,
-    "cond": 1e-1
+    "tol": 1e-4
 }
diff --git a/utils/time.py b/utils/time.py
index d9f919d..a67228f 100644
--- a/utils/time.py
+++ b/utils/time.py
@@ -24,11 +24,9 @@ def __exit__(self, *args):
 
 def seconds_to_minutes(seconds):
     """
     Convert seconds to minutes and seconds.
-
     Parameters:
     -----------
     seconds : float
-
     Returns:
     --------
     (float, float)
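For readers following the M2L changes above, the randomised-SVD compression that replaces the CUDA path condenses to a few lines of NumPy. The following is a minimal illustrative sketch, not the library's API: the sizes, and the random stand-ins for the Gram matrix block `se2tc`, `dc2e_inv`, and the multipole expansion are assumptions for demonstration only; the factorisation and the order of operations mirror `compress_m2l_gram_matrix` and `m2l_core`.

```python
import numpy as np
from sklearn.utils.extmath import randomized_svd

# Illustrative sizes; in the shipped configs order_equivalent == order_check,
# so the check and equivalent surfaces have the same number of points.
ncheck, nequiv, ntransfer, k = 26, 26, 316, 10

# Stand-in for the row-block of source-equivalent-to-target-check Gram
# matrices assembled per level in compress_m2l_gram_matrix.
se2tc = np.random.rand(ncheck, ntransfer * nequiv).astype(np.float32)

# A single randomised SVD compresses the M2L operators for every transfer
# vector at this level to rank k; u, s and vt are what get stored in db['m2l'].
u, s, vt = randomized_svd(se2tc, n_components=k)

# Stand-ins for the inverse check-to-equivalent operator, one multipole
# expansion, and the Laplace kernel scale at level 2.
dc2e_inv = np.random.rand(nequiv, ncheck).astype(np.float32)
multipole = np.random.rand(nequiv).astype(np.float32)
scale = np.float32(1 / 2**2)

# Applying the compressed operator for one transfer vector, as in m2l_core:
# only the matching column block of vt is needed per source node.
v_idx = 42
vt_sub = vt[:, v_idx * nequiv:(v_idx + 1) * nequiv]
local = (scale * dc2e_inv) @ (u @ (np.diag(s) @ (vt_sub @ multipole)))
```

Storing only `u`, `s`, and the column blocks of `vt` is what lets one factorisation per level serve every transfer vector at that level, rather than one GPU-built operator per interaction.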