diff --git a/external/dace b/external/dace index 13402cbf..82541a94 160000 --- a/external/dace +++ b/external/dace @@ -1 +1 @@ -Subproject commit 13402cbfeeb6969cbd3915acfb7a30bdb543071b +Subproject commit 82541a9401dcadca43edc33cf1db61a0fe21d0e5 diff --git a/external/gt4py b/external/gt4py index 45324c88..68eea74b 160000 --- a/external/gt4py +++ b/external/gt4py @@ -1 +1 @@ -Subproject commit 45324c88e57b5e8dfc974efa70fa2f2e5e10677f +Subproject commit 68eea74b748747ac5415c93e479d7964f3ec6947 diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py index 78dcb7a5..f6e6bb26 100644 --- a/ndsl/dsl/dace/dace_config.py +++ b/ndsl/dsl/dace/dace_config.py @@ -7,6 +7,7 @@ import dace.config from dace.codegen.compiled_sdfg import CompiledSDFG from dace.frontend.python.parser import DaceProgram +from gt4py.cartesian.config import GT4PY_COMPILE_OPT_LEVEL from ndsl.comm.communicator import Communicator from ndsl.comm.partitioner import Partitioner @@ -181,6 +182,12 @@ def __init__( # We control this Dace configuration below with our own override dace_debug_env_var = os.getenv("PACE_DACE_DEBUG", "False") == "True" + # We hijack the optimization level of GT4Py because we don't + # have the configuration at NDSL level, but we do use the GT4Py + # level + # TODO: if GT4PY opt level is funneled via NDSL - use it here + optimization_level = GT4PY_COMPILE_OPT_LEVEL + # Set the configuration of DaCe to a rigid & tested set of divergence # from the defaults when orchestrating if orchestration != DaCeOrchestration.Python: @@ -195,7 +202,7 @@ def __init__( "compiler", "cpu", "args", - value="-std=c++14 -fPIC -Wall -Wextra -O3", + value=f"-std=c++14 -fPIC -Wall -Wextra -O{optimization_level}", ) # Potentially buggy - deactivate dace.config.Config.set( diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py index 0b5283e6..38e7be09 100644 --- a/ndsl/dsl/dace/orchestration.py +++ b/ndsl/dsl/dace/orchestration.py @@ -3,9 +3,12 @@ import os from typing 
import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union -import dace -import gt4py.storage +from dace import SDFG from dace import compiletime as DaceCompiletime +from dace import dtypes +from dace import method as dace_method +from dace import nodes +from dace import program as dace_program from dace.dtypes import DeviceType as DaceDeviceType from dace.dtypes import StorageType as DaceStorageType from dace.frontend.python.common import SDFGConvertible @@ -13,6 +16,7 @@ from dace.transformation.auto.auto_optimize import make_transients_persistent from dace.transformation.helpers import get_parent_map from dace.transformation.passes.simplify import SimplifyPass +from gt4py import storage from ndsl.comm.mpi import MPI from ndsl.dsl.dace.build import get_sdfg_path, write_build_info @@ -27,7 +31,6 @@ negative_qtracers_checker, sdfg_nan_checker, ) -from ndsl.dsl.dace.sdfg_opt_passes import splittable_region_expansion from ndsl.dsl.dace.utils import ( DaCeProgress, memory_static_analysis, @@ -61,10 +64,10 @@ def _download_results_from_dace( return None backend = config.get_backend() - return [gt4py.storage.from_array(result, backend=backend) for result in dace_result] + return [storage.from_array(result, backend=backend) for result in dace_result] -def _to_gpu(sdfg: dace.SDFG): +def _to_gpu(sdfg: SDFG): """Flag memory in SDFG to GPU. 
Force deactivate OpenMP sections for sanity.""" @@ -72,7 +75,7 @@ def _to_gpu(sdfg: dace.SDFG): allmaps = [ (me, state) for me, state in sdfg.all_nodes_recursive() - if isinstance(me, dace.nodes.MapEntry) + if isinstance(me, nodes.MapEntry) ] topmaps = [ (me, state) for me, state in allmaps if get_parent_map(state, me) is None @@ -81,13 +84,13 @@ def _to_gpu(sdfg: dace.SDFG): # Set storage of arrays to GPU, scalarizable arrays will be set on registers for sd, _aname, arr in sdfg.arrays_recursive(): if arr.shape == (1,): - arr.storage = dace.StorageType.Register + arr.storage = dtypes.StorageType.Register else: - arr.storage = dace.StorageType.GPU_Global + arr.storage = dtypes.StorageType.GPU_Global # All maps will be schedule on GPU for mapentry, _state in topmaps: - mapentry.schedule = dace.ScheduleType.GPU_Device + mapentry.schedule = dtypes.ScheduleType.GPU_Device # Deactivate OpenMP sections for sd in sdfg.all_sdfgs_recursive(): @@ -95,7 +98,7 @@ def _to_gpu(sdfg: dace.SDFG): def _simplify( - sdfg: dace.SDFG, + sdfg: SDFG, *, validate: bool = True, validate_all: bool = False, @@ -108,24 +111,33 @@ def _simplify( validate=validate, validate_all=validate_all, verbose=verbose, + skip=["ScalarToSymbolPromotion"], ).apply_pass(sdfg, {}) def _build_sdfg( - dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs + dace_program: DaceProgram, sdfg: SDFG, config: DaceConfig, args, kwargs ): """Build the .so out of the SDFG on the top tile ranks only""" is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile if is_compiling: + with DaCeProgress(config, "Validate original SDFG"): + sdfg.validate() + # Make the transients array persistents if config.is_gpu_backend(): + # TODO + # The following should happen on the stree level _to_gpu(sdfg) + make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.GPU) # Upload args to device _upload_to_device(list(args) + list(kwargs.values())) else: + # TODO + # The following should happen 
on the stree level for _sd, _aname, arr in sdfg.arrays_recursive(): if arr.shape == (1,): arr.storage = DaceStorageType.Register @@ -141,18 +153,7 @@ def _build_sdfg( if k in sdfg_kwargs and tup[1].transient: del sdfg_kwargs[k] - with DaCeProgress(config, "Simplify (1/2)"): - _simplify(sdfg, validate=False, verbose=True) - - # Perform pre-expansion fine tuning - with DaCeProgress(config, "Split regions"): - splittable_region_expansion(sdfg, verbose=True) - - # Expand the stencil computation Library Nodes with the right expansion - with DaCeProgress(config, "Expand"): - sdfg.expand_library_nodes() - - with DaCeProgress(config, "Simplify (2/2)"): + with DaCeProgress(config, "Simplify"): _simplify(sdfg, validate=False, verbose=True) # Move all memory that can be into a pool to lower memory pressure. @@ -160,10 +161,10 @@ def _build_sdfg( with DaCeProgress(config, "Turn Persistents into pooled Scope"): memory_pooled = 0.0 for _sd, _aname, arr in sdfg.arrays_recursive(): - if arr.lifetime == dace.AllocationLifetime.Persistent: + if arr.lifetime == dtypes.AllocationLifetime.Persistent: arr.pool = True memory_pooled += arr.total_size * arr.dtype.bytes - arr.lifetime = dace.AllocationLifetime.Scope + arr.lifetime = dtypes.AllocationLifetime.Scope memory_pooled = float(memory_pooled) / (1024 * 1024) ndsl_log.debug( f"{DaCeProgress.default_prefix(config)} Pooled {memory_pooled} mb", @@ -180,7 +181,9 @@ def _build_sdfg( # Compile with DaCeProgress(config, "Codegen & compile"): sdfg.compile() - write_build_info(sdfg, config.layout, config.tile_resolution, config._backend) + write_build_info( + sdfg, config.layout, config.tile_resolution, config.get_backend() + ) # Printing analysis of the compiled SDFG with DaCeProgress(config, "Build finished. 
Running memory static analysis"): @@ -223,9 +226,7 @@ def _build_sdfg( return _call_sdfg(dace_program, sdfg, config, args, kwargs) -def _call_sdfg( - dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs -): +def _call_sdfg(dace_program: DaceProgram, sdfg: SDFG, config: DaceConfig, args, kwargs): """Dispatch the SDFG execution and/or build""" # Pre-compiled SDFG code path does away with any data checks and # cached the marshalling - leading to almost direct C call @@ -259,7 +260,7 @@ def _parse_sdfg( config: DaceConfig, *args, **kwargs, -) -> Optional[dace.SDFG]: +) -> Optional[SDFG]: """Return an SDFG depending on cache existence. Either parses, load a .sdfg or load .so (as a compiled sdfg) @@ -318,7 +319,7 @@ class _LazyComputepathFunction(SDFGConvertible): def __init__(self, func: Callable, config: DaceConfig): self.func = func self.config = config - self.daceprog: DaceProgram = dace.program(self.func) + self.daceprog: DaceProgram = dace_program(self.func) self._sdfg = None def __call__(self, *args, **kwargs): @@ -373,7 +374,7 @@ class _LazyComputepathMethod: class SDFGEnabledCallable(SDFGConvertible): def __init__(self, lazy_method: _LazyComputepathMethod, obj_to_bind): - methodwrapper = dace.method(lazy_method.func) + methodwrapper = dace_method(lazy_method.func) self.obj_to_bind = obj_to_bind self.lazy_method = lazy_method self.daceprog: DaceProgram = methodwrapper.__get__(obj_to_bind) diff --git a/ndsl/dsl/dace/sdfg_opt_passes.py b/ndsl/dsl/dace/sdfg_opt_passes.py deleted file mode 100644 index b7582cc5..00000000 --- a/ndsl/dsl/dace/sdfg_opt_passes.py +++ /dev/null @@ -1,24 +0,0 @@ -import dace - -from ndsl.logging import ndsl_log - - -def splittable_region_expansion(sdfg: dace.SDFG, verbose: bool = False): - """ - Set certain StencilComputation library nodes to expand to a different - schedule if they contain small splittable regions. 
- """ - from gt4py.cartesian.gtc.dace.nodes import StencilComputation - - for node, _ in sdfg.all_nodes_recursive(): - if isinstance(node, StencilComputation): - if node.has_splittable_regions() and "corner" in node.label: - node.expansion_specification = [ - "Sections", - "Stages", - "J", - "I", - "K", - ] - if verbose: - ndsl_log.debug(f"Reordered schedule for {node.label}")