diff --git a/.gitignore b/.gitignore index 74cd9b72..c8918d08 100644 --- a/.gitignore +++ b/.gitignore @@ -150,6 +150,7 @@ dmypy.json # GT4Py **/.gt_cache*/ +**/.gt4py_cache/* # Run outputs plot_output/ diff --git a/ORIG.README.md b/ORIG.README.md new file mode 100644 index 00000000..a6192aac --- /dev/null +++ b/ORIG.README.md @@ -0,0 +1,459 @@ +> DISCLAIMER: Work in progress + +# FV3core + +FV3core is a Python version, using GridTools GT4Py with CPU and GPU backend options, of the FV3 dynamical core (fv3gfs-fortran repo). +The code here includes regression test data of computation units coming from serialized output from the Fortran model generated using the `GridTools/serialbox` framework. + +As of January 10, 2021 this documentation is outdated in that it was written when we had fv3core as its own single repository. Some functionality, such as linting, has been moved to the top level but may still be described in this document as occurring inside the fv3core folder. + +**WARNING** This repo is under active development and relies on code and data that is not publicly available at this point. + +## QuickStart + +1. Ensure you have docker installed and available for building and running, and that you have access to the VCM cloud. + +Be sure to complete any required post-installation instructions (e.g. [for linux](https://docs.docker.com/engine/install/linux-postinstall/)). Also [authorize Docker to pull from gcr](https://cloud.google.com/container-registry/docs/advanced-authentication). Your user will need to have read access to the `us.gcr.io/vcm-ml` repository. + +2. You can build the image, download the data, and run the tests using: + +```shell +$ make tests savepoint_tests savepoint_tests_mpi +``` + +If you want to develop code, you should also install the linting requirements and git hooks locally: + +```shell +$ pip install -c constraints.txt -r requirements/requirements_lint.txt +$ pre-commit install +``` + +## Getting started, in more detail +If you want to build the main fv3core docker image, run + +```shell +$ make build +``` + +If you want to download test data, run + +```shell +$ make get_test_data +``` + +And the c12_6ranks_standard data will download into the `test_data` directory. + +If you do not have a GCP account, there is an option to download basic test data from a public FTP server, and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead; this will avoid fetching from a GCP storage bucket. You will need a valid installation of the `lftp` command. + +MPI parallel tests (which run that way to exercise halo updates in the model) can also be run with: + +```shell +$ make savepoint_tests_mpi +``` + +The environment image that the fv3core container uses is prebuilt and lives in the GCR. The above commands will by default pull this image before building the fv3core image and running the tests. +To build the environment from scratch (including GT4py) before running tests, either run + +``` +make build_environment +``` + +or + +```shell +$ PULL=False make savepoint_tests +``` + +which will execute the target `build_environment` for you before running the tests. + +There are `push_environment` and `rebuild_environment` targets, but these should normally not be run manually. Updating the install image should only be done by Jenkins after the tests pass using a new environment.
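+As a hedged recap of the steps above (not an additional requirement): assuming `gcloud` is installed and your user has read access to `us.gcr.io/vcm-ml`, a typical first-time flow using the make targets already documented might look like:
+
+```shell
+# let docker authenticate against gcr with your gcloud credentials
+$ gcloud auth configure-docker
+# build the fv3core image (pulls the prebuilt environment image by default)
+$ make build
+# download the default c12_6ranks_standard dataset into test_data/
+$ make get_test_data
+# run the sequential and MPI-parallel savepoint tests
+$ make savepoint_tests savepoint_tests_mpi
+```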
+ +### Test data options + +If you want to run different test data, discover the possible options with +```shell +$ make list_test_data_options +``` +This will list the storage buckets in the cloud. Then to run one of them, set EXPERIMENT to the folder name of the data you'd like to use: + +e.g. +```shell +$ EXPERIMENT=c48_6ranks_standard make tests +``` + +If you choose an experiment with a different number of ranks than 6, also set `NUM_RANKS=` + +## Testing interactively outside the container + +After `make savepoint_tests` has been run at least once (or you have data in test_data and the docker image fv3core exists because `make build` has been run), you can iterate on code changes using + +```shell +$ DEV=y make savepoint_tests +``` +or for the parallel or non-savepoint tests: + +```shell +$ DEV=y make tests savepoint_tests_mpi +``` +These will mount your current code into the fv3core container and run it rather than the code that was built when `make build` ran. + +## Running tests inside a container + +If you prefer to work interactively inside the fv3core container, get the test data and build the docker image (see above if you do not have a GCP account and want to get test data): +```shell +$ make get_test_data +``` + +```shell +$ make build +``` +Testing can be run with this data from `/port_dev` inside the container: + +```shell +$ make dev +``` + +Then in the container: + +```shell +$ pytest -v -s --data_path=/test_data/ /port_dev/tests --which_modules= +``` +The 'stencil name' can be determined from the associated Translate class, e.g. TranslateXPPM is a test class that translates data serialized from a run of the Fortran model, and 'XPPM' is the name you can use with --which_modules. + +### Test options + +All of the make targets that involve running tests can be prefixed with the `TEST_ARGS` environment variable to set test options or pytest CLI args (see below) when running inside the container. + +* `--which_modules ` - comma separated list of which modules to test (defaults to running all of them). + +* `--print_failures` - if your test fails, it will only report the first datapoint. If you want all the non-matching regression data to print out (so you can see if there are patterns, e.g. just incorrect for the first 'i' or whatever), this will print out all the non-matching data for every failing test. + +* `--failure_stride` - when printing failures, print only every nth failure. + +* `--data_path` - path to where you have the `Generator*.dat` and `*.json` serialization regression data. Defaults to the current directory. + +* `--backend` - which backend to use for the computation. Options: `[numpy, gt:cpu_ifirst, gt:cpu_first, gt:gpu, cuda]`. Defaults to `numpy`. +* `--python_regression` - Run the tests that have Python-based regression data. Only applies to running parallel tests (savepoint_tests_mpi). +Pytest provides a lot of options, which you can see by `pytest --help`. Here are some +common options for our tests, which you can add to `TEST_ARGS`: + +* `-r` - is used to report test types other than failure. It can be given `s` for skipped (e.g. tests which were not run because earlier tests of the same stencil failed), `x` for xfail or "expected to fail" tests (like tests with no translate class), or `p` for pass. For example, to report skipped and xfail tests you would use `-rsx`. + +* `--disable-warnings` - will stop all warnings from being printed at the end of the tests, for example warnings that translate classes are not yet implemented.
+ +* `-v` - will increase test verbosity, while `-q` will decrease it. + +* `-s` - will let stdout print directly to console instead of capturing the output and printing it only when a test fails. Note that logger lines will always be printed both during (by setting log_cli in our pytest.ini file) and after tests. + +* `-m` - will let you run only certain groups of tests. For example, `-m=parallel` will run only parallel stencils, while `-m=sequential` will run only stencils that operate on one rank at a time. + +* `--threshold_overrides_file` - will read a yaml file with error thresholds specified for specific backend and platform (docker or metal) configurations, overriding the max_error thresholds defined in the Translate classes. The format of the yaml file is described [here](tests/savepoint/translate/overrides/README.md). + +* `--dperiodic` - run tests on a doubly-periodic domain. Will look for only one tile's worth of test data, and parallel tests will be run with a TileCommunicator instead of a CubedSphereCommunicator. + +**NOTE:** FV3 is currently assumed to be in a "development mode" by default, where stencils are checked each time they execute for code changes (which can trigger regeneration). This process is somewhat expensive, so there is an option to put FV3 in a performance mode by telling it that stencils should not automatically be rebuilt: + +```shell +$ export FV3_STENCIL_REBUILD_FLAG=False +``` + +## Porting a new stencil + +1. Find the location in the fv3gfs-fortran repo code where the save-point is to be added, e.g. using + +```shell +$ git grep +``` + +2. Create a `translate` class from the serialized save-point data to a call to the stencil or function that calls the relevant stencil(s). + +These are usually named `tests/savepoint/translate/translate_` + +Import this class in the `tests/savepoint/translate/__init__.py` file. + +3. Write a Python function wrapper that the translate function (created above) calls. + +By convention, we name these `fv3core/stencils/.py` + +4. Run the test, either with one name or a comma-separated list: + +```shell +$ make dev_tests TEST_ARGS="--which_modules=" +``` + +**Please also review the [Porting conventions](#porting-conventions) section for additional explanation.** +## Installation + +### Docker Image + +To build the `us.gcr.io/vcm-ml/fv3core` image with required dependencies for running the Python code, run + +```shell +$ make build +``` + +Add `PULL=False` to build from scratch without running `docker pull`: + +```shell +PULL=False make build +``` + +## Relevant repositories + +- https://github.com/GridTools/serialbox - + Serialbox generates serialized data when the Fortran model runs and has bindings to manage data from Python + +- https://github.com/VulcanClimateModeling/fv3gfs-fortran - + This is the existing Fortran model decorated with serialization statements from which the test data is generated + +- https://github.com/GridTools/gt4py - + Python package for the DSL language + +- https://github.com/VulcanClimateModeling/util - + Python-specific model functionality, such as halo updates. + +- https://github.com/VulcanClimateModeling/fv3gfs-wrapper - + A Python-based wrapper for running the Fortran version of the FV3GFS model. + +Some of these are submodules. +While tests can work without these, it may be necessary for development to have these as well.
+To add these to the local repository, run + +```shell +$ git submodule update --init +``` + +The submodules include: + +- `external/util` - git@github.com:VulcanClimateModeling/util.git +- `external/daint_venv` - git@github.com:VulcanClimateModeling/daint_venv.git + +## Dockerfiles and building + +There are two main Dockerfiles: + +1. `docker/dependencies.Dockerfile` - defines dependency images such as for mpi, serialbox, and GT4py + +2. `docker/Dockerfile` - uses the dependencies to define the final fv3core images. + +The dependencies are separated out into their own images to expedite rebuilding the docker image without having to rebuild dependencies, especially on CI. + +For the commands below using `make -C docker`, you can alternatively run `make` from within the `docker` directory. + +These dependencies can be updated, pushed, and pulled with `make -C docker build_deps`, `make -C docker push_deps`, and `make -C docker pull_deps`. The tag of the dependencies is based on the tag of the current build in the Makefile, which we will expand on below. + +Building from scratch requires both a dependencies command and a build command, such as `make -C docker pull_deps fv3core_image`. + +If any example using "pulled dependencies" fails, it means the dependencies have never been built. You can +build them and push them to GCR with: + +```shell +$ make -C docker build_deps push_deps +``` + +### Building examples + +fv3core image with pulled dependencies: + +```shell +$ make -C docker pull_deps fv3core_image +``` + +CUDA-enabled fv3core image with pulled dependencies: +```shell +$ CUDA=y make -C docker pull_deps fv3core_image +``` + +fv3core image with locally-built dependencies: +```shell +$ make -C docker build_deps fv3core_image +``` + +### Updating Serialbox + +If you need to install an updated version of Serialbox, you must first install cmake into the development environment. To install an updated version of Serialbox from within the container, run + +```shell +$ wget https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3.tar.gz && \ tar xzf cmake-3.17.3.tar.gz && \ cd cmake-3.17.3 && \ ./bootstrap && make -j4 && make install +$ git clone -b v2.6.1 --depth 1 https://github.com/GridTools/serialbox.git /tmp/serialbox +$ cd /tmp/serialbox +$ cmake -B build -S /tmp/serialbox -DSERIALBOX_USE_NETCDF=ON -DSERIALBOX_TESTING=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/serialbox +$ cmake --build build/ -j $(nproc) --target install +$ cd - +$ rm -rf build /tmp/serialbox +``` + +## Pinned dependencies + +Dependencies are pinned using `constraints.txt`. This is auto-generated by pip-compile from the `pip-tools` package, which reads `requirements.txt` and `requirements/requirements_lint.txt`, determines the latest versions of all dependencies (including recursive dependencies) compatible with those files, and writes pinned versions for all dependencies. This can be updated using: + +```shell +$ make constraints.txt +``` + +This file is committed to the repository, and gives more reproducible tests if an old commit of the repository is checked out in the future. The constraints are followed when creating the `fv3core` docker images. To ensure consistency, this should ideally be run from inside a docker development environment, but you can also run it on your local system with an appropriate Python 3 environment. + +## Development + +To develop fv3core, you need to install the linting requirements in `requirements/requirements_lint.txt`.
To install the pinned versions, use: + +```shell +$ pip install -c constraints.txt -r requirements/requirements_lint.txt +``` + +This adds `pre-commit`, which we use to lint and enforce style on the code. The first time you install `pre-commit`, install its git hooks using: + +```shell +$ pre-commit install +pre-commit installed at .git/hooks/pre-commit +``` + +As a convenience, the `lint` target of the top-level makefile executes `pre-commit run --all-files`. +Linting, which formats files and checks for some style conventions, is required, as the same checks are the first step in the continuous integration testing that happens when creating a pull request. +Linting locally saves time and literal energy, since CI tests do not have to be launched so many times! + +Please see the 'Development Guidelines' below for more information on the structure of the code to align your new code with the current conventions, as well as the CONTRIBUTING.md document for style guidelines. + +## GT4Py version + +FV3Core does not actually use the [GridTools/gt4py](https://github.com/gridtools/gt4py) main branch; it instead uses a Vulcan Climate Modeling development branch. +This version is publicly available at [VCM/gt4py](https://github.com/vulcanclimatemodeling/gt4py). + +Situation: There is a new stable feature in a gt4py PR, but it is not yet merged into the GridTools/gt4py main branch. +[branches.cfg](https://github.com/VulcanClimateModeling/gt4py/blob/develop/branches.cfg) lists these features. +Steps: + +1. Add any new branches to `branches.cfg` +2. Rebuild the develop branch, either: + a. `make_develop gt4py-dev path/to/branches.cfg` (you may have to resolve conflicts...) + b. Adding new commits on top of the existing develop branch (e.g. merge or cherry-pick) +3. Force push to the develop branch: `git push -f upstream develop` + +The last step will launch Jenkins tests. If these pass: + +1. Create a git tag: `git tag v-$(git rev-parse --short HEAD)` +2. Push the tag: `git push upstream --tags` +3. Make a PR to [VCM/fv3core](https://github.com/vulcanclimatemodeling/fv3core) that updates the version in `docker/Makefile` to the new tag. + +## License +FV3Core is provided under the terms of the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) license. + +# Development guidelines + +## File structure / conventions +The main functionality of the FV3 dynamical core, which has been ported from the Fortran version in the fv3gfs-fortran repo, is defined using GT4py stencils and Python 'compute' functions in fv3core/stencils. The core is composed of units of calculation defined for regression testing. These were initially generally separated into distinct files in fv3core/stencils, with corresponding files in tests/savepoint/translate/translate_.py defining the translation of variables from Fortran to Python. Exceptions exist in cases where topical and logical grouping allowed for code reuse. As refactors optimize the model, these units may be merged to occupy the same files and even methods/stencils, but the units should still be tested separately, unless determined to be redundant. + +The core has most of its calculations happening in GT4py stencils, but there are still several instances of operations happening in Python directly, which will need to be replaced with GT4py code for optimal performance. + +The namelist and grid are global variables defined in fv3core/_config.py. The namelist is 'flattened' so that the grouping name of the option is not required to access the data (we may want to change this).
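+The following is a minimal, hedged sketch of what 'flattened' means here; it uses a plain mock rather than the actual fv3core objects, and `npx`, `npz` and `k_split` are just examples of `fv_core_nml` option names with illustrative values:
+
+```python
+from types import SimpleNamespace
+
+# Grouped (Fortran-namelist-style) layout vs. the flattened layout used by fv3core/_config.py.
+grouped = {"fv_core_nml": {"npx": 13, "npz": 79, "k_split": 1}}
+flattened = SimpleNamespace(npx=13, npz=79, k_split=1)
+
+# With the flattened namelist the group name is not needed to reach an option:
+assert flattened.k_split == grouped["fv_core_nml"]["k_split"]
+```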
+ +The grid variables are mostly 2d variables and are 'global' to the model thread per MPI rank. The grid object also contains domain and layout information relevant to the current rank being operated on. + +Utility functions in `fv3core/utils/` include: + - `gt4py_utils.py`: + - default gt4py and model settings + - methods for generating gt4py storages + - methods for using numpy and cupy arrays in python functions that have not been put into GT4py + - methods for handling complex patterns that did not immediately map to gt4py, and will mostly be removed with future refactors (e.g. k_split_run) + - some general model math computations (e.g. great_circle_dist), that will eventually be put into gt4py with a future refactor + - `grid.py`: + - A Grid class definition that provides information about the grid layout, current tile information, access to grid variables used globally, and convenience methods related to tile indexing and commonly used origins and domains + - A grid is defined for each MPI rank (minimum 6 ranks, 1 for each tile face of the cubed sphere grid representing the whole Earth) + - Also provides functionality for generating a Quantity object used for halo updates and other utilities + - `corners`: port of corner calculations, initially direct Python calculations, being replaced with GT4py gtscript functions as the GT4py regions feature is implemented + - `mpi.py`: a wrapper for importing mpi4py when available + - `global_constants.py`: constants for use throughout the model + - `typing.py`: Clean names for common types we use in the model. This is new and hasn't been adopted throughout the model yet, but will eventually be our standard. A shorthand 'sd' has been used in the initial version. + +The `tests/` directory currently includes a framework for translating fields serialized (using Serialbox from GridTools) from a Fortran run into gt4py storages that can be inputs to fv3core unit computations, and compares the results of the ported code to serialized data following a unit computation. + +The `docker/` directory provides Dockerfiles for building a repeatable environment in which to run the core. + +The `external/` directory is for submoduled repos that provide essential functionality. + +The build system uses Makefiles following the convention of other repos within VulcanClimateModeling. + +## Model Interface + +The top level functions fv_dynamics and fv_sugridz can currently only be run in parallel using MPI with a minimum of 6 ranks (there are a few other units that also require this, e.g. whenever there is a halo update involved in a unit). These are the interface to the rest of the model and currently have different conventions than the rest of the model. + - A 'state' object (currently a SimpleNamespace) stores pointers to the allocated data fields + - Most functions within dyn_core can be run sequentially per rank + - Currently a list of ArgSpecs must decorate an interface function, where each ArgSpec provides useful information about the argument, e.g.: `@state_inputs(ArgSpec("qvapor", "specific_humidity", "kg/kg", intent="inout"))` + - The format is (fortran_name, long_name, units, intent) + - We currently provide a duplicate of most of the metadata in the specification of the unit test, but that may be removed eventually. + - Then the function itself, e.g. fv_dynamics, has arguments of 'state', 'comm' (the communicator) and all of the scalar parameters being provided (see the sketch below).
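+The following self-contained sketch illustrates that decorator pattern; `ArgSpec` and `state_inputs` here are simplified stand-ins written for illustration, not the actual fv3core implementations, and `fv_dynamics_like` with its scalar arguments is a hypothetical interface function using the (fortran_name, long_name, units, intent) format described above:
+
+```python
+from dataclasses import dataclass
+from types import SimpleNamespace
+
+
+@dataclass
+class ArgSpec:
+    # (fortran_name, long_name, units, intent), as in the bullet list above
+    arg_name: str
+    standard_name: str
+    units: str
+    intent: str
+
+
+def state_inputs(*arg_specs):
+    # Mock decorator: in fv3core this wraps/validates the state fields;
+    # here it only attaches the specs so the example runs standalone.
+    def decorator(func):
+        func.arg_specs = arg_specs
+        return func
+    return decorator
+
+
+@state_inputs(
+    ArgSpec("qvapor", "specific_humidity", "kg/kg", intent="inout"),
+)
+def fv_dynamics_like(state, comm, consv_te=0.0, do_adiabatic_init=False):
+    # Interface functions take the state, the communicator, and scalar parameters.
+    print([spec.arg_name for spec in fv_dynamics_like.arg_specs], consv_te)
+
+
+fv_dynamics_like(SimpleNamespace(qvapor=None), comm=None)
+```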
+ +### Porting conventions + +Generation of regression data occurs in the fv3gfs-fortran repo (https://github.com/VulcanClimateModeling/fv3gfs-fortran) with serialization statements and a build procedure defined in `tests/serialized_test_data_generation`. The version of data this repo currently tests against is defined in `FORTRAN_SERIALIZED_DATA_VERSION` in this repo's `docker/Makefile.image_names`. Fields serialized are defined in Fortran code with serialization comment statements such as: + +``` + !$ser savepoint C_SW-In + !$ser data delpcd=delpc delpd=delp ptcd=ptc +``` + +where the name being assigned is the name fv3core uses to identify the variable in the test code. When this name is not equal to the name of the variable, this was usually done to avoid conflicts with other parts of the code where the same name is used to reference a differently sized field. + +The majority of the logic for translating from data serialized from Fortran to something that can be used by Python, and the comparison of the results, is encompassed by the main Translate class in the tests/savepoint/translate/translate.py file. Any units not involving a halo update can be run using this framework, while those that need to be run in parallel can look to the ParallelTranslate class as the parent class in tests/savepoint/translate/parallel_translate.py. These parent classes provide generally useful operations for translating serialized data between Fortran and Python specifications, and for applying regression tests. + +A new unit test can be defined as a new child class of one of these, with a naming convention of `Translate` where `Savepoint Name` is the name used in the serialization statements in the Fortran code, without the `-In` and `-Out` part of the name. A translate class can usually just minimally specify the input and output fields. Then, in cases where the parent compute function is insufficient to handle the complexity of either the data translation or the compute function, the appropriate methods can be overridden. + +For Translate objects: + - The init function establishes the assumed translation setup for the class, which can be dynamically overridden as needed. + - The parent compute function: + - Makes gt4py storages of the max shape (grid.npx+1, grid.npy+1, grid.npz+1), aligning the data based on the start indices specified. (gt4py requires data fields to have the same shape, so in this model we have buffer points so all calculations can be done easily without worrying about shape matching.) + - runs the compute function (defined in self.compute_func) on the input data storages + - slices the computed Python fields to be compared to Fortran regression data + - The unit test then uses a modified relative error metric to determine whether the unit passes + - The init method for a Translate class: + - The input (self.in_vars["data_vars"]) and output (self.out_vars) variables are specified in dictionaries, where the keys are the name of the variable used in the model and the values are dictionaries specifying metadata for translation of serialized data to gt4py storages. The metadata that can be specified to override defaults are: + - Indices to line up data arrays into gt4py storages (which all get created as the max possible size needed by all operations, for simplicity): "istart", "iend", "jstart", "jend", "kstart", "kend". These should be set using the 'grid' object available to the Translate object, using equivalent index names as in the declaration of variables in the Fortran code, e.g.
`real:: cx(bd%is:bd%ie+1,bd%jsd:bd%jed)` means we should assign: + +```python + self.in_vars["data_vars"]["cx"] = {"istart": self.is\_, "iend": self.ie + 1, "jstart": self.jsd, "jend": self.jed,} +``` + - There is only a limited set of Fortran shapes declared, so abstractions defined in the grid can also be used, e.g.: `self.out_vars["cx"] = self.grid.x3d_compute_domain_y_dict()`. Note that the variables, e.g. `grid.is\_` and `grid.ie`, specify the 'compute' domain in the x direction of the current tile, equivalent to `bd%is` and `bd%ie` in the Fortran model EXCEPT that the Python variables are local to the current MPI rank (a subset of the tile face), while the Fortran values are global to the tile face. This is because these indices are used to slice into fields, which in Python is 0-based, and in Fortran is based on however the variables are declared. But, for the purposes of aligning data for computations and comparisons, we can match them in this framework. Shapes need to be defined in a dictionary per variable including `"istart"`, `"iend"`, `"jstart"`, `"jend"`, `"kstart"`, `"kend"` that represent the shape of that variable as defined in the Fortran code. The default shape assumed if a variable is specified with an empty dictionary is `isd:ied, jsd:jed, 0:npz - 1` inclusive, and variables that aren't that shape in the Fortran code need to have the 'start' indices specified for the in_vars dictionary, and 'start' and 'end' for the out_vars. + - `"serialname"` can be used to specify the name used in the Fortran code declaration if we'd like the model to use a different name + - `"kaxis"`: which dimension is the vertical direction. For most variables this is '2' and does not need to be specified. For Fortran variables that assign the vertical dimension to a different axis, this can be set to ensure we end up with 3d storages that have the vertical dimension where it is expected by GT4py. + - `"dummy_axes"`: If set, this will set up the storage to have singleton dimensions in the axes defined. This is to enable testing stencils where the full 3d data has not been collected and we want to run stencil tests on the data for a particular slice. + - `"names_4d"`: If a 4d variable is being serialized, this can be set to specify the names of each 3d field. By default this is the list of tracers. + - input variables that are scalars should be added to `self.in_vars["parameters"]` + - `self.compute_func` specifies the model function that should be run by the compute method in the translate class + - `self.max_error` overrides the parent class's relative error threshold. This should only be changed when the reasons for non-bit reproducibility are understood. + - `self.max_shape` sets the size of the gt4py storage created for testing + - `self.ignore_near_zero_errors[] = True`: This is an option to let some fields pass with higher relative error if the absolute error is very small + - `self.skip_test`: This is an option to skip the test case, to be used in the override file for temporary deactivation of tests. + +For `ParallelTranslate` objects: + - Inputs and outputs are defined at the class level, and these include metadata such as the "name" (e.g.
understandable name for the symbol), dimensions, units and n_halo (number of halo lines) + - Both `compute_sequential` and `compute_parallel` methods may be defined, where a mock communicator is used in the `compute_sequential` case + - The parent assumes a state object for tracking fields, and methods exist for translating from inputs to a state object and extracting the output variables from the state. It is assumed that Quantity objects are needed in the model method in order to do halo updates. + - `ParallelTranslate2Py` is a slight variation of this used for many of the parallel units that do not yet utilize a state object and relies on the specification of the same index metadata as the Translate classes + - `ParallelTranslateBaseSlicing` makes use of the state but relies on the Translate object of self._base, a Translate class object, to align the data before making quantities, computing and comparing. + +### Debugging Tests + +Pytest can be configured to give you a pdb session when a test fails. To route this properly through docker, you can run: + +```bash +TEST_ARGS="-v -s --pdb" RUN_FLAGS="--rm -it" make tests +``` + +This can be done with any pytest target, such as `make savepoint_tests` and `make savepoint_tests_mpi`. + +### GEOS API + +The `GeosDycoreWrapper` class provides an API to run the dynamical core in a Python component of a GEOS model run. A `GeosDycoreWrapper` object is initialized with a namelist, communicator, and backend, which creates the communicators, partitioners, dycore state, and dycore object required to run the Pace dycore. A wrapper object takes numpy arrays of `u, v, w, delz, pt, delp, q, ps, pe, pk, peln, pkz, phis, q_con, omga, ua, va, uc, vc, mfxd, mfyd, cxd, cyd,` and `diss_estd` and returns a dictionary containing numpy arrays of those same variables. Wrapper objects contain a `timer` attribute that tracks the amount of time spent moving input data to the dycore state, running the dynamical core, and retrieving the data from the state. diff --git a/README.md b/README.md index a6192aac..a961837b 100644 --- a/README.md +++ b/README.md @@ -1,459 +1,51 @@ -> DISCLAIMER: Work in progress - -# FV3core - -FV3core is a Python version, using GridTools GT4Py with CPU and GPU backend options, of the FV3 dynamical core (fv3gfs-fortran repo). -The code here includes regression test data of computation units coming from serialized output from the Fortran model generated using the `GridTools/serialbox` framework. - -As of January 10, 2021 this documentation is outdated in that it was written when we had fv3core as its own single repository. Some functionality, such as linting, has been moved to the top level but may still be described in this document as occuring inside the fv3core folder. - -**WARNING** This repo is under active development and relies on code and data that is not publicly available at this point. - -## QuickStart - -1. Ensure you have docker installed and available for building and running and has access to the VCM cloud - -Be sure to complete any required post-installation instructions (e.g. [for linux](https://docs.docker.com/engine/install/linux-postinstall/)). Also [authorize Docker to pull from gcr](https://cloud.google.com/container-registry/docs/advanced-authentication). Your user will need to have read access to the `us.gcr.io/vcm-ml` repository. - -2.
You can build the image, download the data, and run the tests using: - -```shell -$ make tests savepoint_tests savepoint_tests_mpi -``` - -If you want to develop code, you should also install the linting requirements and git hooks locally - -```shell -$ pip install -c constraints.txt -r requirements/requirements_lint.txt -$ pre-commit install - -## Getting started, in more detail -If you want to build the main fv3core docker image, run - -```shell -$ make build -``` - -If you want to download test data run - -```shell -$ make get_test_data -``` - -And the c12_6ranks_standard data will download into the `test_data` directory. - -If you do not have a GCP account, there is an option to download basic test data from a public FTP server and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead and this will avoid fetching from a GCP storage bucket. You will need a valid in stallation of the `lftp` command. - -MPI parallel tests (that run that way to exercise halo updates in the model) can also be run with: - -```shell -$ make savepoint_tests_mpi -``` - -The environment image that the fv3core container uses is prebuilt and lives in the GCR. The above commands will by default pull this image before building the fv3core image and running the tests. -To build the environment from scratch (including GT4py) before running tests, either run - -``` -make build_environment -``` - -or - -```shell -$ PULL=False make savepoint_tests -``` - -which will execute the target `build_environment` for you before running the tests. - -There are `push_environment` and `rebuild_environment` targets, but these should normally not be done manually. Updating the install image should only be done by Jenkins after the tests pass using a new environment. - -### Test data options - -If you want to run different test data, discover the possible options with -```shell -$ make list_test_data_options -``` -This will list the storage buckets in the cloud. Then to run one of them, set EXPERIMENT to the folder name of the data you'd like to use: - -e.g. -```shell -$EXPERIMENT=c48_6ranks_standard make tests -``` - -If you choose an experiment with a different number of ranks than 6, also set `NUM_RANKS=` - -## Testing interactively outside the container - -After `make savepoint_tests` has been run at least once (or you have data in test_data and the docker image fv3core exists because `make build` has been run), you can iterate on code changes using - -```shell -$ DEV=y make savepoint_tests -``` -or for the parallel or non-savepoint tests: - -```shell -$ DEV=y make tests savepoint_tests_mpi -``` -These will mount your current code into the fv3core container and run it rather than the code that was built when `make build` ran. - -## Running tests inside a container - -If you to prefer to work interactively inside the fv3core container, get the test data and build the docker image (see above if you do not have a GCP account and want to get test data): -```shell -$ make get_test_data -``` - -```shell -$ make build -``` -Testing can be run with this data from `/port_dev` inside the container: - -```shell -$ make dev -``` - -Then in the container: - -```shell -$ pytest -v -s --data_path=/test_data/ /port_dev/tests --which_modules= -``` -The 'stencil name' can be determined from the associated Translate class. e.g. 
TranslateXPPM is a test class that translate data serialized from a run of the fortran model, and 'XPPM' is the name you can use with --which_modules. - - - - -### Test options - -All of the make endpoints involved running tests can be prefixed with the `TEST_ARGS` environment variable to set test options or pytest CLI args (see below) when running inside the container. - -* `--which_modules ` - comma separated list of which modules to test (defaults to running all of them). - -* `--print_failures` - if your test fails, it will only report the first datapoint. If you want all the nonmatching regression data to print out (so you can see if there are patterns, e.g. just incorrect for the first 'i' or whatever'), this will print out for every failing test all the non-matching data. - -* `--failure_stride` - when printing failures, print every n failures only. - -* `--data_path` - path to where you have the `Generator*.dat` and `*.json` serialization regression data. Defaults to current directory. - -* `--backend` - which backend to use for the computation. Options: `[numpy, gt:cpu_ifirst, gt:cpu_first, gt:gpu, cuda]`. Defaults to `numpy`. -* `--python_regression` - Run the tests that have Python based regression data. Only applies to running parallel tests (savepoint_tests_mpi) -Pytest provides a lot of options, which you can see by `pytest --help`. Here are some -common options for our tests, which you can add to `TEST_ARGS`: - -* `-r` - is used to report test types other than failure. It can be provided `s` for skipped (e.g. tests which were not run because earlier tests of the same stencil failed), `x` for xfail or "expected to fail" tests (like tests with no translate class), or `p` for pass. For example, to report skipped and xfail tests you would use `-rsx`. - -* `--disable-warnings` - will stop all warnings from being printed at the end of the tests, for example warnings that translate classes are not yet implemented. - -* `-v` - will increase test verbosity, while `-q` will decrease it. - -* `-s` - will let stdout print directly to console instead of capturing the output and printing it when a test fails only. Note that logger lines will always be printed both during (by setting log_cli in our pytest.ini file) and after tests. - -* `-m` - will let you run only certain groups of tests. For example, `-m=parallel` will run only parallel stencils, while `-m=sequential` will run only stencils that operate on one rank at a time. - -* `--threshold_overrides_file` - will read a yaml file with error thresholds specified for specific backend and platform (docker or metal) configurations, overriding the max_error thresholds defined in the Translate classes. Format of the yaml file is described [here](tests/savepoint/translate/overrides/README.md). - -* `--dperiodic` - run tests on a doubly-periodic domain. Will look for only one tile's worth of test data and parallel tests will be run with a TileCommunicator instead of a CubedSphereCommunicator. - -**NOTE:** FV3 is current assumed to be by default in a "development mode", where stencils are checked each time they execute for code changes (which can trigger regeneration). This process is somewhat expensive, so there is an option to put FV3 in a performance mode by telling it that stencils should not automatically be rebuilt: - -```shell -$ export FV3_STENCIL_REBUILD_FLAG=False -``` - -## Porting a new stencil - -1. Find the location in the fv3gfs-fortran repo code where the save-point is to be added, e.g. using - -```shell -$ git grep -``` - -2. 
Create a `translate` class from the serialized save-point data to a call to the stencil or function that calls the relevant stencil(s). - -These are usually named `tests/savepoint/translate/translate_` - -Import this class in the `tests/savepoint/translate/__init__.py` file - -3. Write a Python function wrapper that the translate function (created above) calls. - -By convention, we name these `fv3core/stencils/.py` - -4. Run the test, either with one name or a comma-separated list - -```shell -$ make dev_tests TEST_ARGS="-โ€“which_modules=" -``` - -**Please also review the [Porting conventions](#porting-conventions) section for additional explanation** -## Installation - -### Docker Image - -To build the `us.gcr.io/vcm-ml/fv3core` image with required dependencies for running the Python code, run - -```shell -$ make build -``` - -Add `PULL=False` to build from scratch without running `docker pull`: - -```shell -PULL=False make build -``` - -## Relevant repositories - -- https://github.com/GridTools/serialbox - - Serialbox generates serialized data when the Fortran model runs and has bindings to manage data from Python - -- https://github.com/VulcanClimateModeling/fv3gfs-fortran - - This is the existing Fortran model decorated with serialization statements from which the test data is generated - -- https://github.com/GridTools/gt4py - - Python package for the DSL language - -- https://github.com/VulcanClimateModeling/util - Python specific model functionality, such as halo updates. - -- https://github.com/VulcanClimateModeling/fv3gfs-wrapper - A Python based wrapper for running the Fortran version of the FV3GFS model. - -Some of these are submodules. -While tests can work without these, it may be necessary for development to have these as well. -To add these to the local repository, run - -```shell -$ git submodule update --init -``` - -The submodules include: - -- `external/util` - git@github.com:VulcanClimateModeling/util.git -- `external/daint_venv` - git@github.com:VulcanClimateModeling/daint_venv.git - -## Dockerfiles and building - -There are two main docker files: - -1. `docker/dependencies.Dockerfile` - defines dependency images such as for mpi, serialbox, and GT4py - -2. `docker/Dockerfile` - uses the dependencies to define the final fv3core images. - -The dependencies are separated out into their own images to expedite rebuilding the docker image without having to rebuild dependencies, especially on CI. - -For the commands below using `make -C docker`, you can alternatively run `make` from within the `docker` directory. - -These dependencies can be updated, pushed, and pulled with `make -C docker build_deps`, `make -C docker push_deps`, and `make -C docker pull_deps`. The tag of the dependencies is based on the tag of the current build in the Makefile, which we will expand on below. - -Building from scratch requires both a deps and build command, such as `make -C docker pull_deps fv3core_image`. - -If any example fails for "pulled dependencies", it means the dependencies have never been built. 
You can -build them and push them to GCR with: - -```shell -$ make -C docker build_deps push_deps -``` - -### Building examples - -fv3core image with pulled dependencies: - -```shell -$ make -C docker pull_deps fv3core_image -``` - -CUDA-enabled fv3core image with pulled dependencies: -``` -$ CUDA=y make -C docker pull_deps fv3core_image -``` - -fv3core image with locally-built dependencies: -```shell -$ make -C docker build_deps fv3core_image -``` - -### Updating Serialbox - -If you need to install an updated version of Serialbox, you must first install cmake into the development environment. To install an updated version of Serialbox from within the container run - -```shell -$ wget https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3.tar.gz && \ - tar xzf cmake-3.17.3.tar.gz && \ - cd cmake-3.17.3 && \ - ./bootstrap && make -j4 && make install -$ git clone -b v2.6.1 --depth 1 https://github.com/GridTools/serialbox.git /tmp/serialbox -$ cd /tmp/serialbox -$ cmake -B build -S /tmp/serialbox -DSERIALBOX_USE_NETCDF=ON -DSERIALBOX_TESTING=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/serialbox -$ cmake --build build/ -j $(nproc) --target install -$ cd - -$ rm -rf build /tmp/serialbox -``` - -## Pinned dependencies - -Dependencies are pinned using `constraints.txt`. This is auto-generated by pip-compile from the `pip-tools` package, which reads `requirements.txt` and `requirements/requirements_lint.txt`, determines the latest versions of all dependencies (including recursive dependencies) compatible those files, and writes pinned versions for all dependencies. This can be updated using: - -```shell -$ make constraints.txt -``` - -This file is committed to the repository, and gives more reproducible tests if an old commit of the repository is checked out in the future. The constraints are followed when creating the `fv3core` docker images. To ensure consistency this should ideally be run from inside a docker development environment, but you can also run it on your local system with an appropriate Python 3 environment. - -## Development - -To develop fv3core, you need to install the linting requirements in `requirements/requirements_lint.txt`. To install the pinned versions, use: - -```shell -$ pip install -c constraints.txt -r requirements/requirements_lint.txt -``` - -This adds `pre-commit`, which we use to lint and enforce style on the code. The first time you install `pre-commit`, install its git hooks using: - -```shell -$ pre-commit install -pre-commit installed at .git/hooks/pre-commit -``` - -As a convenience, the `lint` target of the top-level makefile executes `pre-commit run --all-files`. -Linting, which formats files and checks for some style conventions, is required, as the same checks are the first step in the continuous integration testing that happens when creating a pull request. -Linting locally saves time and literal energy, since CI tests do not have to be launched so many times! - - Please see the 'Development Guidelines' below for more information on the structure of the code to align your new code with the current conventions, as well as the CONTRIBUTING.md document for style guidelines. - -## GT4Py version - -FV3Core does not actually use the [GridTools/gt4py](https://github.com/gridtools/gt4py) main, it instead uses a Vulcan Climate Modeling development branch. -This is publically available version at [VCM/gt4py](https://github.com/vulcanclimatemodeling/gt4py). 
- -Situation: There is a new stable feature in a gt4py PR, but it is not yet merged into the GridTools/gt4py main branch. -[branches.cfg](https://github.com/VulcanClimateModeling/gt4py/blob/develop/branches.cfg) lists these features. -Steps: - -1. Add any new branches to `branches.cfg` -2. Rebuild the develop branch, either: - a. `make_develop gt4py-dev path/to/branches.cfg` (you may have to resolve conflicts...) - b. Adding new commits on top of the existing develop branch (e.g. merge or cherry-pick) -3. Force push to the develop branch: `git push -f upstream develop` - -The last step will launch Jenkins tests. If these pass: - -1. Create a git tag: `git tag v-$(git rev-parse --short HEAD)` -2. Push the tag: `git push upstream --tags` -3. Make a PR to [VCM/gt4py](https://github.com/vulcanclimatemodeling/fv3core) that updates the version in `docker/Makefile` to the new tag. - -## License -FV3Core is provided under the terms of the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) license. - -# Development guidelines - -## File structure / conventions -The main functionality of the FV3 dynamical core, which has been ported from the Fortran version in the fv3gfs-fortran repo, is defined using GT4py stencils and python 'compute' functions in fv3core/stencils. The core is comprised of units of calculations defined for regression testing. These were initially generally separated into distinct files in fv3core/stencils with corresponding files in tests/savepoint/translate/translate_.py defining the translation of variables from Fortran to Python. Exceptions exist in cases where topical and logical grouping allowed for code reuse. As refactors optimize the model, these units may be merged to occupy the same files and even methods/stencils, but the units should still be tested separately, unless determined to be redundant. - -The core has most of its calculations happening in GT4py stencils, but there are still several instances of operations happening in Python directly, which will need to be replaced with GT4py code for optimal performance. - -The namelist and grid are global variables defined in fv3core/_config.py The namelist is 'flattened' so that the grouping name of the option is not required to access the data (we may want to change this). - -The grid variables are mostly 2d variables and are 'global' to the model thread per mpi rank. The grid object also contains domain and layout information relevant to the current rank being operated on. - -Utility functions in `fv3core/utils/` include: - - `gt4py_utils.py`: - - default gt4py and model settings - - methods for generating gt4py storages - - methods for using numpy and cupy arrays in python functions that have not been put into GT4py - - methods for handling complex patterns that did not immediately map to gt4py, and will mostly be removed with future refactors (e.g. k_split_run) - - some general model math computations (e.g. 
great_circle_dist), that will eventually be put into gt4py with a future refactor - - `grid.py`: - - A Grid class definition that provides information about the grid layout, current tile informationm access to grid variables used globally, and convenience methods related to tile indexing, origins and domains commonly used - - A grid is defined for each MPI rank (minimum 6 ranks, 1 for each tile face of the cubed sphere grid represnting the whole Earth) - - Also provides functionality for generating a Quantity object used for halo updates and other utilities - - `corners`: port of corner calculations, initially direct Python calculations, being replaced with GT4py gtscript functions as the GT4py regions feature is implemented - - `mpi.py`: a wrapper for importing mpi4py when available - - `global_constants.py`: constants for use throughout the model - - `typing.py`: Clean names for common types we use in the model. This is new and - hasn't been adopted throughout the model yet, but will eventually be our - standard. A shorthand 'sd' has been used in the intial version. - -The `tests/` directory currently includes a framework for translating fields serialized (using -Serialbox from GridTools) from a Fortran run into gt4py storages that can be inputs to -fv3core unit computations, and compares the results of the ported code to serialized -data following a unit computation. - -The `docker/` directory provides Dockerfiles for building a repeatable environment in which -to run the core - -The `external/` directory is for submoduled repos that provide essential functionality - -The build system uses Makefiles following the convention of other repos within VulcanClimateModeling. - -## Model Interface - -The top level functions fv_dynamics and fv_sugridz can currenty only be run in parallel using mpi with a minimum of 6 ranks (there are a few other units that also require this, e.g. whenever there is a halo update involved in a unit). These are the interface to the rest of the model and currently have different conventions than the rest of the model. - - A 'state' object (currently a SimpleNamespace) stores pointers to the allocated data fields - - Most functions within dyn_core can be run sequentially per rank - - Currently a list of ArgSpecs must decorate an interface function, where each ArgSpec provides useful information about the argument, e.g.: `@state_inputs( ArgSpec("qvapor", "specific_humidity", "kg/kg", intent="inout")` - - The format is (fortran_name, long_name, units, intent) - - We currently provide a duplicate of most of the metadata in the specification of the unit test, but that may be removed eventually. - - Then the function itself, e.g. fv_dynamics, has arguments of 'state', 'comm' (the communicator) and all of the scalar parameters being provided. - -### Porting conventions - -Generation of regression data occurs in the fv3gfs-fortran repo (https://github.com/VulcanClimateModeling/fv3gfs-fortran) with serialization statements and a build procedure defined in `tests/serialized_test_data_generation`. The version of data this repo currently tests against is defined in `FORTRAN_SERIALIZED_DATA_VERSION` in this repo's `docker/Makefile.image_names`. Fields serialized are defined in Fortran code with serialization comment statements such as: - -``` - !$ser savepoint C_SW-In - !$ser data delpcd=delpc delpd=delp ptcd=ptc -``` - -where the name being assigned is the name the fv3core uses to identify the variable in the test code. 
When this name is not equal to the name of the variable, this was usually done to avoid conflicts with other parts of the code where the same name is used to reference a differently sized field. - -The majority of the logic for translating from data serialized from Fortran to something that can be used by Python, and the comparison of the results, is encompassed by the main Translate class in the tests/savepoint/translate/translate.py file. Any units not involving a halo update can be run using this framework, while those that need to be run in parallel can look to the ParallelTranslate class as the parent class in tests/savepoint/translate/parallel_translate.py. These parent classes provide generally useful operations for translating serialized data between Fortran and Python specifications, and for applying regression tests. - -A new unit test can be defined as a new child class of one of these, with a naming convention of `Translate` where `Savepoint Name` is the name used in the serialization statements in the Fortran code, without the `-In` and `-Out` part of the name. A translate class can usually be minimally specify the input and output fields. Then, in cases where the parent compute function is insuffient to handle the complexity of either the data translation or the compute function, the appropriate methods can be overridden. - -For Translate objects - - The init function establishes the assumed translation setup for the class, which can be dynamically overridden as needed. - - the parent compute function does: - - Makes gt4py storages of the max shape (grid.npx+1, grid.npy+1, grid.npz+1) aligning the data based on the start indices specified. (gt4py requires data fields have the same shape, so in this model we have buffer points so all calculations can be done easily without worrying about shape matching). - - runs the compute function (defined in self.compute_func) on the input data storages - - slices the computed Python fields to be compared to fortran regression data - - The unit test then uses a modified relative error metric to determine whether the unit passes - - The init method for a Translate class: - - The input (self.in_vars["data_vars"]) and output(self.out_vars) variables are specified in dictionaries, where the keys are the name of the variable used in the model and the values are dictionaries specifying metadata for translation of serialized data to gt4py storages. The metadata that can be specied to override defaults are: - - Indices to line up data arrays into gt4py storages (which all get created as the max possible size needed by all operations, for simplicity): "istart", "iend", "jstart", "jend", "kstart", "kend". These should be set using the 'grid' object available to the Translate object, using equivalent index names as in the declaration of variables in the Fortran code, e.g. real:: cx(bd%is:bd%ie+1,bd%jsd:bd%jed ) means we should assign. Example: - -```python - self.in_vars["data_vars"]["cx"] = {"istart": self.is\_, "iend": self.ie + 1, - "jstart": self.jsd, "jend": self.jed,} -``` - - There is only a limited set of Fortran shapes declared, so abstractions defined in the grid can also be used, - e.g.: `self.out_vars["cx"] = self.grid.x3d_compute_domain_y_dict()`. Note that the variables, e.g. 
`grid.is\_` and `grid.ie` specify the 'compute' domain in the x direction of the current tile, equivalent to `bd%is` and `bd%ie` in the Fortran model EXCEPT that the Python variables are local to the current MPI rank (a subset of the tile face), while the Fortran values are global to the tile face. This is because these indices are used to slice into fields, which in Python is 0-based, and in Fortran is based on however the variables are declared. But, for the purposes of aligning data for computations and comparisons, we can match them in this framework. Shapes need to be defined in a dictionary per variable including `"istart"`, `"iend"`, `"jstart"`, `"jend"`, `"kstart"`, `"kend"` that represent the shape of that variable as defined in the Fortran code. The default shape assumed if a variable is specified with an empty dictionary is `isd:ied, jsd:jed, 0:npz - 1` inclusive, and variables that aren't that shape in the Fortran code need to have the 'start' indices specified for the in_vars dictionary , and 'start' and 'end' for the out_vars. - - `"serialname"` can be used to specify a name used in the Fortran code declaration if we'd like the model to use a different name - - `"kaxis"`: which dimension is the vertical direction. For most variables this is '2' and does not need to be specified. For Fortran variables that assign the vertical dimension to a different axis, this can be set to ensure we end up with 3d storages that have the vertical dimension where it is expected by GT4py. - - `"dummy_axes"`: If set this will set of the storage to have singleton dimensions in the axes defined. This is to enable testing stencils where the full 3d data has not been collected and we want to run stencil tests on the data for a particular slice. - - `"names_4d"`: If a 4d variable is being serialized, this can be set to specify the names of each 3d field. By default this is the list of tracers. - - input variables that are scalars should be added to `self.in_vars["parameters"]` - - `self.compute_func` is the name of the model function that should be run by the compute method in the translate class - - `self.max_error` overrides the parent classes relative error threshold. This should only be changed when the reasons for non-bit reproducibility are understood. - - `self.max_shape` sets the size of the gt4py storage created for testing - - `self.ignore_near_zero_errors[] = True`: This is an option to let some fields pass with higher relative error if the absolute error is very small - - `self.skip_test`: This is an option to jump over the test case, to be used in the override file for temporary deactivation of tests. - -For `ParallelTranslate` objects: - - Inputs and outputs are defined at the class level, and these include metadata such as the "name" (e.g. understandable name for the symbol), dimensions, units and n_halo(numb er of halo lines) - - Both `compute_sequential` and `compute_parallel` methods may be defined, where a mock communicator is used in the `compute_sequential` case - - The parent assumes a state object for tracking fields and methods exist for translating from inputs to a state object and extracting the output variables from the state. It is assumed that Quantity objects are needed in the model method in order to do halo updates. 
- - `ParallelTranslate2Py` is a slight variation of this used for many of the parallel units that do not yet utilize a state object and relies on the specification of the same index metadata of the Translate classes - - `ParallelTranslateBaseSlicing` makes use of the state but relies on the Translate object of self._base, a Translate class object, to align the data before making quantities, computing and comparing. - -### Debugging Tests - -Pytest can be configured to give you a pdb session when a test fails. To route this properly through docker, you can run: - -```bash -TEST_ARGS="-v -s --pdb" RUN_FLAGS="--rm -it" make tests -``` - -This can be done with any pytest target, such as `make savepoint_tests` and `make savepoint_tests_mpi`. - -### GEOS API - -The `GeosDycoreWrapper` class provides an API to run the dynamical core in a Python component of a GEOS model run. A `GeosDycoreWrapper` object is initialized with a namelist, communicator, and backend, which creates the communicators, partitioners, dycore state, and dycore object required to run the Pace dycore. A wrapper object takes numpy arrays of `u, v, w, delz, pt, delp, q, ps, pe, pk, peln, pkz, phis, q_con, omga, ua, va, uc, vc, mfxd, mfyd, cxd, cyd,` and `diss_estd` and returns a dictionary containing numpy arrays of those same variables. Wrapper objects contain a `timer` attrubite that tracks the amount of time moving input data to the dycore state, running the dynamical core, and retrieving the data from the state. +# TEMPORARY BRANCH: Up-skilling to GEOS v11.4.2 + +This branch exists solely for up-skilling pyFV3 to be able to run GEOS in its v11.4.2 FP configuration. +The need for a separate branch from `develop` lies in the following differences: + +- GEOS runs a 32-bit floating-point precision version (with appropriate 64-bit buffers for mass conservation); see the sketch after this list for why the 64-bit buffers matter. This means the translate tests require a new set of data _and_ will not pass on old 8.1.3 Pace data. +- GEOS requires options that are deemed "legacy" and that we may want to replace rather than port. +- Project requirements demand quick iterative development, while `pyFV3` demands coordination between all stakeholders.
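+
+As a minimal, self-contained sketch of the first point above (plain NumPy, not pyFV3 code; the array size and values are made up): accumulating a 32-bit field directly into a 32-bit sum drifts, while a 64-bit buffer recovers the budget, which is why the mass-conservation sums are kept in 64-bit buffers.
+
+```python
+import numpy as np
+
+# Hypothetical 32-bit column of layer masses: one million values of 0.1.
+delp = np.full(1_000_000, 0.1, dtype=np.float32)
+
+total_f32 = np.float32(0.0)   # accumulate in 32-bit, as a naive port would
+total_f64 = np.float64(0.0)   # accumulate in a 64-bit buffer
+for value in delp:
+    total_f32 += value               # rounds at every addition once the sum is large
+    total_f64 += np.float64(value)   # widened before accumulation
+
+print(total_f32)  # drifts visibly from the stored total
+print(total_f64)  # ~100000.0015, i.e. the exact sum of the stored 32-bit values
+```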
+ +The aim is to validate and benchmark GEOS v11.4.2 with this dynamical core. Once done, we will aim to move _as much code as possible_ back into develop. +The methodology is as follows: + +- Merge directly into `develop` any changes that do not demand a new set of data +- Keep track of the feature branches (below) that can't be merged into `develop`, for future PRs +- Keep track of GEOS vs SHiELD differences for future discussions + +## Feature branches + +Legend: + +- ⚙️ _GEOS - WIP_: Ongoing work - can be merged temporarily +- 🔶 _GEOS - Merged_: Considered done - merged in the GEOS v11.4.2 branch but NOT in `develop` +- ✅ _Develop - Merged_: Work done as part of the GEOS up-skilling, merged in `develop` AND the GEOS v11.4.2 branch. + +Branches: + +- ✅ `fix/F32/UpdateDzC`@Florian: Fix for fluxes gradient +- ✅ `fix/F32/DivergenceDamping`@Florian: Fix for 32-bit scalars in DivergenceDamping +- ✅ `fix/F32/UpdateDzD`@Florian: Fix for fluxes gradient & python computation +- ✅ `fix/F32/nh_p_grad` @ Florian: Fix for 32-bit NonHydrostaticPressureGradient +- 🔶 `fix/RayleighDamping_mixed_precision`@Florian: fix the Ray_Fast test +- 🔶 `GEOS_update/yppm_xppm`@Florian: fix the YPPM/XPPM with `hord = -6` +- 🔶 `fix/DelnFlux_f32_support`@Florian: Fix for f32 support for DelnFlux (partial pass) +- 🔶 `fix/GEOSv11_4_2/HyperDiffusionDamping`@Florian: fix the Hyperdiffusion Damping by restoring factor to be 64-bit float +- ⚙️ `fix/GEOS/D_SW`@Florian: Fix D_SW heat dissipation, column calculation and new `dpx` accumulation (partial pass) +- ⚙️ `fix/GEOSv11_4_2/A2B_Ord4`@Florian: Fix for 32-bit A2B_Ord4 +- ⚙️ `fix/GEOSv11_4_2/RiemanSolver`@Florian: Fix for 32-bit RiemanSolver +- ⚙️ `fix/GEOSv11_4_2/C_SW`@Florian: Fix for C_SW for 32-bit +- ⚙️ `fix/GEOSv11_4_2/Dyncore`@Florian: Fix for Acoustics and DycoreState for 32-bit and `dpx` calculation + - MERGE ORDER: after `fix/GEOS/D_SW` + - MERGE ORDER: after `fix/GEOSv11_4_2/HyperDiffusionDamping` +- ⚙️ `feature/tracer_rework_part1` @Florian: Allow for update of N Tracers +- ⚙️ `fix/GEOS/TracerAdvection` @Florian: Allow for non-update of mass fluxes and courant number, f32 fixes, correct computation of `cmax` and `nsplit`, overcomputation into the algorithm + - BASED ON `tracer_rework_part1` + - REQUIRES: `ndsl` with tracer rework +- ⚙️ `feature/fv_mapz/GEOS` @ Chris K: Remapping for GEOS + - REQUIRES: `ndsl` with tracer rework +- ⚙️ `fix/GEOSv11_4_2/Dynamics`@Florian: Fix for the f32 & GEOS version of dynamics + - REQUIRES: `ndsl` with tracer rework + - REQUIRES: `tracer_rework_part1`, `fix/GEOSv11_4_2/Dyncore`, `fix/GEOS/TracerAdvection` + - MERGE ORDER: after `fix/GEOSv11_4_2/HyperDiffusionDamping` diff --git a/examples/standalone/runfile/dynamics.py b/examples/standalone/runfile/dynamics.py index 88326001..103161d1 100755 --- a/examples/standalone/runfile/dynamics.py +++ b/examples/standalone/runfile/dynamics.py @@ -263,6 +263,7 @@ def setup_dycore( config=dycore_config, phis=state.phis, state=state, + exclude_tracers=[], timestep=timedelta(seconds=dycore_config.dt_atmos), ) return dycore, state, stencil_factory diff --git a/fv_mapz.F90.SER b/fv_mapz.F90.SER new file mode 100644 index 00000000..455c0f07 --- /dev/null +++ b/fv_mapz.F90.SER @@ -0,0 +1,4814 @@ +!*********************************************************************** +!* GNU Lesser General Public License +!* +!* This file is part of the FV3 dynamical core. +!* +!* The FV3 dynamical core is free software: you can redistribute it +!* and/or modify it under the terms of the +!* GNU Lesser General Public License as published by the +!* Free Software Foundation, either version 3 of the License, or +!* (at your option) any later version. +!* +!* The FV3 dynamical core is distributed in the hope that it will be +!* useful, but WITHOUT ANY WARRANTY; without even the implied warranty +!* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +!* See the GNU General Public License for more details. +!* +!* You should have received a copy of the GNU Lesser General Public +!* License along with the FV3 dynamical core. +!* If not, see <http://www.gnu.org/licenses/>.
+!*********************************************************************** + +!>@brief The module 'fv_mapz' contains the vertical mapping routines \cite lin2004vertically +!>@note April 12, 2012 -SJL: This revision may actually produce rounding level differences +!! due to the elimination of KS to compute pressure level for remapping. + +module fv_mapz_mod + +! Modules Included: +! +! +! +! +! +!
+! Module Name           Functions Included
+! -------------------   -----------------------------------------------------------------
+! constants_mod         radius, pi=>pi_8, rvgas, rdgas, grav, hlv, hlf, cp_air, cp_vapor
+! field_manager_mod     MODEL_ATMOS
+! fv_arrays_mod         fv_grid_type
+! fv_cmp_mod            qs_init, fv_sat_adj
+! fv_fill_mod           fillz
+! fv_grid_utils_mod     g_sum, ptop_min
+! fv_mp_mod             is_master
+! fv_timing_mod         timing_on, timing_off
+! fv_tracer2d_mod       tracer_2d, tracer_2d_1L, tracer_2d_nested
+! mpp_mod               NOTE, mpp_error, get_unit, mpp_root_pe, mpp_pe
+! mpp_domains_mod       mpp_update_domains, domain2d
+! tracer_manager_mod    get_tracer_index
+ !$ser verbatim USE m_serialize, ONLY: fs_is_serialization_on + use constants_mod, only: radius, pi=>pi_8, rvgas, rdgas, grav, hlv, hlf, hls, cp_air, cp_vapor + use tracer_manager_mod,only: get_tracer_index + use field_manager_mod, only: MODEL_ATMOS + use fv_grid_utils_mod, only: g_sum, ptop_min + use fv_fill_mod, only: fillz + use mpp_domains_mod, only: mpp_update_domains, domain2d, mpp_global_sum, BITWISE_EFP_SUM, BITWISE_EXACT_SUM + use mpp_mod, only: NOTE, mpp_error, get_unit, mpp_root_pe, mpp_pe + use fv_arrays_mod, only: fv_grid_type, fv_flags_type + use fv_timing_mod, only: timing_on, timing_off + use fv_mp_mod, only: is_master + use fv_cmp_mod, only: qs_init, fv_sat_adj + + implicit none + real, parameter:: consv_min= 0.001 !< below which no correction applies + real, parameter:: te_min= -1.e25 + real, parameter:: t_min= 184. !< below which applies stricter constraint + real, parameter:: r2=1./2., r0=0.0 + real, parameter:: r3 = 1./3., r23 = 2./3., r12 = 1./12. + real, parameter:: cv_vap = 3.*rvgas !< 1384.5 + real, parameter:: cv_air = cp_air - rdgas !< = rdgas * (7/2-1) = 2.5*rdgas=717.68 +! real, parameter:: c_ice = 2106. !< heat capacity of ice at 0.C + real, parameter:: c_ice = 1972. !< heat capacity of ice at -15.C + real, parameter:: c_liq = 4.1855e+3 !< GFS: heat capacity of water at 0C +! real, parameter:: c_liq = 4218. !< ECMWF-IFS + real, parameter:: cp_vap = cp_vapor !< 1846. + real, parameter:: tice = 273.16 + + logical, parameter :: w_limiter = .true. + real, parameter :: w_max = 90. + real, parameter :: w_min = -60. + + real(kind=8) :: E_Flux = 0. + private + + public compute_total_energy, Lagrangian_to_Eulerian, moist_cv, moist_cp, & + rst_remap, mappm, E_Flux, mapn_tracer, map1_q2 + +!---- version number ----- + character(len=128) :: version = '$Id$' + character(len=128) :: tagname = '$Name$' + +contains + +!>@brief The subroutine 'Lagrangian_to_Eulerian' remaps deformed Lagrangian layers back to the reference Eulerian coordinate. +!>@details It also includes the entry point for calling fast microphysical processes. This is typically calle on the k_split loop. 
+ subroutine Lagrangian_to_Eulerian(last_step, consv, ps, pe, delp, pkz, pk, & + mdt, pdt, km, is,ie,js,je, isd,ied,jsd,jed, & + nq, nwat, sphum, q_con, u, v, w, delz, pt, q, hs, r_vir, cp, & + akap, cappa, kord_mt, kord_wz, kord_tr, kord_tm, peln, te0_2d, & + ng, ua, va, omga, te, ws, fill, reproduce_sum, out_dt, dtdt, & + ptop, ak, bk, pfull, flagstruct, gridstruct, domain, do_sat_adj, & + hydrostatic, hybrid_z, do_omega, adiabatic, do_adiabatic_init, & + mfx, mfy, cx, cy, remap_option, gmao_remap) + logical, intent(in):: last_step + real, intent(in):: mdt !< remap time step + real, intent(in):: pdt !< phys time step + integer, intent(in):: km + integer, intent(in):: nq !< number of tracers (including h2o) + integer, intent(in):: nwat + integer, intent(in):: sphum !< index for water vapor (specific humidity) + integer, intent(in):: ng + integer, intent(in):: is,ie,isd,ied !< starting & ending X-Dir index + integer, intent(in):: js,je,jsd,jed !< starting & ending Y-Dir index + integer, intent(in):: kord_mt !< Mapping order for the vector winds + integer, intent(in):: kord_wz !< Mapping order/option for w + integer, intent(in):: kord_tr(nq) !< Mapping order for tracers + integer, intent(in):: kord_tm !< Mapping order for thermodynamics + + real, intent(in):: consv !< factor for TE conservation + real, intent(in):: r_vir + real, intent(in):: cp + real, intent(in):: akap + real, intent(in):: hs(isd:ied,jsd:jed) !< surface geopotential + real(kind=8), intent(inout):: te0_2d(is:ie,js:je) + real, intent(in):: ws(is:ie,js:je) + + logical, intent(in):: do_sat_adj + logical, intent(in):: fill !< fill negative tracers + logical, intent(in):: reproduce_sum + logical, intent(in):: do_omega, adiabatic, do_adiabatic_init + real, intent(in) :: ptop + real, intent(in) :: ak(km+1) + real, intent(in) :: bk(km+1) + real, intent(in):: pfull(km) + type(fv_grid_type), intent(IN), target :: gridstruct + type(fv_flags_type), intent(INOUT) :: flagstruct + type(domain2d), intent(INOUT) :: domain + + ! INPUT/OUTPUT + real, intent(inout):: pk(is:ie,js:je,km+1) !< pe to the kappa + real, intent(inout):: q(isd:ied,jsd:jed,km,*) + real, intent(inout):: delp(isd:ied,jsd:jed,km) !< pressure thickness + real, intent(inout):: pe(is-1:ie+1,km+1,js-1:je+1) !< pressure at layer edges + real, intent(inout):: ps(isd:ied,jsd:jed) !< surface pressure + + ! u-wind will be ghosted one latitude to the north upon exit + real, intent(inout):: u(isd:ied ,jsd:jed+1,km) !< u-wind (m/s) + real, intent(inout):: v(isd:ied+1,jsd:jed ,km) !< v-wind (m/s) + real, intent(inout):: w(isd: ,jsd: ,1:) !< vertical velocity (m/s) + real, intent(inout):: pt(isd:ied ,jsd:jed ,km) !< cp*virtual potential temperature + !< as input; output: temperature + real, intent(inout), dimension(isd:,jsd:,1:)::delz, q_con, cappa + logical, intent(in):: hydrostatic + logical, intent(in):: hybrid_z + logical, intent(in):: out_dt + + real, intent(inout):: ua(isd:ied,jsd:jed,km) !< u-wind (m/s) on physics grid + real, intent(inout):: va(isd:ied,jsd:jed,km) !< v-wind (m/s) on physics grid + real, intent(inout):: omga(isd:ied,jsd:jed,km) !< vertical press. velocity (pascal/sec) + real, intent(inout):: peln(is:ie,km+1,js:je) !< log(pe) + real, intent(inout):: dtdt(is:ie,js:je,km) + real, intent(out):: pkz(is:ie,js:je,km) !< layer-mean pk for converting t to pt + real, intent(out):: te(isd:ied,jsd:jed,km) + ! Mass fluxes + real, optional, intent(inout):: mfx(is:ie+1,js:je ,km) ! X-dir Mass Flux + real, optional, intent(inout):: mfy(is:ie ,js:je+1,km) ! Y-dir Mass Flux + ! 
Courant numbers + real, optional, intent(inout):: cx(is:ie+1, jsd:jed,km) + real, optional, intent(inout):: cy(isd:ied ,js:je+1,km) + + integer, intent(in):: remap_option, gmao_remap + + ! !DESCRIPTION: + ! + ! !REVISION HISTORY: + ! SJL 03.11.04: Initial version for partial remapping + ! + !----------------------------------------------------------------------- + real(kind=8), dimension(is:ie,js:je):: te_2d, zsum0, zsum1 + real, dimension(is:ie,js:je):: dpln + real, dimension(is:ie,km) :: q2, dp2, w2 + real, dimension(is:ie,km+1):: pe1, pe2, pk1, pk2, pn1, pn2, phis + real, dimension(is:ie+1,km+1):: pe0, pe3 + real, dimension(is:ie):: gz, cvm + real(kind=8):: tesum, zsum, dtmp + real :: rcp, rg, tmp, tpe, rrg, bkh, k1k, dlnp + logical:: fast_mp_consv + integer:: i,j,k + integer:: nt, liq_wat, ice_wat, rainwat, snowwat, cld_amt, graupel, iq, n, kmp, kp, k_next + logical:: remap_t, remap_pt, remap_te + + real, dimension(is:ie, js:je, km+1) :: pe1_3d, pe2_3d, pn1_3d, pn2_3d, pk2_3d, peln_3d + real, dimension(is:ie, js:je, km+1) :: phis_3d + real, dimension(is-1:ie+1,js-1:je+1,km+1) :: pe_3d + real, dimension(is:ie+1, js:je+1, km+1) :: pe0_3d, pe3_3d + + real, dimension(is:ie, js:je, km) :: w2_3d, dp2_3d + + real, dimension(is:ie, js:je) :: gz_2d, te_2d_f32, te0_2d_f32, zsum1_f32 + real, dimension(isd:ied,jsd:jed) :: rsin2, cosa_s, area_64_ + + logical :: serial_flag + + real :: grav_, consv_min_, cv_air_ + +!$ser verbatim real :: w_max_, w_min_ + + !$ser verbatim integer:: mode, abskord, iep1, iedp1, jedp1, js2d, o3mr, sgs_tke + !$ser verbatim real :: qmin + !$ser verbatim qmin = 0.0 + !$ser verbatim iep1=ie+1 + !$ser verbatim iedp1=ied+1 + !$ser verbatim jedp1=jed+1 + !$ser verbatim js2d=js + !$ser verbatim pe_3d = 0.0 + !$ser verbatim pe1_3d = 0.0 + !$ser verbatim pe2_3d = 0.0 + !$ser verbatim peln_3d = 0.0 + !$ser verbatim dp2_3d = 0.0 + !$ser verbatim pn1_3d = 0.0 + !$ser verbatim pn2_3d = 0.0 + !$ser verbatim pk2_3d = pk + !$ser verbatim gz_2d = 0.0 + !$ser verbatim phis_3d = 0.0 + !$ser verbatim consv_min_=consv_min + !$ser verbatim cv_air_=cv_air + + remap_t = .false. + remap_pt = .false. + remap_te = .false. + select case (remap_option) + case(0) + remap_t = .true. + case(1) + remap_pt = .true. + case(2) + remap_te = .true. + case default + print*, ' INVALID REMAP_OPTION ' + stop + end select + + select case (gmao_remap) + case(0) + ! use GFDL schemes + case(1) + ! GMAO linear remap + case(2) + ! GMAO quadratic remap + case(3) + ! GMAO cubic remap + case default + print*, ' INVALID GMAO_REMAP' + stop + end select + + if (is_master() .and. flagstruct%fv_debug) then + print*, '' + select case (remap_option) + case(0) + print*, ' REMAPPING T in logP' + case(1) + print*, ' REMAPPING PT in P' + case(2) + print*, ' REMAPPING TE in logP' + end select + + print*, '' + select case (gmao_remap) + case(0) + print*, ' Using GFDL schemes' + case(1) + print*, ' Using GMAO linear scheme' + case(2) + print*, ' Using GMAO quadratic scheme' + case(3) + print*, ' Using GMAO cubic scheme' + end select + + ! Total eergy conservation + print*, '' + print*, ' REMAPPING CONSV: ', consv + print*, ' REMAPPING CONSV_MIN: ', consv_min + print*, '' + endif + + k1k = rdgas/cv_air ! 
akap / (1.-akap) = rg/Cv=0.4 + rg = rdgas + rcp = 1./ cp + rrg = -rdgas/grav + +#ifdef MAPL_MODE + select case(nwat) + case(1) + liq_wat = -1 + ice_wat = -1 + rainwat = -1 + snowwat = -1 + graupel = -1 + cld_amt = -1 + case(3) + liq_wat = 2 + ice_wat = 3 + rainwat = -1 + snowwat = -1 + graupel = -1 + cld_amt = -1 + case(6:7) + liq_wat = 2 + ice_wat = 3 + rainwat = 4 + snowwat = 5 + graupel = 6 + cld_amt = 7 + end select +#else + liq_wat = get_tracer_index (MODEL_ATMOS, 'liq_wat') + ice_wat = get_tracer_index (MODEL_ATMOS, 'ice_wat') + rainwat = get_tracer_index (MODEL_ATMOS, 'rainwat') + snowwat = get_tracer_index (MODEL_ATMOS, 'snowwat') + graupel = get_tracer_index (MODEL_ATMOS, 'graupel') + cld_amt = get_tracer_index (MODEL_ATMOS, 'cld_amt') + !$ser verbatim o3mr = get_tracer_index (MODEL_ATMOS, 'o3mr') + !$ser verbatim sgs_tke = get_tracer_index (MODEL_ATMOS, 'sgs_tke') +#endif + + if ( do_sat_adj .and. nwat>=6 ) then + print*,'CODE NOT TESTED HERE 1' + fast_mp_consv = (.not.do_adiabatic_init) .and. consv>consv_min + do k=1,km + kmp = k + if ( pfull(k) > 10.E2 ) exit + enddo + call qs_init(kmp) + endif + +!$OMP parallel do default(none) shared(is,ie,js,je,km,pe,ptop,kord_tm,remap_t, & +!$OMP remap_pt,remap_te,mfy,mfx,cx,cy,hydrostatic, & +!$OMP pt,pk,rg,peln,q,nwat,liq_wat,rainwat,ice_wat,snowwat, & +#ifdef SERIALIZE +!$OMP ppser_savepoint, ppser_serializer, ppser_serializer_ref, ppser_zrperturb, cld_amt, mode,qmin, abskord,iep1, iedp1, jedp1, js2d, o3mr, sgs_tke, & +#endif +!$OMP graupel,q_con, sphum,cappa,r_vir,rcp,cp,k1k,delp, & +!$OMP delz,akap,pkz,te,u,v,ps, gridstruct, & +!$OMP ak,bk,nq,isd,ied,jsd,jed,kord_tr,fill, adiabatic, & +!$OMP hs,w,ws,kord_wz,rrg,kord_mt,consv,remap_option,gmao_remap,& +!$OMP pe_3d, pe1_3d, pe2_3d, pn1_3d, pn2_3d, pk2_3d, w2_3d, dp2_3d, & +!$OMP peln_3d, gz_2d, pe0_3d, pe3_3d, phis_3d, te_2d, te_2d_f32, te0_2d, te0_2d_f32) & +!$OMP private(gz,cvm,bkh,dp2, & +!$OMP pe0,pe1,pe2,pe3,pk1,pk2,pn1,pn2,phis,q2,w2,dpln,dlnp, w_max_, w_min_, rsin2, cosa_s, grav_, area_64_) + +!$ser verbatim do k = 1, km+1 +!$ser verbatim do j = js, je +!$ser verbatim do i = is, ie +!$ser verbatim peln_3d(i,j,k) = peln(i,k,j) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do k = 1, km+1 +!$ser verbatim do j = js-1, je+1 +!$ser verbatim do i = is-1, ie+1 +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim qmin=t_min +!$ser verbatim w_max_ = w_max +!$ser verbatim w_min_ = w_min +!$ser verbatim rsin2=gridstruct%rsin2 +!$ser verbatim cosa_s=gridstruct%cosa_s +!$ser verbatim grav_=grav +!$ser verbatim area_64_ = real(gridstruct%area_64) +!$ser verbatim te0_2d_f32 = real(te0_2d) +!$ser savepoint Remapping_GEOS-In +!$ser data pe_=pe_3d +!$ser data ptop=ptop +!$ser data qvapor=q(:,:,:,sphum) +!$ser data qliquid=q(:,:,:,liq_wat) +!$ser data qice=q(:,:,:,ice_wat) +!$ser data qrain=q(:,:,:,rainwat) +!$ser data qsnow=q(:,:,:,snowwat) +!$ser data qgraupel=q(:,:,:,graupel) +!$ser data qcld=q(:,:,:,cld_amt) +!$ser data qo3mr=q(:,:,:,8) +!$ser data qsgs_tke=q(:,:,:,9) +!$ser data q_con=q_con +!$ser data pt=pt +!$ser data cappa=cappa +!$ser data delp=delp +!$ser data delz=delz +!$ser data ak=ak +!$ser data bk=bk +!$ser data ps=ps +!$ser data peln_3d=peln_3d +!$ser data r_vir=r_vir +!$ser data cvm=cvm +!$ser data pk=pk +!$ser data pkz=pkz +!$ser data akap=akap +!$ser data t_min=qmin +!$ser data kord_wz=kord_wz +!$ser data ws_=ws +!$ser data w=w +!$ser data u=u +!$ser data v=v +!$ser verbatim if 
(present(mfy)) then +!$ser data mfy=mfy +!$ser verbatim endif +!$ser verbatim if (present(cy)) then +!$ser data cy=cy +!$ser verbatim endif +!$ser verbatim if (present(mfx)) then +!$ser data mfx_=mfx +!$ser verbatim endif +!$ser verbatim if (present(cx)) then +!$ser data cx_=cx +!$ser verbatim endif +!$ser data w_max=w_max_ +!$ser data w_min=w_min_ +!$ser data kord_mt=kord_mt +!$ser data cosa_s=cosa_s +!$ser data rsin2=rsin2 +!$ser data hs=hs +!$ser data grav=grav_ +!$ser data te0_2d_=te0_2d_f32 +!$ser data area_64_=area_64_ +!$ser data last_step=last_step +!$ser data do_adiabatic_init=do_adiabatic_init +!$ser data consv=consv +!$ser data consv_min=consv_min_ +!$ser data cv_air=cv_air_ +!$ser data adiabatic=adiabatic + + do 1000 j=js,je+1 + + do k=1,km+1 + do i=is,ie + pe1(i,k) = pe(i,k,j) + enddo + enddo + + do i=is,ie + pe2(i, 1) = ptop + pe2(i,km+1) = pe(i,km+1,j) + enddo + + if ( j /= (je+1) ) then + + if (remap_t) then + ! Remap T in logP + ! Note: pt at this stage is Theta_v + if ( hydrostatic ) then + print*,'CODE NOT TESTED HERE 2' + ! Transform virtual pt to virtual Temp + do k=1,km + do i=is,ie + pt(i,j,k) = pt(i,j,k)*(pk(i,j,k+1)-pk(i,j,k))/(akap*(peln(i,k+1,j)-peln(i,k,j))) + enddo + enddo + else + ! Transform "density pt" to "density temp" + !$ser verbatim if(j == js2d) then + !!$ser verbatim print*,'MoistCVPlusPt_2D serialization' + !$ser savepoint MoistCVPlusPt_2d-In + !$ser data qvapor_js=q(:,j,:,sphum) qliquid_js=q(:,j,:,liq_wat) qice_js=q(:,j,:,ice_wat) qrain_js=q(:,j,:,rainwat) qsnow_js=q(:,j,:,snowwat) qgraupel_js=q(:,j,:,graupel) qcld_js=q(:,j,:,cld_amt) gz1d=gz cvm=cvm r_vir=r_vir cappa=cappa rrg=rrg delp=delp delz=delz pt=pt k1k=k1k j_2d=js2d q_con=q_con + !$ser verbatim endif + do k=1,km +#ifdef MOIST_CAPPA + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + !$ser verbatim q_con(i,j,k) = gz(i) + cappa(i,j,k) = rdgas / ( rdgas + cvm(i)/(1.+r_vir*q(i,j,k,sphum)) ) + pt(i,j,k) = pt(i,j,k)*exp(cappa(i,j,k)/(1.-cappa(i,j,k))*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#else + do i=is,ie + pt(i,j,k) = pt(i,j,k)*exp(k1k*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#endif + enddo + !$ser verbatim if(j == js2d) then + !$ser savepoint MoistCVPlusPt_2d-Out + !$ser data gz1d=gz cvm=cvm pt=pt cappa=cappa q_con=q_con + !$ser verbatim endif + + endif ! hydro test + elseif (remap_pt) then + print*,'CODE NOT TESTED HERE 3' + ! Remap PT in P + ! pt is already virtual PT + elseif (remap_te) then + print*,'CODE NOT TESTED HERE 4' + ! Remap TE in logP + ! Transform virtual pt to total energy + if ( hydrostatic ) then + call pkez(km, is, ie, js, je, j, pe, pk, akap, peln, pkz, ptop) + do i=is,ie + phis(i,km+1) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + phis(i,k) = phis(i,k+1) + cp_air*pt(i,j,k)*(pk(i,j,k+1)-pk(i,j,k)) + enddo + enddo + do k=1,km+1 + do i=is,ie + phis(i,k) = phis(i,k) * pe(i,k,j) + enddo + enddo + ! Compute cp*T + KE + do k=1,km + do i=is,ie + te(i,j,k) = 0.25*gridstruct%rsin2(i,j)*(u(i,j,k)**2+u(i,j+1,k)**2 + & + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)) & + + cp_air*pt(i,j,k)*pkz(i,j,k) & + + (phis(i,k+1)-phis(i,k))/(pe(i,k+1,j)-pe(i,k,j)) + enddo + enddo + else + ! 
TE using 3D winds (pt is virtual potential temperature): + do i=is,ie + phis(i,km+1) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + phis(i,k) = phis(i,k+1) - grav*delz(i,j,k) + enddo + enddo + do k=1,km+1 + do i=is,ie + phis(i,k) = phis(i,k) * pe(i,k,j) + enddo + enddo + do k=1,km +#ifdef MOIST_CAPPA + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + cappa(i,j,k) = rdgas / ( rdgas + cvm(i)/(1.+r_vir*q(i,j,k,sphum)) ) + pkz(i,j,k) = exp(cappa(i,j,k)/(1.-cappa(i,j,k))*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + ! TE = KE + Cv*T_v + PE + te(i,j,k) = 0.5*w(i,j,k)**2 + 0.25*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)) & + + cvm(i)*pt(i,j,k)*pkz(i,j,k) & + + (phis(i,k+1)-phis(i,k))/(pe(i,k+1,j)-pe(i,k,j)) + enddo +#else + do i=is,ie + pkz(i,j,k) = exp(k1k*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + ! TE = KE + Cv*T_v + PE + te(i,j,k) = 0.5*w(i,j,k)**2 + 0.25*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)) & + + cv_air*pt(i,j,k)*pkz(i,j,k) & + + (phis(i,k+1)-phis(i,k))/(pe(i,k+1,j)-pe(i,k,j)) + enddo +#endif + enddo + endif ! hydro test + endif + + ! update ps + do i=is,ie + ps(i,j) = pe1(i,km+1) + enddo + ! + ! Hybrid sigma-P coordinate: + ! + do k=2,km + do i=is,ie + pe2(i,k) = ak(k) + bk(k)*pe(i,km+1,j) + enddo + enddo + do k=1,km + do i=is,ie + dp2(i,k) = pe2(i,k+1) - pe2(i,k) + enddo + enddo + !------------------ + ! Compute p**Kappa + !------------------ + do k=1,km+1 + do i=is,ie + pk1(i,k) = pk(i,j,k) + pn1(i,k) = peln(i,k,j) + enddo + enddo + + do i=is,ie + pn2(i, 1) = peln(i, 1,j) + pn2(i,km+1) = peln(i,km+1,j) + pk2(i, 1) = pk1(i, 1) + pk2(i,km+1) = pk1(i,km+1) + enddo + + do k=2,km + do i=is,ie + pn2(i,k) = log(pe2(i,k)) + pk2(i,k) = exp(akap*pn2(i,k)) + enddo + enddo + + if (remap_te) then + print*,'CODE NOT TESTED HERE 5' + !---------------------------------- + ! map TE in log P + !---------------------------------- + if ( gmao_remap > 0 ) then + call map1_gmao (km, pe1, te, & + km, pe2, te, & + is, ie, j, isd, ied, jsd, jed, akap, gmao_remap, P_MAP=1, conserv=.true.) + else + call map_scalar(km, pn1, te, gz, & + km, pn2, te, & + is, ie, j, isd, ied, jsd, jed, 1, abs(kord_tm), te_min) + endif + else + !---------------------------------- + ! map T or PT in log P + !---------------------------------- + if ( gmao_remap > 0 ) then + print*,'CODE NOT TESTED HERE 6' + call map1_gmao (km, pe1, pt, & + km, pe2, pt, & + is, ie, j, isd, ied, jsd, jed, akap, gmao_remap, P_MAP=1, conserv=.false.) + else + !$ser verbatim if(j == js2d) then + !$ser verbatim do k = 1, km+1 + !$ser verbatim do i = is, ie + !$ser verbatim pn1_3d(i,j,k) = pn1(i,k) + !$ser verbatim pn2_3d(i,j,k) = pn2(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser savepoint Map_Scalar-In + !$ser verbatim qmin=184.0 + !$ser data pe1_=pn1_3d pe2_=pn2_3d q1=pt j_2d=js2d q_min=qmin + !$ser verbatim endif + call map_scalar(km, pn1, pt, gz, & + km, pn2, pt, & + is, ie, j, isd, ied, jsd, jed, 1, abs(kord_tm), t_min) + !$ser verbatim if(j == js2d) then + !$ser savepoint Map_Scalar-Out + !$ser data q1=pt + !$ser verbatim endif + endif + endif + + !---------------- + ! 
Map constituents + !---------------- + if( nq > 5 ) then + !$ser verbatim if(j == js2d) then + !$ser verbatim do k = 1, km+1 + !$ser verbatim do i = is, ie + !$ser verbatim pe1_3d(i,j,k) = pe1(i,k) + !$ser verbatim pe2_3d(i,j,k) = pe2(i,k) + !$ser verbatim + !$ser verbatim if (k < km+1) dp2_3d(i,j,k) = dp2(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser savepoint MapN_Tracer_2d-In + !$ser data j_2d=js2d nq=nq pe1_=pe1_3d pe2_=pe2_3d dp2_=dp2_3d + !$ser data qvapor=q(:,:,:,1) + !$ser data qliquid=q(:,:,:,2) + !$ser data qice=q(:,:,:,3) + !$ser data qrain=q(:,:,:,4) + !$ser data qsnow=q(:,:,:,5) + !$ser data qgraupel=q(:,:,:,6) + !$ser data qcld=q(:,:,:,7) + !$ser data qo3mr=q(:,:,:,8) + !$ser data qsgs_tke=q(:,:,:,9) + !$ser verbatim endif + call mapn_tracer(nq, km, pe1, pe2, q, dp2, kord_tr, j, & + is, ie, isd, ied, jsd, jed, 0., fill) + !$ser verbatim if(j == js2d) then + !$ser savepoint MapN_Tracer_2d-Out + !$ser data qvapor=q(:,:,:,1) + !$ser data qliquid=q(:,:,:,2) + !$ser data qice=q(:,:,:,3) + !$ser data qrain=q(:,:,:,4) + !$ser data qsnow=q(:,:,:,5) + !$ser data qgraupel=q(:,:,:,6) + !$ser data qcld=q(:,:,:,7) + !$ser data qo3mr=q(:,:,:,8) + !$ser data qsgs_tke=q(:,:,:,9) + !$ser verbatim endif + + elseif ( nq > 0 ) then + print*,'CODE NOT TESTED HERE 7' + ! Remap one tracer at a time + do iq=1,nq + call map1_q2(km, pe1, q(isd,jsd,1,iq), & + km, pe2, q2, dp2, & + is, ie, 0, kord_tr(iq), j, isd, ied, jsd, jed, 0.) + if (fill) call fillz(ie-is+1, km, 1, q2, dp2) + do k=1,km + do i=is,ie + q(i,j,k,iq) = q2(i,k) + enddo + enddo + enddo + endif + + if ( .not. hydrostatic ) then + ! Remap vertical wind: + !$ser verbatim if(j == js2d) then + !$ser verbatim do k = 1, km+1 + !$ser verbatim do i = is, ie + !$ser verbatim pn1_3d(i,j,k) = pn1(i,k) + !$ser verbatim pn2_3d(i,j,k) = pn2(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser savepoint Map1_PPM_W-In + !$ser data pe1_=pe1_3d + !$ser data pe2_=pe2_3d + !$ser data ws_=ws + !$ser data w_=w + !$ser data kord_wz=kord_wz + !$ser verbatim endif + call map1_ppm (km, pe1, w, ws(is,j), & + km, pe2, w, & + is, ie, j, isd, ied, jsd, jed, -2, kord_wz) + !$ser verbatim if(j == js2d) then + !$ser savepoint Map1_PPM_W-Out + !$ser data w_=w + !$ser verbatim endif + + !$ser verbatim if(j == js2d) then + !$ser verbatim do k = 1, km+1 + !$ser verbatim do i = is, ie + !$ser verbatim pn1_3d(i,j,k) = pn1(i,k) + !$ser verbatim pn2_3d(i,j,k) = pn2(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser verbatim do i = is, ie + !$ser verbatim gz_2d(i,j) = gz(i) + !$ser verbatim enddo + !$ser verbatim do k = 1, km + !$ser verbatim do i = is, ie + !$ser verbatim dp2_3d(i,j,k) = dp2(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser savepoint Map1_PPM_delz-In + !$ser data pe1_=pe1_3d + !$ser data pe2_=pe2_3d + !$ser data dp2_3d=dp2_3d + !$ser data gz_=gz_2d + !$ser data delz_=delz + !$ser data delp=delp + !$ser data kord_wz=kord_wz + !$ser verbatim endif + ! Remap delz for hybrid sigma-p coordinate + do k=1,km + do i=is,ie + delz(i,j,k) = -delz(i,j,k) / delp(i,j,k) ! ="specific volume"/grav + enddo + enddo + + call map1_ppm (km, pe1, delz, gz, & + km, pe2, delz, & + is, ie, j, isd, ied, jsd, jed, 1, abs(kord_wz)) + + do k=1,km + do i=is,ie + delz(i,j,k) = -delz(i,j,k)*dp2(i,k) + enddo + enddo + !$ser verbatim if(j == js2d) then + !$ser savepoint Map1_PPM_delz-Out + !$ser data delz_=delz + !$ser verbatim endif + + !Fix excessive w - momentum conserving --- sjl + ! 
gz(:) used here as a temporary array + if ( w_limiter ) then +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km +!$ser verbatim do i = is, ie +!$ser verbatim dp2_3d(i,:,k) = dp2(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim w_max_ = w_max +!$ser verbatim w_min_ = w_min + +!$ser savepoint W_fix_consrv_moment-In +!$ser data w=w dp2_W=dp2_3d w_max=w_max_ w_min=w_min_ +!$ser verbatim endif + do k=1,km + do i=is,ie + w2(i,k) = w(i,j,k) + enddo + enddo + do k=1, km-1 + do i=is,ie + if ( w2(i,k) > w_max ) then + if(j == js2d) then + print*,"ENTERED LOOP 1" + endif + gz(i) = (w2(i,k)-w_max) * dp2(i,k) + w2(i,k ) = w_max + w2(i,k+1) = w2(i,k+1) + gz(i)/dp2(i,k+1) + !print*, ' W_LIMITER down: ', i,j,k, w2(i,k:k+1), w(i,j,k:k+1) + elseif ( w2(i,k) < w_min ) then + if(j == js2d) then + print*,"ENTERED LOOP 2" + endif + gz(i) = (w2(i,k)-w_min) * dp2(i,k) + w2(i,k ) = w_min + w2(i,k+1) = w2(i,k+1) + gz(i)/dp2(i,k+1) + !print*, ' W_LIMITER down: ', i,j,k, w2(i,k:k+1), w(i,j,k:k+1) + endif + enddo + enddo + do k=km, 2, -1 + do i=is,ie + if ( w2(i,k) > w_max ) then + if(j == js2d) then + print*,"ENTERED LOOP 3" + endif + gz(i) = (w2(i,k)-w_max) * dp2(i,k) + w2(i,k ) = w_max + w2(i,k-1) = w2(i,k-1) + gz(i)/dp2(i,k-1) + !print*, ' W_LIMITER up: ', i,j,k, w2(i,k-1:k), w(i,j,k-1:k) + elseif ( w2(i,k) < w_min ) then + if(j == js2d) then + print*,"ENTERED LOOP 4" + endif + gz(i) = (w2(i,k)-w_min) * dp2(i,k) + w2(i,k ) = w_min + w2(i,k-1) = w2(i,k-1) + gz(i)/dp2(i,k-1) + !print*, ' W_LIMITER up: ', i,j,k, w2(i,k-1:k), w(i,j,k-1:k) + endif + enddo + enddo + do i=is,ie + if (w2(i,1) > w_max*2. ) then + if(j == js2d) then + print*,'ENTERED LOOP 5' + endif + w2(i,1) = w_max*2 ! sink out of the top of the domain + !print*, ' W_LIMITER top limited: ', i,j,1, w2(i,1), w(i,j,1) + elseif (w2(i,1) < w_min*2. ) then + if(j == js2d) then + print*,'ENTERED LOOP 6' + endif + w2(i,1) = w_min*2. + !print*, ' W_LIMITER top limited: ', i,j,1, w2(i,1), w(i,j,1) + endif + enddo + do k=1,km + do i=is,ie + w(i,j,k) = w2(i,k) + enddo + enddo +!$ser verbatim if(j == js2d) then +!$ser savepoint W_fix_consrv_moment-Out +!$ser data w=w +!$ser verbatim endif + endif + endif + + endif !(j < je+1) + +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is-1, ie+1 +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim pe_3d(i,j-1,k) = pe(i,k,j-1) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie+1 +!$ser verbatim pe0_3d(i,j,k) = pe0(i,k) +!$ser verbatim pe3_3d(i,j,k) = pe3(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Pressures_mapU-In +!$ser data pe_=pe_3d +!$ser data pe0_=pe0_3d +!$ser data pe3_=pe3_3d +!$ser data ak=ak +!$ser data bk=bk +!$ser data ptop=ptop +!$ser data u_=u +!$ser data kord_mt=kord_mt +!$ser verbatim if (present(mfy)) then +!$ser data mfy_=mfy +!$ser verbatim endif +!$ser verbatim if (present(cy)) then +!$ser data cy_=cy +!$ser verbatim endif +!$ser verbatim endif + !------ + ! map u + !------ + do i=is,ie+1 + pe0(i,1) = ptop + enddo + + do k=2,km+1 + do i=is,ie + pe0(i,k) = 0.5*(pe(i,k,j-1)+pe(i,k,j)) + enddo + enddo + + do k=1,km+1 + bkh = 0.5*bk(k) + do i=is,ie + pe3(i,k) = ak(k) + bkh*(pe(i,km+1,j-1)+pe(i,km+1,j)) + enddo + enddo + + call map1_ppm( km, pe0(is:ie,:), u, gz, & + km, pe3(is:ie,:), u, & + is, ie, j, isd, ied, jsd, jed+1, -1, kord_mt) + if (present(mfy)) then + ! 
print*, 'mfy present' + call map1_ppm( km, pe0(is:ie,:), mfy, gz, & + km, pe3(is:ie,:), mfy, & + is, ie, j, is, ie, js, je+1, -1, kord_mt) + endif + if (present(cy)) then + ! print*, 'cy present' + call map1_ppm( km, pe0(is:ie,:), cy, gz, & + km, pe3(is:ie,:), cy, & + is, ie, j, isd, ied, js, je+1, -1, kord_mt) + endif + +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie+1 +!$ser verbatim pe0_3d(i,j,k) = pe0(i,k) +!$ser verbatim pe3_3d(i,j,k) = pe3(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Pressures_mapU-Out +!$ser data pe0_=pe0_3d +!$ser data pe3_=pe3_3d +!$ser data u_=u +!$ser verbatim if (present(mfy)) then +!$ser data mfy_=mfy +!$ser verbatim endif +!$ser verbatim if (present(cy)) then +!$ser data cy_=cy +!$ser verbatim endif +!$ser verbatim endif + +! Note : This serialization portion will test the update of pe0 +! and pe3 +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie+1 +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim pe_3d(i-1,j,k) = pe(i-1,k,j) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie+1 +!$ser verbatim pe0_3d(i,j,k) = pe0(i,k) +!$ser verbatim pe3_3d(i,j,k) = pe3(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Pressures_mapV-In +!$ser data pe_=pe_3d +!$ser data pe0_=pe0_3d +!$ser data pe3_=pe3_3d +!$ser data ak=ak +!$ser data bk=bk +!$ser data v_=v +!$ser data kord_mt=kord_mt +!$ser verbatim if (present(mfx)) then +!$ser data mfx_=mfx +!$ser verbatim endif +!$ser verbatim if (present(cx)) then +!$ser data cx_=cx +!$ser verbatim endif +!$ser verbatim endif + !------ + ! map v + !------ + if (j < je+1) then + do k=2,km+1 + do i=is,ie+1 + pe0(i,k) = 0.5*(pe(i-1,k, j)+pe(i,k, j)) + enddo + enddo + + do k=1,km+1 + bkh = 0.5*bk(k) + do i=is,ie+1 + pe3(i,k) = ak(k) + bkh*(pe(i-1,km+1,j)+pe(i,km+1,j)) + enddo + enddo + + call map1_ppm (km, pe0, v, gz, & + km, pe3, v, is, ie+1, & + j, isd, ied+1, jsd, jed, -1, kord_mt) + if (present(mfx)) then + ! print*, 'mfx present' + call map1_ppm (km, pe0, mfx, gz, & + km, pe3, mfx, is, ie+1, & + j, is, ie+1, js, je, -1, kord_mt) + endif + if (present(cx)) then + ! print*, 'cx present' + call map1_ppm (km, pe0, cx, gz, & + km, pe3, cx, is, ie+1, & + j, is, ie+1, jsd, jed, -1, kord_mt) + endif + endif ! (j < je+1) + +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie+1 +!$ser verbatim pe0_3d(i,j,k) = pe0(i,k) +!$ser verbatim pe3_3d(i,j,k) = pe3(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Pressures_mapV-Out +!$ser data pe0_=pe0_3d +!$ser data pe3_=pe3_3d +!$ser data v_=v +!$ser verbatim if (present(mfx)) then +!$ser data mfx_=mfx +!$ser verbatim endif +!$ser verbatim if (present(cx)) then +!$ser data cx_=cx +!$ser verbatim endif +!$ser verbatim endif + +1000 continue + +! 
Update pressure variables and get new pkz, T_v, and omega + +!$OMP parallel do default(none) shared(is,ie,js,je,km,pe,ptop,kord_tm,remap_t, & +!$OMP remap_pt,remap_te,mfy,mfx,cx,cy,hydrostatic, & +!$OMP pt,pk,rg,peln,q,nwat,liq_wat,rainwat,ice_wat,snowwat, & +!$OMP graupel,sphum,cappa,r_vir,rcp,cp,k1k,delp, cld_amt, & +!$OMP delz,akap,pkz,te,u,v,ps, gridstruct, & +!$OMP ak,bk,nq,isd,ied,jsd,jed,kord_tr,fill, & +!$OMP hs,w,ws,kord_wz,do_omega,omga,rrg,kord_mt, & +#ifdef SERIALIZE +!$OMP ppser_savepoint, ppser_serializer, ppser_serializer_ref, ppser_zrperturb, js2d, & +#endif +!$OMP pe2_3d, pe_3d, peln_3d, pn2_3d, pk2_3d, pe0, pe0_3d, pe3_3d, & +!$OMP pe1_3d, pn1, dp2_3d, q_con, pe1, pn1_3d) & +!$OMP private(gz,cvm,kp,k_next,bkh,dp2, & +!$OMP pe2,pe3,pk2,pn2,phis,tpe,dlnp,tmp) + do 2000 j=js,je + +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie +!$ser verbatim pe2_3d(i,j,k) = pe2(i,k) +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim peln_3d(i,j,k) = peln(i,k,j) +!$ser verbatim pn2_3d(i,j,k) = pn2(i,k) +!$ser verbatim pk2_3d(i,j,k) = pk2(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint PE_pk_delp_peln-In +!$ser data pe2_=pe2_3d +!$ser data pe_=pe_3d +!$ser data peln_=peln_3d +!$ser data pn2_=pn2_3d +!$ser data pk2_=pk2_3d +!$ser data delp=delp +!$ser data pk=pk +!$ser data ak=ak +!$ser data bk=bk +!$ser data akap=akap +!$ser data ptop=ptop +!$ser verbatim endif + !---------- + ! Update pe + !---------- + do i=is,ie + pe2(i, 1) = ptop + pe2(i,km+1) = pe(i,km+1,j) + enddo + do k=2,km + do i=is,ie + pe2(i,k) = ak(k) + bk(k)*pe(i,km+1,j) + enddo + enddo + do k=1,km+1 + do i=is,ie + pe(i,k,j) = pe2(i,k) + enddo + enddo + + !---------- + ! Update pk + !---------- + do i=is,ie + pn2(i, 1) = peln(i, 1,j) + pn2(i,km+1) = peln(i,km+1,j) + pk2(i, 1) = pk(i,j, 1) + pk2(i,km+1) = pk(i,j,km+1) + enddo + do k=2,km + do i=is,ie + pn2(i,k) = log(pe2(i,k)) + pk2(i,k) = exp(akap*pn2(i,k)) + enddo + enddo + do k=1,km+1 + do i=is,ie + pk(i,j,k) = pk2(i,k) + enddo + enddo + + !------------ + ! update delp + !------------ + do k=1,km + do i=is,ie + delp(i,j,k) = pe2(i,k+1) - pe2(i,k) + enddo + enddo + + !------------ + ! update logP + !------------ + do k=1,km+1 + do i=is,ie + peln(i,k,j) = pn2(i,k) + enddo + enddo + +!$ser verbatim if(j == js2d) then +!$ser verbatim do k = 1, km+1 +!$ser verbatim do i = is, ie +!$ser verbatim pe2_3d(i,j,k) = pe2(i,k) +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim peln_3d(i,j,k) = peln(i,k,j) +!$ser verbatim pn2_3d(i,j,k) = pn2(i,k) +!$ser verbatim pk2_3d(i,j,k) = pk2(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint PE_pk_delp_peln-Out +!$ser data pe2_=pe2_3d +!$ser data pe_=pe_3d +!$ser data peln_=peln_3d +!$ser data pn2_=pn2_3d +!$ser data pk2_=pk2_3d +!$ser data delp=delp +!$ser data pk=pk +!$ser verbatim endif + + !--------------------- + ! Compute pkz and T_v + !--------------------- + if ( hydrostatic ) then + print*,'CODE NOT TESTED HERE 8' + do k=1,km + do i=is,ie + pkz(i,j,k) = (pk(i,j,k+1)-pk(i,j,k))/(akap*(peln(i,k+1,j)-peln(i,k,j))) + enddo + enddo + if (.not.remap_t) then + if (remap_te) then + ! 
Get updated T_v (store in pt) + do i=is,ie + gz(i) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + tpe = te(i,j,k) - gz(i) - 0.25*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j) ) + dlnp = rg*(peln(i,k+1,j) - peln(i,k,j)) + tmp = tpe / (cp - pe(i,k,j)*dlnp/delp(i,j,k)) + pt(i,j,k) = tmp + gz(i) = gz(i) + dlnp*tmp + enddo + enddo ! end k-loop + else + ! Make pt T_v + do k=1,km + do i=is,ie + pt(i,j,k) = pt(i,j,k)*pkz(i,j,k) + enddo + enddo + endif + endif + else + if (remap_te) then + print*,'CODE NOT TESTED HERE 9' + ! Invert TE using 3D winds to get pt (virtual temperature) and pkz: + do i=is,ie + phis(i,km+1) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + phis(i,k) = phis(i,k+1) - grav*delz(i,j,k) + enddo + enddo + do k=1,km+1 + do i=is,ie + phis(i,k) = phis(i,k) * pe(i,k,j) + enddo + enddo + do k=1,km +#ifdef MOIST_CAPPA + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + tpe = te(i,j,k) - & + ( 0.5*w(i,j,k)**2 + 0.25*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)) & + + (phis(i,k+1)-phis(i,k))/(pe(i,k+1,j)-pe(i,k,j)) ) + pt(i,j,k) = tpe/cvm(i) + cappa(i,j,k) = rdgas / ( rdgas + cvm(i)/(1.+r_vir*q(i,j,k,sphum)) ) + pkz(i,j,k) = exp(cappa(i,j,k)*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#else + do i=is,ie + tpe = te(i,j,k) - & + ( 0.5*(phis(i,k)+phis(i,k+1) + w(i,j,k)**2 + 0.5*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j))) ) + pt(i,j,k) = tpe/cv_air + pkz(i,j,k) = exp(akap*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#endif + enddo + endif + if (remap_t) then + ! print*,'CODE EXECUTED' +! Note: pt at this stage is T_v or T_m + !$ser verbatim if(j == js2d) then + !!$ser verbatim print*,'MoistCVPlusPt_2D serialization' + !$ser savepoint MoistCVPlusPkz_2d-In + !$ser data qvapor_js=q(:,j,:,sphum) + !$ser data qliquid_js=q(:,j,:,liq_wat) + !$ser data qice_js=q(:,j,:,ice_wat) + !$ser data qrain_js=q(:,j,:,rainwat) + !$ser data qsnow_js=q(:,j,:,snowwat) + !$ser data qgraupel_js=q(:,j,:,graupel) + !$ser data qcld_js=q(:,j,:,cld_amt) + !$ser data r_vir=r_vir + !$ser data cappa=cappa + !$ser data rrg=rrg + !$ser data delp=delp + !$ser data delz=delz + !$ser data pkz=pkz + !$ser data pt=pt + !$ser data k1k=k1k + !$ser verbatim endif + do k=1,km +#ifdef MOIST_CAPPA + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + cappa(i,j,k) = rdgas / ( rdgas + cvm(i)/(1.+r_vir*q(i,j,k,sphum)) ) + pkz(i,j,k) = exp(cappa(i,j,k)*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#else + do i=is,ie + pkz(i,j,k) = exp(akap*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) + enddo +#endif + enddo + !$ser verbatim if(j == js2d) then + !$ser savepoint MoistCVPlusPkz_2d-Out + !$ser data pkz=pkz + !$ser data cappa=cappa + !$ser verbatim endif + endif + if (remap_pt) then + print*,'CODE NOT TESTED HERE 10' +! Note: pt at this stage is Theta_v + do k=1,km + do i=is,ie + pkz(i,j,k) = exp( k1k*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k)) ) + ! Make pt T_v + pt(i,j,k) = pt(i,j,k)*pkz(i,j,k) + enddo + enddo + endif + endif + +! Interpolate omega/pe3 (defined at peln) to remapped cell center (dp2) + if ( do_omega ) then +! 
Copy omega field to pe3 + print*,'CODE NOT TESTED HERE 11' + do i=is,ie + pe3(i,1) = 0. + enddo + do k=2,km+1 + do i=is,ie + pe3(i,k) = omga(i,j,k-1) + enddo + enddo + do k=1,km + do i=is,ie + dp2(i,k) = 0.5*(peln(i,k,j) + peln(i,k+1,j)) + enddo + enddo + do i=is,ie + k_next = 1 + do n=1,km + kp = k_next + do k=kp,km + if( dp2(i,n) <= peln(i,k+1,j) .and. dp2(i,n) >= peln(i,k,j) ) then + omga(i,j,n) = pe3(i,k) + (pe3(i,k+1) - pe3(i,k)) * & + (dp2(i,n)-peln(i,k,j)) / (peln(i,k+1,j)-peln(i,k,j) ) + k_next = k + exit + endif + enddo + enddo + enddo + endif ! end do_omega + +2000 continue + +! Do total energy conservation and fast saturation adjustment as requested +! and fill new PT (Theta_V) for next k_split step or export dry T + +!$OMP parallel default(none) shared(is,ie,js,je,km,kmp,ptop,u,v,pe,isd,ied,jsd,jed,kord_mt, & +!$OMP remap_t,remap_pt,remap_te, & +!$OMP te_2d,te,delp,hydrostatic,hs,rg,pt,peln, adiabatic, & +!$OMP cp,delz,nwat,rainwat,liq_wat,ice_wat,snowwat, & +!$OMP graupel,q_con,r_vir,sphum,w,pk,pkz,last_step,consv, & +!$OMP do_adiabatic_init,zsum1,zsum0,te0_2d,domain, & +!$OMP ng,gridstruct,E_Flux,pdt,dtmp,reproduce_sum,q, & +!$OMP mdt,cld_amt,cappa,dtdt,out_dt,rrg,akap,do_sat_adj, & +!$OMP fast_mp_consv,kord_tm, phis_3d, te_2d_f32, te0_2d_f32, zsum1_f32, & +#ifdef SERIALIZE +!$OMP ppser_savepoint, ppser_serializer, ppser_serializer_ref, ppser_zrperturb, serial_flag, & +#endif +!$OMP js2d, pe1_3d, pe2_3d, pn1, pn1_3d, pn2, pn2_3d, pk2, pk2_3d, pe_3d, peln_3d, dp2, dp2_3d, & +!$OMP pe0_3d, pe3_3d, ps, mfy, cy, mfx, cx) & +!$OMP private(pe0,pe1,pe2,pe3,cvm,gz,phis,tesum,zsum,dpln,dlnp,tmp, rsin2, cosa_s, grav_) + + dtmp = 0. + if( last_step .and. (.not.do_adiabatic_init) ) then + ! NOTE : Code can enter here since do_adiabatic_init can be False + if ( consv > consv_min ) then + ! print*, "consv > consv_min = entered", consv, consv_min +!$OMP do + do j=js,je + if ( hydrostatic ) then + print*,'CODE NOT TESTED HERE 12' + do i=is,ie + gz(i) = hs(i,j) + do k=1,km + gz(i) = gz(i) + rg*pt(i,j,k)*(peln(i,k+1,j)-peln(i,k,j)) + enddo + enddo + do i=is,ie + te_2d(i,j) = pe(i,km+1,j)*hs(i,j) - pe(i,1,j)*gz(i) + enddo + do k=1,km + do i=is,ie + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*(cp*pt(i,j,k) + & + 0.25*gridstruct%rsin2(i,j)*(u(i,j,k)**2+u(i,j+1,k)**2 + & + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j))) + enddo + enddo + else +! TE using 3D winds (pt is virtual temperature): + ! 
if(j==js) print*,"CODE EXECUTED 1" + !$ser verbatim if(j == js) then + !$ser verbatim do k = 1, km+1 + !$ser verbatim do i = is, ie + !$ser verbatim phis_3d(i,j,k) = phis(i,k) + !$ser verbatim enddo + !$ser verbatim enddo + !$ser verbatim rsin2=gridstruct%rsin2 + !$ser verbatim cosa_s=gridstruct%cosa_s + !$ser verbatim do i = is, ie + !$ser verbatim te_2d_f32(i,:) = te_2d(i,:) + !$ser verbatim enddo + !$ser verbatim grav_=grav + !!$ser verbatim print*,'MoistCVPlusTe_2D serialization' + !$ser savepoint MoistCVPlusTe_2d-In + !$ser data qvapor_js=q(:,j,:,sphum) + !$ser data qliquid_js=q(:,j,:,liq_wat) + !$ser data qice_js=q(:,j,:,ice_wat) + !$ser data qrain_js=q(:,j,:,rainwat) + !$ser data qsnow_js=q(:,j,:,snowwat) + !$ser data qgraupel_js=q(:,j,:,graupel) + !$ser data delp=delp + !$ser data pt=pt + !$ser data phis_=phis_3d + !$ser data te_2d_=te_2d_f32 + !$ser data u=u + !$ser data v=v + !$ser data w=w + !$ser data cosa_s=cosa_s + !$ser data rsin2=rsin2 + !$ser data hs=hs + !$ser data grav=grav_ + !$ser data delz=delz + !$ser verbatim endif + do i=is,ie + te_2d(i,j) = 0. + phis(i,km+1) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + phis(i,k) = phis(i,k+1) - grav*delz(i,j,k) + enddo + enddo + do k=1,km +#ifdef MOIST_CAPPA + ! if(j==js) print*,"MOIST_CAPPA EXECUTED" + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*(cvm(i)*pt(i,j,k) + & + 0.5*(phis(i,k)+phis(i,k+1) + w(i,j,k)**2 + 0.5*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)))) + enddo +#else + do i=is,ie + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*(cv_air*pt(i,j,k) + & + 0.5*(phis(i,k)+phis(i,k+1) + w(i,j,k)**2 + 0.5*gridstruct%rsin2(i,j)*( & + u(i,j,k)**2+u(i,j+1,k)**2 + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*gridstruct%cosa_s(i,j)))) + enddo +#endif + enddo ! k-loop + !$ser verbatim if(j == js) then + !$ser verbatim te_2d_f32 = real(te_2d) + !$ser savepoint MoistCVPlusTe_2d-Out + !$ser data te_2d_=te_2d_f32 + !$ser verbatim endif + + endif ! end non-hydro + + !$ser verbatim if(j == js2d) then + !$ser verbatim te_2d_f32 = real(te_2d) + !$ser verbatim te0_2d_f32 = real(te0_2d) + !$ser verbatim zsum1_f32 = real(zsum1) + !$ser savepoint Te_Zsum-In + !$ser data te_2d_=te_2d_f32 + !$ser data te0_2d_=te0_2d_f32 + !$ser data zsum1=zsum1_f32 + !$ser data delp=delp + !$ser data pkz=pkz + !$ser verbatim endif + do i=is,ie + te_2d(i,j) = te0_2d(i,j) - te_2d(i,j) + zsum1(i,j) = pkz(i,j,1)*delp(i,j,1) + enddo + do k=2,km + do i=is,ie + zsum1(i,j) = zsum1(i,j) + pkz(i,j,k)*delp(i,j,k) + enddo + enddo + if ( hydrostatic ) then + do i=is,ie + zsum0(i,j) = ptop*(pk(i,j,1)-pk(i,j,km+1)) + zsum1(i,j) + enddo + endif + !$ser verbatim if(j == js2d) then + !$ser verbatim te_2d_f32 = real(te_2d) + !$ser verbatim zsum1_f32 = real(zsum1) + !!$ser verbatim print*, 'sum te_2d: ', sum(te_2d_f32(:,1)), sum(te_2d(:,1)) + !$ser savepoint Te_Zsum-Out + !$ser data te_2d_=te_2d_f32 + !$ser data zsum1=zsum1_f32 + !$ser verbatim endif + + enddo ! j-loop + +!$OMP single + !print*,"MPP GLOBAL SUM CODE EXECUTED 1" + !$ser savepoint Mpp_global_sum-In + !$ser verbatim serial_flag=.true. 
+ !$ser verbatim j=24 + !$ser data x_compute_size=j + !$ser data y_compute_size=j + !$ser verbatim j=1 + !$ser data x_compute_begin=j + !$ser data y_compute_begin=j + !$ser data max_ntile_pe=j + !$ser data tile=j + !$ser verbatim j=0 + !$ser data ioff=j + !$ser data joff=j + !$ser data serial_flag=serial_flag + !$ser data tesum=tesum + tesum = mpp_global_sum(domain, te_2d*gridstruct%area_64(is:ie,js:je), & + flags=BITWISE_EFP_SUM) + + !$ser verbatim te_2d=te_2d*gridstruct%area_64(is:ie,js:je) + !!$ser verbatim print*,'Sum of input into mpp_global_sum = ', sum(te_2d) + !$ser data inputArray=te_2d + + !$ser savepoint Mpp_global_sum-Out + !$ser data tesum=tesum + + !print*,'tesum = ', tesum + E_Flux = DBLE(consv)*tesum / DBLE(grav*pdt*4.*pi*radius**2) ! unit: W/m**2 + ! Note pdt is "phys" time step + if ( hydrostatic ) then + print*,'CODE NOT TESTED HERE 13' + zsum = mpp_global_sum(domain, zsum0*gridstruct%area_64(is:ie,js:je), & + flags=BITWISE_EFP_SUM) + dtmp = tesum / DBLE(cp*zsum) + else + zsum = mpp_global_sum(domain, zsum1*gridstruct%area_64(is:ie,js:je), & + flags=BITWISE_EFP_SUM) + dtmp = tesum / DBLE(cv_air*zsum) + endif +!$OMP end single + + elseif ( consv < -consv_min ) then + print*,'CODE NOT TESTED HERE 14' +!$OMP do + do j=js,je + do i=is,ie + zsum1(i,j) = pkz(i,j,1)*delp(i,j,1) + enddo + do k=2,km + do i=is,ie + zsum1(i,j) = zsum1(i,j) + pkz(i,j,k)*delp(i,j,k) + enddo + enddo + if ( hydrostatic ) then + do i=is,ie + zsum0(i,j) = ptop*(pk(i,j,1)-pk(i,j,km+1)) + zsum1(i,j) + enddo + endif + enddo + + E_Flux = consv +!$OMP single + if ( hydrostatic ) then + zsum = mpp_global_sum(domain, zsum0*gridstruct%area_64(is:ie,js:je), & + flags=BITWISE_EFP_SUM) + dtmp = E_Flux*(grav*pdt*4.*pi*radius**2) / (cp*zsum) + else + zsum = mpp_global_sum(domain, zsum1*gridstruct%area_64(is:ie,js:je), & + flags=BITWISE_EFP_SUM) + dtmp = E_Flux*(grav*pdt*4.*pi*radius**2) / (cv_air*zsum) + endif +!$OMP end single + endif ! end consv check + endif ! end last_step check + +! Note: pt at this stage is T_v + if ( do_sat_adj .and. nwat>=6 ) then + print*,'CODE NOT TESTED HERE 15' + call timing_on('sat_adj2') +!$OMP do + do k=kmp,km + do j=js,je + do i=is,ie + dpln(i,j) = peln(i,k+1,j) - peln(i,k,j) + enddo + enddo + call fv_sat_adj(abs(mdt), r_vir, is, ie, js, je, ng, hydrostatic, fast_mp_consv, & + te(isd,jsd,k), q(isd,jsd,k,sphum), q(isd,jsd,k,liq_wat), & + q(isd,jsd,k,ice_wat), q(isd,jsd,k,rainwat), & + q(isd,jsd,k,snowwat), q(isd,jsd,k,graupel), & + hs ,dpln, delz(isd:,jsd:,k), pt(isd,jsd,k), delp(isd,jsd,k), & + cappa(isd:,jsd:,k), gridstruct%area_64, dtdt(is:,js:,k), out_dt, last_step, q(isd,jsd,k,cld_amt)) + if ( .not. hydrostatic ) then + do j=js,je + do i=is,ie +#ifdef MOIST_CAPPA + pkz(i,j,k) = exp(cappa(i,j,k)*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) +#else + pkz(i,j,k) = exp(akap*log(rrg*delp(i,j,k)/delz(i,j,k)*pt(i,j,k))) +#endif + enddo + enddo + endif + enddo ! OpenMP k-loop + + if ( fast_mp_consv ) then +!$OMP do + do j=js,je + do i=is,ie + do k=kmp,km + te0_2d(i,j) = te0_2d(i,j) + te(i,j,k) + enddo + enddo + enddo + endif + call timing_off('sat_adj2') + endif ! do_sat_adj + + + if ( last_step .and. (.not. adiabatic) ) then + ! Output temperature if last_step + if ( .not. hydrostatic ) then + ! 
print*,'CODE EXECUTED' + !$ser savepoint MoistCVPlusPt_2d_last_step-In + !$ser data qvapor=q(:,:,:,sphum) + !$ser data qliquid=q(:,:,:,liq_wat) + !$ser data qice=q(:,:,:,ice_wat) + !$ser data qrain=q(:,:,:,rainwat) + !$ser data qsnow=q(:,:,:,snowwat) + !$ser data qgraupel=q(:,:,:,graupel) + !$ser data r_vir=r_vir + !$ser data dtmp=dtmp + !$ser data pt=pt + !$ser data pkz=pkz +!$OMP do + do k=1,km + do j=js,je +#ifdef USE_COND + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, gz, cvm) + do i=is,ie + pt(i,j,k) = (pt(i,j,k)+dtmp*pkz(i,j,k)) / ((1.+r_vir*q(i,j,k,sphum))*(1.-gz(i))) + enddo +#else + do i=is,ie + pt(i,j,k) = (pt(i,j,k)+dtmp*pkz(i,j,k)) / (1.+r_vir*q(i,j,k,sphum)) + enddo +#endif + enddo ! j-loop + enddo ! k-loop + !$ser savepoint MoistCVPlusPt_2d_last_step-Out + !$ser data pt=pt +#ifdef USE_COND + ! print*, "USE_COND active" + !$ser savepoint Cond_output-In + !$ser data q_con=q_con + !$ser data qliquid=q(:,:,:,liq_wat) + !$ser data qice=q(:,:,:,ice_wat) + !$ser data qrain=q(:,:,:,rainwat) + !$ser data qsnow=q(:,:,:,snowwat) + !$ser data qgraupel=q(:,:,:,graupel) +! Fill condensate output +!$OMP do + do k=1,km + do j=js,je + do i=is,ie + q_con(i,j,k) = 0.0 + if (liq_wat > 0) q_con(i,j,k) = q_con(i,j,k) + q(i,j,k,liq_wat) + if (ice_wat > 0) q_con(i,j,k) = q_con(i,j,k) + q(i,j,k,ice_wat) + if (rainwat > 0) q_con(i,j,k) = q_con(i,j,k) + q(i,j,k,rainwat) + if (snowwat > 0) q_con(i,j,k) = q_con(i,j,k) + q(i,j,k,snowwat) + if (graupel > 0) q_con(i,j,k) = q_con(i,j,k) + q(i,j,k,graupel) + enddo + enddo ! j-loop + enddo ! k-loop +#endif + !$ser savepoint Cond_output-Out + !$ser data q_con=q_con + + else + print*,'CODE NOT TESTED HERE 16' +!$OMP do + do k=1,km + do j=js,je + do i=is,ie + pt(i,j,k) = (pt(i,j,k)+dtmp*pkz(i,j,k)) / (1.+r_vir*q(i,j,k,sphum)) + enddo + enddo ! j-loop + enddo ! k-loop + endif + + elseif ( last_step .and. adiabatic ) then + print*,'CODE NOT TESTED HERE 17' +!$OMP do + do k=1,km + do j=js,je + do i=is,ie + pt(i,j,k) = (pt(i,j,k)+dtmp*pkz(i,j,k)) + enddo + enddo ! j-loop + enddo ! k-loop + + else + print*,'CODE NOT TESTED HERE 18' + ! Top of the loop expects PT to be Theta_V +!$OMP do + do k=1,km + do j=js,je + do i=is,ie + pt(i,j,k) = pt(i,j,k)/pkz(i,j,k) + enddo + enddo + enddo + + endif +!$OMP end parallel + +!$ser verbatim do k = 1, km+1 +!$ser verbatim do j = js, je +!$ser verbatim do i = is, ie +!$ser verbatim peln_3d(i,j,k) = peln(i,k,j) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do k = 1, km+1 +!$ser verbatim do j = js-1, je+1 +!$ser verbatim do i = is-1, ie+1 +!$ser verbatim pe_3d(i,j,k) = pe(i,k,j) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim te_2d_f32 = real(te_2d) +!$ser verbatim if(last_step .and. 
(.not.do_adiabatic_init)) then +!$ser verbatim print*, 'tesum = ', tesum +!$ser verbatim print*, 'zsum = ', zsum +!$ser verbatim print*, 'dtmp = ', dtmp +!$ser verbatim endif +!$ser savepoint Remapping_GEOS-Out +!$ser data pe_=pe_3d +!$ser data qvapor=q(:,:,:,sphum) +!$ser data qliquid=q(:,:,:,liq_wat) +!$ser data qice=q(:,:,:,ice_wat) +!$ser data qrain=q(:,:,:,rainwat) +!$ser data qsnow=q(:,:,:,snowwat) +!$ser data qgraupel=q(:,:,:,graupel) +!$ser data qcld=q(:,:,:,cld_amt) +!$ser data qo3mr=q(:,:,:,8) +!$ser data qsgs_tke=q(:,:,:,9) +!$ser data q_con=q_con +!$ser data pt=pt +!$ser data cappa=cappa +!$ser data delp=delp +!$ser data delz=delz +!$ser data ps=ps +!$ser data peln_3d=peln_3d +!$ser data r_vir=r_vir +!$ser data cvm=cvm +!$ser data pk=pk +!$ser data pkz=pkz +!$ser data akap=akap +!$ser data w=w +!$ser data u=u +!$ser data v=v +!$ser verbatim if (present(mfy)) then +!$ser data mfy=mfy +!$ser verbatim endif +!$ser verbatim if (present(cy)) then +!$ser data cy=cy +!$ser verbatim endif +!$ser verbatim if (present(mfx)) then +!$ser data mfx_=mfx +!$ser verbatim endif +!$ser verbatim if (present(cx)) then +!$ser data cx_=cx +!$ser verbatim endif +!$ser data kord_mt=kord_mt +!$ser data cosa_s=cosa_s +!$ser data rsin2=rsin2 +!$ser data hs=hs +!$ser data grav=grav_ +!$ser data te0_2d_=te0_2d_f32 + +!$ser savepoint GetMPIProp-In +!$ser data delz=delz +!$ser savepoint GetMPIProp-Out +!$ser data delz=delz + end subroutine Lagrangian_to_Eulerian + +!>@brief The subroutine 'compute_total_energy' performs the FV3-consistent computation of the global total energy. +!>@details It includes the potential, internal (latent and sensible heat), kinetic terms. + subroutine compute_total_energy(is, ie, js, je, isd, ied, jsd, jed, km, & + u, v, w, delz, pt, delp, q, qc, pe, peln, hs, & + rsin2_l, cosa_s_l, & + r_vir, cp, rg, hlv, te_2d, ua, va, teq, & + moist_phys, nwat, sphum, liq_wat, rainwat, ice_wat, snowwat, graupel, hydrostatic, id_te) +!------------------------------------------------------ +! Compute vertically integrated total energy per column +!------------------------------------------------------ +! !INPUT PARAMETERS: + integer, intent(in):: km, is, ie, js, je, isd, ied, jsd, jed, id_te + integer, intent(in):: sphum, liq_wat, ice_wat, rainwat, snowwat, graupel, nwat + real, intent(inout), dimension(isd:ied,jsd:jed,km):: ua, va + real, intent(in), dimension(isd:ied,jsd:jed,km):: pt, delp + real, intent(in), dimension(isd:ied,jsd:jed,km,*):: q + real, intent(in), dimension(isd:ied,jsd:jed,km):: qc + real, intent(inout):: u(isd:ied, jsd:jed+1,km) + real, intent(inout):: v(isd:ied+1,jsd:jed, km) + real, intent(in):: w(isd:,jsd:,1:) !< vertical velocity (m/s) + real, intent(in):: delz(isd:,jsd:,1:) + real, intent(in):: hs(isd:ied,jsd:jed) !< surface geopotential + real, intent(in):: pe(is-1:ie+1,km+1,js-1:je+1) !< pressure at layer edges + real, intent(in):: peln(is:ie,km+1,js:je) !< log(pe) + real, intent(in):: cp, rg, r_vir, hlv + real, intent(in) :: rsin2_l(isd:ied, jsd:jed) + real, intent(in) :: cosa_s_l(isd:ied, jsd:jed) + logical, intent(in):: moist_phys, hydrostatic +!! Output: + real(kind=8), intent(out):: te_2d(is:ie,js:je) !< vertically integrated TE + real, intent(out):: teq(is:ie,js:je) !< Moist TE +!! Local + real, dimension(is:ie,km):: tv + real phiz(is:ie,km+1) + real cvm(is:ie), qd(is:ie) + integer i, j, k + +!---------------------- +! Output lat-lon winds: +!---------------------- +! 
call cubed_to_latlon(u, v, ua, va, dx, dy, rdxa, rdya, km, flagstruct%c2l_ord) + +!$OMP parallel do default(none) shared(is,ie,js,je,isd,ied,jsd,jed,km,hydrostatic,hs,pt,qc,rg,peln,te_2d, & +!$OMP pe,delp,cp,rsin2_l,u,v,cosa_s_l,delz,moist_phys,w, & +!$OMP q,nwat,liq_wat,rainwat,ice_wat,snowwat,graupel,sphum) & +!$OMP private(phiz, tv, cvm, qd) + do j=js,je + + if ( hydrostatic ) then + + do i=is,ie + phiz(i,km+1) = hs(i,j) + enddo + do k=km,1,-1 + do i=is,ie + tv(i,k) = pt(i,j,k)*(1.+qc(i,j,k)) + phiz(i,k) = phiz(i,k+1) + rg*tv(i,k)*(peln(i,k+1,j)-peln(i,k,j)) + enddo + enddo + + do i=is,ie + te_2d(i,j) = pe(i,km+1,j)*phiz(i,km+1) - pe(i,1,j)*phiz(i,1) + enddo + + do k=1,km + do i=is,ie + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*(cp*tv(i,k) + & + 0.25*rsin2_l(i,j)*(u(i,j,k)**2+u(i,j+1,k)**2 + & + v(i,j,k)**2+v(i+1,j,k)**2 - & + (u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*cosa_s_l(i,j))) + enddo + enddo + + else +!----------------- +! Non-hydrostatic: +!----------------- + do i=is,ie + phiz(i,km+1) = hs(i,j) + do k=km,1,-1 + phiz(i,k) = phiz(i,k+1) - grav*delz(i,j,k) + enddo + enddo + do i=is,ie + te_2d(i,j) = 0. + enddo + if ( moist_phys ) then + do k=1,km +#ifdef MOIST_CAPPA + call moist_cv(is,ie,isd,ied,jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, qd, cvm) +#endif + do i=is,ie +#ifdef MOIST_CAPPA + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*( cvm(i)*pt(i,j,k)*(1.+qc(i,j,k))*(1.-qd(i)) + & +#else + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*( cv_air*pt(i,j,k)*(1.+qc(i,j,k)) + & +#endif + 0.5*(phiz(i,k)+phiz(i,k+1)+w(i,j,k)**2+0.5*rsin2_l(i,j)*(u(i,j,k)**2+u(i,j+1,k)**2 + & + v(i,j,k)**2+v(i+1,j,k)**2-(u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*cosa_s_l(i,j)))) + enddo + enddo + else + do k=1,km + do i=is,ie + te_2d(i,j) = te_2d(i,j) + delp(i,j,k)*( cv_air*pt(i,j,k) + & + 0.5*(phiz(i,k)+phiz(i,k+1)+w(i,j,k)**2+0.5*rsin2_l(i,j)*(u(i,j,k)**2+u(i,j+1,k)**2 + & + v(i,j,k)**2+v(i+1,j,k)**2-(u(i,j,k)+u(i,j+1,k))*(v(i,j,k)+v(i+1,j,k))*cosa_s_l(i,j)))) + enddo + enddo + endif + endif + enddo + +!------------------------------------- +! Diganostics computation for moist TE +!------------------------------------- + if( id_te>0 ) then +!$OMP parallel do default(none) shared(is,ie,js,je,teq,te_2d,moist_phys,km,hlv,sphum,q,delp) + do j=js,je + do i=is,ie + teq(i,j) = te_2d(i,j) + enddo + if ( moist_phys ) then + do k=1,km + do i=is,ie + teq(i,j) = teq(i,j) + hlv*q(i,j,k,sphum)*delp(i,j,k) + enddo + enddo + endif + enddo + endif + + end subroutine compute_total_energy + + + subroutine pkez(km, ifirst, ilast, jfirst, jlast, j, & + pe, pk, akap, peln, pkz, ptop) + +! INPUT PARAMETERS: + integer, intent(in):: km, j + integer, intent(in):: ifirst, ilast !< Latitude strip + integer, intent(in):: jfirst, jlast !< Latitude strip + real, intent(in):: akap + real, intent(in):: pe(ifirst-1:ilast+1,km+1,jfirst-1:jlast+1) + real, intent(in):: pk(ifirst:ilast,jfirst:jlast,km+1) + real, intent(IN):: ptop +! OUTPUT + real, intent(out):: pkz(ifirst:ilast,jfirst:jlast,km) + real, intent(inout):: peln(ifirst:ilast, km+1, jfirst:jlast) !< log (pe) +! Local + real pk2(ifirst:ilast, km+1) + real pek + real lnp + real ak1 + integer i, k + + ak1 = (akap + 1.) / akap + + pek = pk(ifirst,j,1) + do i=ifirst, ilast + pk2(i,1) = pek + enddo + + do k=2,km+1 + do i=ifirst, ilast +! 
peln(i,k,j) = log(pe(i,k,j)) + pk2(i,k) = pk(i,j,k) + enddo + enddo + +!---- GFDL modification + if( ptop < ptop_min ) then + do i=ifirst, ilast + peln(i,1,j) = peln(i,2,j) - ak1 + enddo + else + lnp = log( ptop ) + do i=ifirst, ilast + peln(i,1,j) = lnp + enddo + endif +!---- GFDL modification + + do k=1,km + do i=ifirst, ilast + pkz(i,j,k) = (pk2(i,k+1) - pk2(i,k) ) / & + (akap*(peln(i,k+1,j) - peln(i,k,j)) ) + enddo + enddo + + end subroutine pkez + + + + subroutine remap_z(km, pe1, q1, kn, pe2, q2, i1, i2, iv, kord) + +! INPUT PARAMETERS: + integer, intent(in) :: i1 !< Starting longitude + integer, intent(in) :: i2 !< Finishing longitude + integer, intent(in) :: kord !< Method order + integer, intent(in) :: km !< Original vertical dimension + integer, intent(in) :: kn !< Target vertical dimension + integer, intent(in) :: iv + + real, intent(in) :: pe1(i1:i2,km+1) !< height at layer edges from model top to bottom surface + real, intent(in) :: pe2(i1:i2,kn+1) !< height at layer edges from model top to bottom surface + real, intent(in) :: q1(i1:i2,km) !< Field input + +! INPUT/OUTPUT PARAMETERS: + real, intent(inout):: q2(i1:i2,kn) !< Field output + +! LOCAL VARIABLES: + real qs(i1:i2) + real dp1( i1:i2,km) + real q4(4,i1:i2,km) + real pl, pr, qsum, delp, esl + integer i, k, l, m, k0 + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) ! negative + q4(1,i,k) = q1(i,k) + enddo + enddo + +! Compute vertical subgrid distribution + if ( kord >7 ) then + call cs_profile( qs, q4, dp1, km, i1, i2, iv, kord ) + else + call ppm_profile( q4, dp1, km, i1, i2, iv, kord ) + endif + +! Mapping + do 3000 i=i1,i2 + k0 = 1 + do 555 k=1,kn + do 100 l=k0,km +! locate the top edge: pe2(i,k) + if(pe2(i,k) <= pe1(i,l) .and. pe2(i,k) >= pe1(i,l+1)) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if(pe2(i,k+1) >= pe1(i,l+1)) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + q2(i,k) = q4(2,i,l) + 0.5*(q4(4,i,l)+q4(3,i,l)-q4(2,i,l)) & + *(pr+pl)-q4(4,i,l)*r3*(pr*(pr+pl)+pl**2) + k0 = l + goto 555 + else +! Fractional area... + qsum = (pe1(i,l+1)-pe2(i,k))*(q4(2,i,l)+0.5*(q4(4,i,l)+ & + q4(3,i,l)-q4(2,i,l))*(1.+pl)-q4(4,i,l)* & + (r3*(1.+pl*(1.+pl)))) + do m=l+1,km +! locate the bottom edge: pe2(i,k+1) + if(pe2(i,k+1) < pe1(i,m+1) ) then +! Whole layer.. + qsum = qsum + dp1(i,m)*q4(1,i,m) + else + delp = pe2(i,k+1)-pe1(i,m) + esl = delp / dp1(i,m) + qsum = qsum + delp*(q4(2,i,m)+0.5*esl* & + (q4(3,i,m)-q4(2,i,m)+q4(4,i,m)*(1.-r23*esl))) + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif +100 continue +123 q2(i,k) = qsum / ( pe2(i,k+1) - pe2(i,k) ) +555 continue +3000 continue + + end subroutine remap_z + + subroutine map_scalar( km, pe1, q1, qs, & + kn, pe2, q2, i1, i2, & + j, ibeg, iend, jbeg, jend, iv, kord, q_min) +! 
iv=1 + integer, intent(in) :: i1 !< Starting longitude + integer, intent(in) :: i2 !< Finishing longitude + integer, intent(in) :: iv !< Mode: 0 == constituents 1 == temp 2 == remap temp with cs scheme + integer, intent(in) :: kord !< Method order + integer, intent(in) :: j !< Current latitude + integer, intent(in) :: ibeg, iend, jbeg, jend + integer, intent(in) :: km !< Original vertical dimension + integer, intent(in) :: kn !< Target vertical dimension + real, intent(in) :: qs(i1:i2) !< bottom BC + real, intent(in) :: pe1(i1:i2,km+1) !< pressure at layer edges from model top to bottom surface in the original vertical coordinate + real, intent(in) :: pe2(i1:i2,kn+1) !< pressure at layer edges from model top to bottom surface in the new vertical coordinate + real, intent(in) :: q1(ibeg:iend,jbeg:jend,km) !< Field input +! INPUT/OUTPUT PARAMETERS: + real, intent(inout):: q2(ibeg:iend,jbeg:jend,kn) !< Field output + real, intent(in):: q_min + +! DESCRIPTION: +! IV = 0: constituents +! pe1: pressure at layer edges (from model top to bottom surface) +! in the original vertical coordinate +! pe2: pressure at layer edges (from model top to bottom surface) +! in the new vertical coordinate +! LOCAL VARIABLES: + real dp1(i1:i2,km) + real q4(4,i1:i2,km) + real pl, pr, qsum, dp, esl + integer i, k, l, m, k0, jj + integer LM1,LP0,LP1 + +!$ser verbatim real q4_1_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_2_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_3_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_4_temp(i1:i2, i1:i2,km) +!$ser verbatim real dp1_temp(i1:i2, i1:i2,km) +!$ser verbatim real pe1_temp(i1:i2, i1:i2,km+1) +!$ser verbatim real pe2_temp(i1:i2, i1:i2,kn+1) + +!$ser verbatim real LM1_INDEX(i1:i2, i1:i2, km) +!$ser verbatim real LP0_INDEX(i1:i2, i1:i2, km) + + !$ser verbatim real :: qs_2d(i1:i2, i1:i2) + +!$ser verbatim q4_1_temp = 0.0 +!$ser verbatim q4_2_temp = 0.0 +!$ser verbatim q4_3_temp = 0.0 +!$ser verbatim q4_4_temp = 0.0 +!$ser verbatim dp1_temp = 0.0 +!$ser verbatim pe1_temp = 0.0 +!$ser verbatim pe2_temp = 0.0 +!$ser verbatim LM1_INDEX = 0 +!$ser verbatim LP0_INDEX = 0 +!$ser verbatim qs_2d = 0.0 + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + q4(1,i,k) = q1(i,j,k) + enddo + enddo + +!$ser verbatim if (j == 1) then +!$ser verbatim do k = 1,kn +!$ser verbatim do i = i1,i2 +!$ser verbatim q4_1_temp(i,j,k) = q4(1,i,k) +!$ser verbatim q4_2_temp(i,j,k) = q4(2,i,k) +!$ser verbatim q4_3_temp(i,j,k) = q4(3,i,k) +!$ser verbatim q4_4_temp(i,j,k) = q4(4,i,k) +!$ser verbatim dp1_temp(i,j,k) = dp1(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do i = i1,i2 +!$ser verbatim qs_2d(i,j) = qs(i) +!$ser verbatim enddo +!$ser savepoint Scalar_Profile-In +!$ser data qs_=qs_2d +!$ser data q4_1=q4_1_temp +!$ser data q4_2=q4_2_temp +!$ser data q4_3=q4_3_temp +!$ser data q4_4=q4_4_temp +!$ser data dp1_=dp1_temp +!$ser data q_min=q_min +!$ser verbatim endif + +! 
Compute vertical subgrid distribution + if ( kord > 7 ) then + call scalar_profile( qs, q4, dp1, km, i1, i2, iv, kord, q_min ) + else + call ppm_profile( q4, dp1, km, i1, i2, iv, kord ) + endif + +!$ser verbatim if (j == 1) then +!$ser verbatim do k = 1,kn +!$ser verbatim do i = i1,i2 +!$ser verbatim q4_1_temp(i,j,k) = q4(1,i,k) +!$ser verbatim q4_2_temp(i,j,k) = q4(2,i,k) +!$ser verbatim q4_3_temp(i,j,k) = q4(3,i,k) +!$ser verbatim q4_4_temp(i,j,k) = q4(4,i,k) +!$ser verbatim dp1_temp(i,j,k) = dp1(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Scalar_Profile-Out +!$ser data q4_1=q4_1_temp +!$ser data q4_2=q4_2_temp +!$ser data q4_3=q4_3_temp +!$ser data q4_4=q4_4_temp +!$ser data dp1_=dp1_temp +!$ser verbatim endif + +! NOTE : q1 and q2 fields being passed into map_scalar are identical variables +! even though q1 is declared at INTENT(IN) and q2 is declared as INTENT(IN/OUT). +!$ser verbatim if(j == 1) then +!$ser verbatim do k = 1,kn +!$ser verbatim do i = i1,i2 +!$ser verbatim q4_1_temp(i,j,k) = q4(1,i,k) +!$ser verbatim q4_2_temp(i,j,k) = q4(2,i,k) +!$ser verbatim q4_3_temp(i,j,k) = q4(3,i,k) +!$ser verbatim q4_4_temp(i,j,k) = q4(4,i,k) +!$ser verbatim do jj = i1, i2 +!$ser verbatim dp1_temp(i,jj,k) = dp1(i,k) +!$ser verbatim pe1_temp(i,jj,k) = pe1(i,k) +!$ser verbatim pe2_temp(i,jj,k) = pe2(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do jj = i1, i2 +!$ser verbatim do i = i1, i2 +!$ser verbatim pe1_temp(i,jj,km+1) = pe1(i,km+1) +!$ser verbatim pe2_temp(i,jj,km+1) = pe2(i,km+1) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint Lagrangian_Contribution_Interp-In +!$ser data q1=q1 pe1_=pe1_temp pe2_=pe2_temp q4_1=q4_1_temp q4_2=q4_2_temp +!$ser data q4_3=q4_3_temp q4_4=q4_4_temp dp1_=dp1_temp +!$ser verbatim endif + +! Interpolate field onto target Pressures +! --------------------------------------- + do i=i1,i2 + k0 = 1 + do 555 k=1,kn + LM1 = 1 + LP0 = 1 + do while( LP0.le.km ) + if (pe1(i,LP0).lt.pe2(i,k)) then + LP0 = LP0+1 + else + exit + endif + enddo + LM1 = max(LP0-1,1) + LP0 = min(LP0, km) +! Extrapolate Linearly above first model level +! ---------------------------------------------------- + if( LM1.eq.1 .and. LP0.eq.1 ) then + q2(i,j,k) = q1(i,j,1) + ( q1(i,j,2)-q1(i,j,1) )*( pe2(i,k)-pe1(i,1) ) & + /( pe1(i,2)-pe1(i,1) ) +! Extrapolate Linearly below last model level +! --------------------------------------------------- + else if( LM1.eq.km .and. LP0.eq.km ) then + q2(i,j,k) = q1(i,j,km) + ( q1(i,j,km)-q1(i,j,km-1) )*( pe2(i,k )-pe1(i,km ) ) & + /( pe1(i,km)-pe1(i,km-1) ) +! Interpolate Linearly between levels 1 => 2 and km-1 => km +! ----------------------------------------------------------------- + else if( LM1.eq.1 .or. LP0.eq.km ) then + q2(i,j,k) = q1(i,j,LP0) + ( q1(i,j,LM1)-q1(i,j,LP0) )*( pe2(i,k )-pe1(i,LP0) ) & + /( pe1(i,LM1)-pe1(i,LP0) ) + else + do l=k0,km +! locate the top edge: pe2(i,k) + if( pe2(i,k) >= pe1(i,l) .and. pe2(i,k) <= pe1(i,l+1) ) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if( pe2(i,k+1) <= pe1(i,l+1) ) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + q2(i,j,k) = q4(2,i,l) + 0.5*(q4(4,i,l)+q4(3,i,l)-q4(2,i,l)) & + *(pr+pl)-q4(4,i,l)*r3*(pr*(pr+pl)+pl**2) + k0 = l + goto 555 + else +! Fractional area... + qsum = (pe1(i,l+1)-pe2(i,k))*(q4(2,i,l)+0.5*(q4(4,i,l)+ & + q4(3,i,l)-q4(2,i,l))*(1.+pl)-q4(4,i,l)* & + (r3*(1.+pl*(1.+pl)))) + do m=l+1,km +! locate the bottom edge: pe2(i,k+1) + if( pe2(i,k+1) > pe1(i,m+1) ) then +! 
Whole layer + qsum = qsum + dp1(i,m)*q4(1,i,m) + else + dp = pe2(i,k+1)-pe1(i,m) + esl = dp / dp1(i,m) + qsum = qsum + dp*(q4(2,i,m)+0.5*esl* & + (q4(3,i,m)-q4(2,i,m)+q4(4,i,m)*(1.-r23*esl))) + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif + enddo +123 q2(i,j,k) = qsum / ( pe2(i,k+1) - pe2(i,k) ) + + endif +555 continue + enddo + +!$ser verbatim if(j == 1) then +!$ser savepoint Lagrangian_Contribution_Interp-Out +!$ser data q1=q2 +!$ser verbatim endif + + end subroutine map_scalar + + + subroutine map1_ppm( km, pe1, q1, qs, & + kn, pe2, q2, i1, i2, & + j, ibeg, iend, jbeg, jend, iv, kord) + integer, intent(in) :: i1 !< Starting longitude + integer, intent(in) :: i2 !< Finishing longitude + integer, intent(in) :: iv !< Mode: 0 == constituents 1 == ??? 2 == remap temp with cs scheme + integer, intent(in) :: kord !< Method order + integer, intent(in) :: j !< Current latitude + integer, intent(in) :: ibeg, iend, jbeg, jend + integer, intent(in) :: km !< Original vertical dimension + integer, intent(in) :: kn !< Target vertical dimension + real, intent(in) :: qs(i1:i2) !< bottom BC + real, intent(in) :: pe1(i1:i2,km+1) !< pressure at layer edges from model top to bottom surface in the original vertical coordinate + real, intent(in) :: pe2(i1:i2,kn+1) !< pressure at layer edges from model top to bottom surface in the new vertical coordinate + real, intent(in) :: q1(ibeg:iend,jbeg:jend,km) !< Field input +! INPUT/OUTPUT PARAMETERS: + real, intent(inout):: q2(ibeg:iend,jbeg:jend,kn) !< Field output + +! DESCRIPTION: +! IV = 0: constituents +! pe1: pressure at layer edges (from model top to bottom surface) +! in the original vertical coordinate +! pe2: pressure at layer edges (from model top to bottom surface) +! in the new vertical coordinate + +! LOCAL VARIABLES: + real dp1(i1:i2,km) + real q4(4,i1:i2,km) + real pl, pr, qsum, dp, esl + integer i, k, l, m, k0 + +!$ser verbatim real q4_1_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_2_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_3_temp(i1:i2, i1:i2,km) +!$ser verbatim real q4_4_temp(i1:i2, i1:i2,km) +!$ser verbatim real dp1_temp(i1:i2, i1:i2,km) +!$ser verbatim real :: qs_2d(i1:i2, i1:i2) + +!$ser verbatim q4_1_temp = 0.0 +!$ser verbatim q4_2_temp = 0.0 +!$ser verbatim q4_3_temp = 0.0 +!$ser verbatim q4_4_temp = 0.0 +!$ser verbatim qs_2d = 0.0 + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + q4(1,i,k) = q1(i,j,k) + enddo + enddo + +! Compute vertical subgrid distribution + if ( kord >7 ) then +!$ser verbatim if (j == 1 .and. i2 == 24) then +!$ser verbatim do k = 1,kn +!$ser verbatim do i = i1,i2 +!$ser verbatim q4_1_temp(i,j,k) = q4(1,i,k) +!$ser verbatim q4_2_temp(i,j,k) = q4(2,i,k) +!$ser verbatim q4_3_temp(i,j,k) = q4(3,i,k) +!$ser verbatim q4_4_temp(i,j,k) = q4(4,i,k) +!$ser verbatim dp1_temp(i,j,k) = dp1(i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser verbatim do i = i1,i2 +!$ser verbatim qs_2d(i,j) = qs(i) +!$ser verbatim enddo +!$ser savepoint CS_Profile-In +!$ser data qs_=qs_2d +!$ser data q4_1=q4_1_temp +!$ser data q4_2=q4_2_temp +!$ser data q4_3=q4_3_temp +!$ser data q4_4=q4_4_temp +!$ser data dp1_=dp1_temp +!$ser data kord_=kord +!$ser data iv_=iv +!$ser verbatim endif + call cs_profile( qs, q4, dp1, km, i1, i2, iv, kord ) +!$ser verbatim if (j == 1 .and. 
i2 == 24) then +!$ser verbatim do k = 1,kn +!$ser verbatim do i = i1,i2 +!$ser verbatim q4_1_temp(i,j,k) = q4(1,i,k) +!$ser verbatim q4_2_temp(i,j,k) = q4(2,i,k) +!$ser verbatim q4_3_temp(i,j,k) = q4(3,i,k) +!$ser verbatim q4_4_temp(i,j,k) = q4(4,i,k) +!$ser verbatim enddo +!$ser verbatim enddo +!$ser savepoint CS_Profile-Out +!$ser data q4_1=q4_1_temp +!$ser data q4_2=q4_2_temp +!$ser data q4_3=q4_3_temp +!$ser data q4_4=q4_4_temp +!$ser verbatim endif + else + call ppm_profile( q4, dp1, km, i1, i2, iv, kord ) + endif + + do i=i1,i2 + k0 = 1 + do 555 k=1,kn + do l=k0,km +! locate the top edge: pe2(i,k) + if( pe2(i,k) >= pe1(i,l) .and. pe2(i,k) <= pe1(i,l+1) ) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if( pe2(i,k+1) <= pe1(i,l+1) ) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + q2(i,j,k) = q4(2,i,l) + 0.5*(q4(4,i,l)+q4(3,i,l)-q4(2,i,l)) & + *(pr+pl)-q4(4,i,l)*r3*(pr*(pr+pl)+pl**2) + k0 = l + goto 555 + else +! Fractional area... + qsum = (pe1(i,l+1)-pe2(i,k))*(q4(2,i,l)+0.5*(q4(4,i,l)+ & + q4(3,i,l)-q4(2,i,l))*(1.+pl)-q4(4,i,l)* & + (r3*(1.+pl*(1.+pl)))) + do m=l+1,km +! locate the bottom edge: pe2(i,k+1) + if( pe2(i,k+1) > pe1(i,m+1) ) then +! Whole layer + qsum = qsum + dp1(i,m)*q4(1,i,m) + else + dp = pe2(i,k+1)-pe1(i,m) + esl = dp / dp1(i,m) + qsum = qsum + dp*(q4(2,i,m)+0.5*esl* & + (q4(3,i,m)-q4(2,i,m)+q4(4,i,m)*(1.-r23*esl))) + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif + enddo +123 q2(i,j,k) = qsum / ( pe2(i,k+1) - pe2(i,k) ) +555 continue + enddo + + end subroutine map1_ppm + + + subroutine mapn_tracer(nq, km, pe1, pe2, q1, dp2, kord, j, & + i1, i2, isd, ied, jsd, jed, q_min, fill) +! INPUT PARAMETERS: + integer, intent(in):: km !< vertical dimension + integer, intent(in):: j, nq, i1, i2 + integer, intent(in):: isd, ied, jsd, jed + integer, intent(in):: kord(nq) + real, intent(in):: pe1(i1:i2,km+1) !< pressure at layer edges from model top to bottom surface in the original vertical coordinate + real, intent(in):: pe2(i1:i2,km+1) !< pressure at layer edges from model top to bottom surface in the new vertical coordinate + real, intent(in):: dp2(i1:i2,km) + real, intent(in):: q_min + logical, intent(in):: fill + real, intent(inout):: q1(isd:ied,jsd:jed,km,nq) ! Field input +! LOCAL VARIABLES: + real:: q4(4,i1:i2,km,nq) + real:: q2(i1:i2,km,nq) !< Field output + real:: qsum(nq) + real:: dp1(i1:i2,km) + real:: qs(i1:i2) + real:: pl, pr, dp, esl, fac1, fac2 + integer:: i, k, l, m, k0, iq + + !$ser verbatim integer:: kord_iq, iv, im, js2d + !$ser verbatim js2d=jsd+3 + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + enddo + enddo + + do iq=1,nq + do k=1,km + do i=i1,i2 + q4(1,i,k,iq) = q1(i,j,k,iq) + enddo + enddo + call scalar_profile( qs, q4(1,i1,1,iq), dp1, km, i1, i2, 0, kord(iq), q_min ) + enddo + +! Mapping + do 4000 i=i1,i2 + k0 = 1 + do 555 k=1,km + do 100 l=k0,km +! locate the top edge: pe2(i,k) + if(pe2(i,k) >= pe1(i,l) .and. pe2(i,k) <= pe1(i,l+1)) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if(pe2(i,k+1) <= pe1(i,l+1)) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + fac1 = pr + pl + fac2 = r3*(pr*fac1 + pl*pl) + fac1 = 0.5*fac1 + do iq=1,nq + q2(i,k,iq) = q4(2,i,l,iq) + (q4(4,i,l,iq)+q4(3,i,l,iq)-q4(2,i,l,iq))*fac1 & + - q4(4,i,l,iq)*fac2 + enddo + k0 = l + goto 555 + else +! Fractional area... + dp = pe1(i,l+1) - pe2(i,k) + fac1 = 1. 
+ pl + fac2 = r3*(1.+pl*fac1) + fac1 = 0.5*fac1 + do iq=1,nq + qsum(iq) = dp*(q4(2,i,l,iq) + (q4(4,i,l,iq)+ & + q4(3,i,l,iq) - q4(2,i,l,iq))*fac1 - q4(4,i,l,iq)*fac2) + enddo + do m=l+1,km +! locate the bottom edge: pe2(i,k+1) + if(pe2(i,k+1) > pe1(i,m+1) ) then + ! Whole layer.. + do iq=1,nq + qsum(iq) = qsum(iq) + dp1(i,m)*q4(1,i,m,iq) + enddo + else + dp = pe2(i,k+1)-pe1(i,m) + esl = dp / dp1(i,m) + fac1 = 0.5*esl + fac2 = 1.-r23*esl + do iq=1,nq + qsum(iq) = qsum(iq) + dp*( q4(2,i,m,iq) + fac1*( & + q4(3,i,m,iq)-q4(2,i,m,iq)+q4(4,i,m,iq)*fac2 ) ) + enddo + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif +100 continue +123 continue + do iq=1,nq + q2(i,k,iq) = qsum(iq) / dp2(i,k) + enddo +555 continue +4000 continue + + !$ser verbatim if(j == js2d ) then + !$ser verbatim im = i2-i1+1 + !$ser verbatim iv = 9 + !$ser savepoint Fillz-In + ! Note : Currently, serializing nq=nq and q2tracers=q2(:,:,1:nq) will not run the translate test + ! To successfully run the translate test, serialize nq=iv and q2tracers=q2(:,:,1:iv) + !$ser data im=im km=km nq=iv dp2=dp2 q2tracers=q2(:,:,1:iv) + !$ser verbatim endif + + if (fill) call fillz(i2-i1+1, km, nq, q2, dp2) + + !$ser verbatim if(j == js2d ) then + !$ser savepoint Fillz-Out + !$ser data q2tracers=q2(:,:,1:iv) + !$ser verbatim endif + + do iq=1,nq +! if (fill) call fillz(i2-i1+1, km, 1, q2(i1,1,iq), dp2) + do k=1,km + do i=i1,i2 + q1(i,j,k,iq) = q2(i,k,iq) + enddo + enddo + enddo + + end subroutine mapn_tracer + + + subroutine map1_q2(km, pe1, q1, & + kn, pe2, q2, dp2, & + i1, i2, iv, kord, j, & + ibeg, iend, jbeg, jend, q_min ) + + +! INPUT PARAMETERS: + integer, intent(in) :: j + integer, intent(in) :: i1, i2 + integer, intent(in) :: ibeg, iend, jbeg, jend + integer, intent(in) :: iv !< Mode: 0 == constituents 1 == ??? + integer, intent(in) :: kord + integer, intent(in) :: km !< Original vertical dimension + integer, intent(in) :: kn !< Target vertical dimension + + real, intent(in) :: pe1(i1:i2,km+1) !< pressure at layer edges from model top to bottom surface in the original vertical coordinate + real, intent(in) :: pe2(i1:i2,kn+1) !< pressure at layer edges from model top to bottom surface in the new vertical coordinate + real, intent(in) :: q1(ibeg:iend,jbeg:jend,km) !< Field input + real, intent(in) :: dp2(i1:i2,kn) + real, intent(in) :: q_min +! INPUT/OUTPUT PARAMETERS: + real, intent(inout):: q2(i1:i2,kn) !< Field output +! LOCAL VARIABLES: + real qs(i1:i2) + real dp1(i1:i2,km) + real q4(4,i1:i2,km) + real pl, pr, qsum, dp, esl + + integer i, k, l, m, k0 + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + q4(1,i,k) = q1(i,j,k) + enddo + enddo + +! Compute vertical subgrid distribution + if ( kord >7 ) then + call scalar_profile( qs, q4, dp1, km, i1, i2, iv, kord, q_min ) + else + call ppm_profile( q4, dp1, km, i1, i2, iv, kord ) + endif + +! Mapping + do 5000 i=i1,i2 + k0 = 1 + do 555 k=1,kn + do 100 l=k0,km +! locate the top edge: pe2(i,k) + if(pe2(i,k) >= pe1(i,l) .and. pe2(i,k) <= pe1(i,l+1)) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if(pe2(i,k+1) <= pe1(i,l+1)) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + q2(i,k) = q4(2,i,l) + 0.5*(q4(4,i,l)+q4(3,i,l)-q4(2,i,l)) & + *(pr+pl)-q4(4,i,l)*r3*(pr*(pr+pl)+pl**2) + k0 = l + goto 555 + else +! Fractional area... + qsum = (pe1(i,l+1)-pe2(i,k))*(q4(2,i,l)+0.5*(q4(4,i,l)+ & + q4(3,i,l)-q4(2,i,l))*(1.+pl)-q4(4,i,l)* & + (r3*(1.+pl*(1.+pl)))) + do m=l+1,km +! 
locate the bottom edge: pe2(i,k+1) + if(pe2(i,k+1) > pe1(i,m+1) ) then + ! Whole layer.. + qsum = qsum + dp1(i,m)*q4(1,i,m) + else + dp = pe2(i,k+1)-pe1(i,m) + esl = dp / dp1(i,m) + qsum = qsum + dp*(q4(2,i,m)+0.5*esl* & + (q4(3,i,m)-q4(2,i,m)+q4(4,i,m)*(1.-r23*esl))) + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif +100 continue +123 q2(i,k) = qsum / dp2(i,k) +555 continue +5000 continue + + end subroutine map1_q2 + + + + subroutine remap_2d(km, pe1, q1, & + kn, pe2, q2, & + i1, i2, iv, kord) + integer, intent(in):: i1, i2 + integer, intent(in):: iv !< Mode: 0 == constituents 1 ==others + integer, intent(in):: kord + integer, intent(in):: km !< Original vertical dimension + integer, intent(in):: kn !< Target vertical dimension + real, intent(in):: pe1(i1:i2,km+1) !< Pressure at layer edges from model top to bottom surface in the original vertical coordinate + real, intent(in):: pe2(i1:i2,kn+1) !< Pressure at layer edges from model top to bottom surface in the new vertical coordinate + real, intent(in) :: q1(i1:i2,km) !< Field input + real, intent(out):: q2(i1:i2,kn) !< Field output +! LOCAL VARIABLES: + real qs(i1:i2) + real dp1(i1:i2,km) + real q4(4,i1:i2,km) + real pl, pr, qsum, dp, esl + integer i, k, l, m, k0 + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + q4(1,i,k) = q1(i,k) + enddo + enddo + +! Compute vertical subgrid distribution + if ( kord >7 ) then + call cs_profile( qs, q4, dp1, km, i1, i2, iv, kord ) + else + call ppm_profile( q4, dp1, km, i1, i2, iv, kord ) + endif + + do i=i1,i2 + k0 = 1 + do 555 k=1,kn +#ifdef OLD_TOP_EDGE + if( pe2(i,k+1) <= pe1(i,1) ) then +! Entire grid above old ptop + q2(i,k) = q4(2,i,1) + elseif( pe2(i,k) < pe1(i,1) .and. pe2(i,k+1)>pe1(i,1) ) then +! Partially above old ptop: + q2(i,k) = q1(i,1) +#else + if( pe2(i,k) <= pe1(i,1) ) then +! above old ptop: + q2(i,k) = q1(i,1) +#endif + else + do l=k0,km +! locate the top edge: pe2(i,k) + if( pe2(i,k) >= pe1(i,l) .and. pe2(i,k) <= pe1(i,l+1) ) then + pl = (pe2(i,k)-pe1(i,l)) / dp1(i,l) + if(pe2(i,k+1) <= pe1(i,l+1)) then +! entire new grid is within the original grid + pr = (pe2(i,k+1)-pe1(i,l)) / dp1(i,l) + q2(i,k) = q4(2,i,l) + 0.5*(q4(4,i,l)+q4(3,i,l)-q4(2,i,l)) & + *(pr+pl)-q4(4,i,l)*r3*(pr*(pr+pl)+pl**2) + k0 = l + goto 555 + else +! Fractional area... + qsum = (pe1(i,l+1)-pe2(i,k))*(q4(2,i,l)+0.5*(q4(4,i,l)+ & + q4(3,i,l)-q4(2,i,l))*(1.+pl)-q4(4,i,l)* & + (r3*(1.+pl*(1.+pl)))) + do m=l+1,km +! locate the bottom edge: pe2(i,k+1) + if(pe2(i,k+1) > pe1(i,m+1) ) then + ! Whole layer.. + qsum = qsum + dp1(i,m)*q4(1,i,m) + else + dp = pe2(i,k+1)-pe1(i,m) + esl = dp / dp1(i,m) + qsum = qsum + dp*(q4(2,i,m)+0.5*esl* & + (q4(3,i,m)-q4(2,i,m)+q4(4,i,m)*(1.-r23*esl))) + k0 = m + goto 123 + endif + enddo + goto 123 + endif + endif + enddo +123 q2(i,k) = qsum / ( pe2(i,k+1) - pe2(i,k) ) + endif +555 continue + enddo + + end subroutine remap_2d + + +!>@brief Optimized vertical profile reconstruction: +!> Latest: Apr 2008 S.-J. Lin, NOAA/GFDL + subroutine scalar_profile(qs, a4, delp, km, i1, i2, iv, kord, qmin) +! Optimized vertical profile reconstruction: +! Latest: Apr 2008 S.-J. 
Lin, NOAA/GFDL + integer, intent(in):: i1, i2 + integer, intent(in):: km !< vertical dimension + integer, intent(in):: iv !< iv =-1: winds iv = 0: positive definite scalars iv = 1: others + integer, intent(in):: kord + real, intent(in) :: qs(i1:i2) + real, intent(in) :: delp(i1:i2,km) !< Layer pressure thickness + real, intent(inout):: a4(4,i1:i2,km) !< Interpolated values + real, intent(in):: qmin +!----------------------------------------------------------------------- + logical, dimension(i1:i2,km):: extm, ext5, ext6 + real gam(i1:i2,km) + real q(i1:i2,km+1) + real d4(i1:i2) + real bet, a_bot, grat + real pmp_1, lac_1, pmp_2, lac_2, x0, x1 + integer i, k, im + + if ( iv .eq. -2 ) then + do i=i1,i2 + gam(i,2) = 0.5 + q(i,1) = 1.5*a4(1,i,1) + enddo + do k=2,km-1 + do i=i1, i2 + grat = delp(i,k-1) / delp(i,k) + bet = 2. + grat + grat - gam(i,k) + q(i,k) = (3.*(a4(1,i,k-1)+a4(1,i,k)) - q(i,k-1))/bet + gam(i,k+1) = grat / bet + enddo + enddo + do i=i1,i2 + grat = delp(i,km-1) / delp(i,km) + q(i,km) = (3.*(a4(1,i,km-1)+a4(1,i,km)) - grat*qs(i) - q(i,km-1)) / & + (2. + grat + grat - gam(i,km)) + q(i,km+1) = qs(i) + enddo + do k=km-1,1,-1 + do i=i1,i2 + q(i,k) = q(i,k) - gam(i,k+1)*q(i,k+1) + enddo + enddo + else + do i=i1,i2 + grat = delp(i,2) / delp(i,1) ! grid ratio + bet = grat*(grat+0.5) + q(i,1) = ( (grat+grat)*(grat+1.)*a4(1,i,1) + a4(1,i,2) ) / bet + gam(i,1) = ( 1. + grat*(grat+1.5) ) / bet + enddo + + do k=2,km + do i=i1,i2 + d4(i) = delp(i,k-1) / delp(i,k) + bet = 2. + d4(i) + d4(i) - gam(i,k-1) + q(i,k) = ( 3.*(a4(1,i,k-1)+d4(i)*a4(1,i,k)) - q(i,k-1) )/bet + gam(i,k) = d4(i) / bet + enddo + enddo + + do i=i1,i2 + a_bot = 1. + d4(i)*(d4(i)+1.5) + q(i,km+1) = (2.*d4(i)*(d4(i)+1.)*a4(1,i,km)+a4(1,i,km-1)-a_bot*q(i,km)) & + / ( d4(i)*(d4(i)+0.5) - a_bot*gam(i,km) ) + enddo + + do k=km,1,-1 + do i=i1,i2 + q(i,k) = q(i,k) - gam(i,k)*q(i,k+1) + enddo + enddo + endif + +!----- Perfectly linear scheme -------------------------------- + if ( abs(kord) > 16 ) then + do k=1,km + do i=i1,i2 + a4(2,i,k) = q(i,k ) + a4(3,i,k) = q(i,k+1) + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + enddo + return + endif +!----- Perfectly linear scheme -------------------------------- + +!------------------ +! Apply constraints +!------------------ + im = i2 - i1 + 1 + +! Apply *large-scale* constraints + do i=i1,i2 + q(i,2) = min( q(i,2), max(a4(1,i,1), a4(1,i,2)) ) + q(i,2) = max( q(i,2), min(a4(1,i,1), a4(1,i,2)) ) + enddo + + do k=2,km + do i=i1,i2 + gam(i,k) = a4(1,i,k) - a4(1,i,k-1) + enddo + enddo + +! Interior: + do k=3,km-1 + do i=i1,i2 + if ( gam(i,k-1)*gam(i,k+1)>0. ) then +! Apply large-scale constraint to ALL fields if not local max/min + q(i,k) = min( q(i,k), max(a4(1,i,k-1),a4(1,i,k)) ) + q(i,k) = max( q(i,k), min(a4(1,i,k-1),a4(1,i,k)) ) + else + if ( gam(i,k-1) > 0. ) then +! There exists a local max + q(i,k) = max(q(i,k), min(a4(1,i,k-1),a4(1,i,k))) + else +! There exists a local min + q(i,k) = min(q(i,k), max(a4(1,i,k-1),a4(1,i,k))) + if ( iv==0 ) q(i,k) = max(0., q(i,k)) + endif + endif + enddo + enddo + +! Bottom: + do i=i1,i2 + q(i,km) = min( q(i,km), max(a4(1,i,km-1), a4(1,i,km)) ) + q(i,km) = max( q(i,km), min(a4(1,i,km-1), a4(1,i,km)) ) + enddo + + do k=1,km + do i=i1,i2 + a4(2,i,k) = q(i,k ) + a4(3,i,k) = q(i,k+1) + enddo + enddo + + do k=1,km + if ( k==1 .or. k==km ) then + do i=i1,i2 + extm(i,k) = (a4(2,i,k)-a4(1,i,k)) * (a4(3,i,k)-a4(1,i,k)) > 0. + enddo + else + do i=i1,i2 + extm(i,k) = gam(i,k)*gam(i,k+1) < 0. 
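+! Note: extm flags layers whose cell mean is a local extremum of the column
+! (sign change of successive mean differences); ext5/ext6, set below when
+! abs(kord) > 9, appear to compare the subgrid curvature against the edge
+! jump and select the stronger limiters used by the kord 10-16 options.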
+       enddo
+    endif
+    if ( abs(kord) > 9 ) then
+       do i=i1,i2
+          x0 = 2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))
+          x1 = abs(a4(2,i,k)-a4(3,i,k))
+          a4(4,i,k) = 3.*x0
+          ext5(i,k) = abs(x0) > x1
+          ext6(i,k) = abs(a4(4,i,k)) > x1
+       enddo
+    endif
+ enddo
+
+!---------------------------
+! Apply subgrid constraints:
+!---------------------------
+! f(s) = AL + s*[(AR-AL) + A6*(1-s)] ( 0 <= s <= 1 )
+! Top 2 and bottom 2 layers always use monotonic mapping
+
+ if ( iv==0 ) then
+    do i=i1,i2
+       a4(2,i,1) = max(0., a4(2,i,1))
+    enddo
+ elseif ( iv==-1 ) then
+    do i=i1,i2
+       if ( a4(2,i,1)*a4(1,i,1) <= 0. ) a4(2,i,1) = 0.
+    enddo
+ elseif ( iv==2 ) then
+    do i=i1,i2
+       a4(2,i,1) = a4(1,i,1)
+       a4(3,i,1) = a4(1,i,1)
+       a4(4,i,1) = 0.
+    enddo
+ endif
+
+ if ( iv/=2 ) then
+    do i=i1,i2
+       a4(4,i,1) = 3.*(2.*a4(1,i,1) - (a4(2,i,1)+a4(3,i,1)))
+    enddo
+    call cs_limiters(im, extm(i1,1), a4(1,i1,1), 1)
+ endif
+
+! k=2
+ do i=i1,i2
+    a4(4,i,2) = 3.*(2.*a4(1,i,2) - (a4(2,i,2)+a4(3,i,2)))
+ enddo
+ call cs_limiters(im, extm(i1,2), a4(1,i1,2), 2)
+
+!-------------------------------------
+! Huynh's 2nd constraint for interior:
+!-------------------------------------
+ do k=3,km-2
+    if ( abs(kord)<9 ) then
+       do i=i1,i2
+! Left edges
+          pmp_1 = a4(1,i,k) - 2.*gam(i,k+1)
+          lac_1 = pmp_1 + 1.5*gam(i,k+2)
+          a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), &
+                      max(a4(1,i,k), pmp_1, lac_1) )
+! Right edges
+          pmp_2 = a4(1,i,k) + 2.*gam(i,k)
+          lac_2 = pmp_2 - 1.5*gam(i,k-1)
+          a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), &
+                      max(a4(1,i,k), pmp_2, lac_2) )
+
+          a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+       enddo
+
+    elseif ( abs(kord)==9 ) then
+       do i=i1,i2
+          if ( extm(i,k) .and. extm(i,k-1) ) then
+! grid-scale 2-delta-z wave detected
+             a4(2,i,k) = a4(1,i,k)
+             a4(3,i,k) = a4(1,i,k)
+             a4(4,i,k) = 0.
+          else if ( extm(i,k) .and. extm(i,k+1) ) then
+! grid-scale 2-delta-z wave detected
+             a4(2,i,k) = a4(1,i,k)
+             a4(3,i,k) = a4(1,i,k)
+             a4(4,i,k) = 0.
+          else if ( extm(i,k) .and. a4(1,i,k)<qmin ) then
+! grid-scale 2-delta-z wave detected
+             a4(2,i,k) = a4(1,i,k)
+             a4(3,i,k) = a4(1,i,k)
+             a4(4,i,k) = 0.
+          else
+             a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+! Check within the smooth region if subgrid profile is non-monotonic
+             if( abs(a4(4,i,k)) > abs(a4(2,i,k)-a4(3,i,k)) ) then
+                pmp_1 = a4(1,i,k) - 2.*gam(i,k+1)
+                lac_1 = pmp_1 + 1.5*gam(i,k+2)
+                a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), &
+                            max(a4(1,i,k), pmp_1, lac_1) )
+                pmp_2 = a4(1,i,k) + 2.*gam(i,k)
+                lac_2 = pmp_2 - 1.5*gam(i,k-1)
+                a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), &
+                            max(a4(1,i,k), pmp_2, lac_2) )
+                a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+             endif
+          endif
+       enddo
+    elseif ( abs(kord)==10 ) then
+       do i=i1,i2
+          if( ext5(i,k) ) then
+             if( ext5(i,k-1) .or. ext5(i,k+1) ) then
+                a4(2,i,k) = a4(1,i,k)
+                a4(3,i,k) = a4(1,i,k)
+             elseif ( ext6(i,k-1) .or. ext6(i,k+1) ) then
+                pmp_1 = a4(1,i,k) - 2.*gam(i,k+1)
+                lac_1 = pmp_1 + 1.5*gam(i,k+2)
+                a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), &
+                            max(a4(1,i,k), pmp_1, lac_1) )
+                pmp_2 = a4(1,i,k) + 2.*gam(i,k)
+                lac_2 = pmp_2 - 1.5*gam(i,k-1)
+                a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), &
+                            max(a4(1,i,k), pmp_2, lac_2) )
+             endif
+          elseif( ext6(i,k) ) then
+             if( ext5(i,k-1) .or.
ext5(i,k+1) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + endif + endif + enddo + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + elseif ( abs(kord)==12 ) then + do i=i1,i2 + if( extm(i,k) ) then + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + a4(4,i,k) = 0. + else ! not a local extremum + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) +! Check within the smooth region if subgrid profile is non-monotonic + if( abs(a4(4,i,k)) > abs(a4(2,i,k)-a4(3,i,k)) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) + endif + endif + enddo + elseif ( abs(kord)==13 ) then + do i=i1,i2 + if( ext6(i,k) ) then + if ( ext6(i,k-1) .and. ext6(i,k+1) ) then +! grid-scale 2-delta-z wave detected + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + endif + endif + enddo + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + elseif ( abs(kord)==14 ) then + + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + + elseif ( abs(kord)==15 ) then ! Revised abs(kord)=9 scheme + do i=i1,i2 + if ( ext5(i,k) .and. ext5(i,k-1) ) then + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + else if ( ext5(i,k) .and. ext5(i,k+1) ) then + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + else if ( ext5(i,k) .and. a4(1,i,k)@brief The subroutine 'cs_profile' performs the optimized vertical profile reconstruction: +!>@date April 2008 +!>@author S. J. Lin, NOAA/GFDL + subroutine cs_profile(qs, a4, delp, km, i1, i2, iv, kord) +! Optimized vertical profile reconstruction: +! Latest: Apr 2008 S.-J. Lin, NOAA/GFDL + integer, intent(in):: i1, i2 + integer, intent(in):: km !< vertical dimension + integer, intent(in):: iv !< iv =-1: winds + !< iv = 0: positive definite scalars + !< iv = 1: others + integer, intent(in):: kord + real, intent(in) :: qs(i1:i2) + real, intent(in) :: delp(i1:i2,km) !< layer pressure thickness + real, intent(inout):: a4(4,i1:i2,km) !< Interpolated values +!----------------------------------------------------------------------- + logical, dimension(i1:i2,km):: extm, ext5, ext6 + real gam(i1:i2,km) + real q(i1:i2,km+1) + real d4(i1:i2) + real bet, a_bot, grat + real pmp_1, lac_1, pmp_2, lac_2, x0, x1 + integer i, k, im + + if ( iv .eq. -2 ) then + do i=i1,i2 + gam(i,2) = 0.5 + q(i,1) = 1.5*a4(1,i,1) + enddo + do k=2,km-1 + do i=i1, i2 + grat = delp(i,k-1) / delp(i,k) + bet = 2. + grat + grat - gam(i,k) + q(i,k) = (3.*(a4(1,i,k-1)+a4(1,i,k)) - q(i,k-1))/bet + gam(i,k+1) = grat / bet + enddo + enddo + do i=i1,i2 + grat = delp(i,km-1) / delp(i,km) + q(i,km) = (3.*(a4(1,i,km-1)+a4(1,i,km)) - grat*qs(i) - q(i,km-1)) / & + (2. + grat + grat - gam(i,km)) + q(i,km+1) = qs(i) + enddo + do k=km-1,1,-1 + do i=i1,i2 + q(i,k) = q(i,k) - gam(i,k+1)*q(i,k+1) + enddo + enddo + else + do i=i1,i2 + grat = delp(i,2) / delp(i,1) ! 
grid ratio + bet = grat*(grat+0.5) + q(i,1) = ( (grat+grat)*(grat+1.)*a4(1,i,1) + a4(1,i,2) ) / bet + gam(i,1) = ( 1. + grat*(grat+1.5) ) / bet + enddo + + do k=2,km + do i=i1,i2 + d4(i) = delp(i,k-1) / delp(i,k) + bet = 2. + d4(i) + d4(i) - gam(i,k-1) + q(i,k) = ( 3.*(a4(1,i,k-1)+d4(i)*a4(1,i,k)) - q(i,k-1) )/bet + gam(i,k) = d4(i) / bet + enddo + enddo + + do i=i1,i2 + a_bot = 1. + d4(i)*(d4(i)+1.5) + q(i,km+1) = (2.*d4(i)*(d4(i)+1.)*a4(1,i,km)+a4(1,i,km-1)-a_bot*q(i,km)) & + / ( d4(i)*(d4(i)+0.5) - a_bot*gam(i,km) ) + enddo + + do k=km,1,-1 + do i=i1,i2 + q(i,k) = q(i,k) - gam(i,k)*q(i,k+1) + enddo + enddo + endif +!----- Perfectly linear scheme -------------------------------- + if ( abs(kord) > 16 ) then + do k=1,km + do i=i1,i2 + a4(2,i,k) = q(i,k ) + a4(3,i,k) = q(i,k+1) + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + enddo + return + endif +!----- Perfectly linear scheme -------------------------------- + +!------------------ +! Apply constraints +!------------------ + im = i2 - i1 + 1 + +! Apply *large-scale* constraints + do i=i1,i2 + q(i,2) = min( q(i,2), max(a4(1,i,1), a4(1,i,2)) ) + q(i,2) = max( q(i,2), min(a4(1,i,1), a4(1,i,2)) ) + enddo + + do k=2,km + do i=i1,i2 + gam(i,k) = a4(1,i,k) - a4(1,i,k-1) + enddo + enddo + +! Interior: + do k=3,km-1 + do i=i1,i2 + if ( gam(i,k-1)*gam(i,k+1)>0. ) then +! Apply large-scale constraint to ALL fields if not local max/min + q(i,k) = min( q(i,k), max(a4(1,i,k-1),a4(1,i,k)) ) + q(i,k) = max( q(i,k), min(a4(1,i,k-1),a4(1,i,k)) ) + else + if ( gam(i,k-1) > 0. ) then +! There exists a local max + q(i,k) = max(q(i,k), min(a4(1,i,k-1),a4(1,i,k))) + else +! There exists a local min + q(i,k) = min(q(i,k), max(a4(1,i,k-1),a4(1,i,k))) + if ( iv==0 ) q(i,k) = max(0., q(i,k)) + endif + endif + enddo + enddo + +! Bottom: + do i=i1,i2 + q(i,km) = min( q(i,km), max(a4(1,i,km-1), a4(1,i,km)) ) + q(i,km) = max( q(i,km), min(a4(1,i,km-1), a4(1,i,km)) ) + enddo + + do k=1,km + do i=i1,i2 + a4(2,i,k) = q(i,k ) + a4(3,i,k) = q(i,k+1) + enddo + enddo + + do k=1,km + if ( k==1 .or. k==km ) then + do i=i1,i2 + extm(i,k) = (a4(2,i,k)-a4(1,i,k)) * (a4(3,i,k)-a4(1,i,k)) > 0. + enddo + else + do i=i1,i2 + extm(i,k) = gam(i,k)*gam(i,k+1) < 0. + enddo + endif + if ( abs(kord) > 9 ) then + do i=i1,i2 + x0 = 2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)) + x1 = abs(a4(2,i,k)-a4(3,i,k)) + a4(4,i,k) = 3.*x0 + ext5(i,k) = abs(x0) > x1 + ext6(i,k) = abs(a4(4,i,k)) > x1 + enddo + endif + enddo + +!--------------------------- +! Apply subgrid constraints: +!--------------------------- +! f(s) = AL + s*[(AR-AL) + A6*(1-s)] ( 0 <= s <= 1 ) +! Top 2 and bottom 2 layers always use monotonic mapping + + if ( iv==0 ) then + do i=i1,i2 + a4(2,i,1) = max(0., a4(2,i,1)) + enddo + elseif ( iv==-1 ) then + do i=i1,i2 + if ( a4(2,i,1)*a4(1,i,1) <= 0. ) a4(2,i,1) = 0. + enddo + elseif ( iv==2 ) then + do i=i1,i2 + a4(2,i,1) = a4(1,i,1) + a4(3,i,1) = a4(1,i,1) + a4(4,i,1) = 0. + enddo + endif + + if ( iv/=2 ) then + do i=i1,i2 + a4(4,i,1) = 3.*(2.*a4(1,i,1) - (a4(2,i,1)+a4(3,i,1))) + enddo + call cs_limiters(im, extm(i1,1), a4(1,i1,1), 1) + endif + +! k=2 + do i=i1,i2 + a4(4,i,2) = 3.*(2.*a4(1,i,2) - (a4(2,i,2)+a4(3,i,2))) + enddo + call cs_limiters(im, extm(i1,2), a4(1,i1,2), 2) + +!------------------------------------- +! Huynh's 2nd constraint for interior: +!------------------------------------- + do k=3,km-2 + if ( abs(kord)<9 ) then + do i=i1,i2 +! 
Left edges + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) +! Right edges + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + + elseif ( abs(kord)==9 ) then + do i=i1,i2 + if ( extm(i,k) .and. extm(i,k-1) ) then ! c90_mp122 +! grid-scale 2-delta-z wave detected + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + a4(4,i,k) = 0. + else if ( extm(i,k) .and. extm(i,k+1) ) then ! c90_mp122 +! grid-scale 2-delta-z wave detected + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + a4(4,i,k) = 0. + else + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) +! Check within the smooth region if subgrid profile is non-monotonic + if( abs(a4(4,i,k)) > abs(a4(2,i,k)-a4(3,i,k)) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) + endif + endif + enddo + elseif ( abs(kord)==10 ) then + do i=i1,i2 + if( ext5(i,k) ) then + if( ext5(i,k-1) .or. ext5(i,k+1) ) then + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + elseif ( ext6(i,k-1) .or. ext6(i,k+1) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + endif + elseif( ext6(i,k) ) then + if( ext5(i,k-1) .or. ext5(i,k+1) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + endif + endif + enddo + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + elseif ( abs(kord)==12 ) then + do i=i1,i2 + if( extm(i,k) ) then +! grid-scale 2-delta-z wave detected + a4(2,i,k) = a4(1,i,k) + a4(3,i,k) = a4(1,i,k) + a4(4,i,k) = 0. + else ! not a local extremum + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) +! Check within the smooth region if subgrid profile is non-monotonic + if( abs(a4(4,i,k)) > abs(a4(2,i,k)-a4(3,i,k)) ) then + pmp_1 = a4(1,i,k) - 2.*gam(i,k+1) + lac_1 = pmp_1 + 1.5*gam(i,k+2) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), & + max(a4(1,i,k), pmp_1, lac_1) ) + pmp_2 = a4(1,i,k) + 2.*gam(i,k) + lac_2 = pmp_2 - 1.5*gam(i,k-1) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), & + max(a4(1,i,k), pmp_2, lac_2) ) + a4(4,i,k) = 6.*a4(1,i,k) - 3.*(a4(2,i,k)+a4(3,i,k)) + endif + endif + enddo + elseif ( abs(kord)==13 ) then + do i=i1,i2 + if( ext6(i,k) ) then + if ( ext6(i,k-1) .and. ext6(i,k+1) ) then +! 
grid-scale 2-delta-z wave detected
+                a4(2,i,k) = a4(1,i,k)
+                a4(3,i,k) = a4(1,i,k)
+             endif
+          endif
+       enddo
+       do i=i1,i2
+          a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+       enddo
+    elseif ( abs(kord)==14 ) then
+
+       do i=i1,i2
+          a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+       enddo
+
+    elseif ( abs(kord)==15 ) then   ! revised kord=9 scheme
+       do i=i1,i2
+          if ( ext5(i,k) ) then  ! c90_mp122
+             if ( ext5(i,k-1) .or. ext5(i,k+1) ) then  ! c90_mp122
+! grid-scale 2-delta-z wave detected
+                a4(2,i,k) = a4(1,i,k)
+                a4(3,i,k) = a4(1,i,k)
+             endif
+          elseif( ext6(i,k) ) then
+! Check within the smooth region if subgrid profile is non-monotonic
+             pmp_1 = a4(1,i,k) - 2.*gam(i,k+1)
+             lac_1 = pmp_1 + 1.5*gam(i,k+2)
+             a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), &
+                         max(a4(1,i,k), pmp_1, lac_1) )
+             pmp_2 = a4(1,i,k) + 2.*gam(i,k)
+             lac_2 = pmp_2 - 1.5*gam(i,k-1)
+             a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), &
+                         max(a4(1,i,k), pmp_2, lac_2) )
+          endif
+       enddo
+       do i=i1,i2
+          a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+       enddo
+    elseif ( abs(kord)==16 ) then
+       do i=i1,i2
+          if( ext5(i,k) ) then
+             if ( ext5(i,k-1) .or. ext5(i,k+1) ) then
+                a4(2,i,k) = a4(1,i,k)
+                a4(3,i,k) = a4(1,i,k)
+             elseif ( ext6(i,k-1) .or. ext6(i,k+1) ) then
+                ! Left edges
+                pmp_1 = a4(1,i,k) - 2.*gam(i,k+1)
+                lac_1 = pmp_1 + 1.5*gam(i,k+2)
+                a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), pmp_1, lac_1)), &
+                            max(a4(1,i,k), pmp_1, lac_1) )
+                ! Right edges
+                pmp_2 = a4(1,i,k) + 2.*gam(i,k)
+                lac_2 = pmp_2 - 1.5*gam(i,k-1)
+                a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), pmp_2, lac_2)), &
+                            max(a4(1,i,k), pmp_2, lac_2) )
+             endif
+          endif
+       enddo
+       do i=i1,i2
+          a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+       enddo
+    else      ! kord = 11
+       do i=i1,i2
+          if ( ext5(i,k) .and. (ext5(i,k-1) .or. ext5(i,k+1)) ) then
+! Noisy region:
+             a4(2,i,k) = a4(1,i,k)
+             a4(3,i,k) = a4(1,i,k)
+             a4(4,i,k) = 0.
+          else
+             a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+          endif
+       enddo
+    endif
+
+! Additional constraint to ensure positivity
+    if ( iv==0 ) call cs_limiters(im, extm(i1,k), a4(1,i1,k), 0)
+
+ enddo      ! k-loop
+
+!----------------------------------
+! Bottom layer subgrid constraints:
+!----------------------------------
+ if ( iv==0 ) then
+    do i=i1,i2
+       a4(3,i,km) = max(0., a4(3,i,km))
+    enddo
+ elseif ( iv .eq. -1 ) then
+    do i=i1,i2
+       if ( a4(3,i,km)*a4(1,i,km) <= 0. ) a4(3,i,km) = 0.
+    enddo
+ endif
+
+ do k=km-1,km
+    do i=i1,i2
+       a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k)))
+    enddo
+    if(k==(km-1)) call cs_limiters(im, extm(i1,k), a4(1,i1,k), 2)
+    if(k== km   ) call cs_limiters(im, extm(i1,k), a4(1,i1,k), 1)
+ enddo
+
+ end subroutine cs_profile
+
+
+ subroutine cs_limiters(im, extm, a4, iv)
+ integer, intent(in) :: im
+ integer, intent(in) :: iv
+ logical, intent(in) :: extm(im)
+ real , intent(inout) :: a4(4,im)   !< PPM array
+! LOCAL VARIABLES:
+ real da1, da2, a6da
+ integer i
+
+ if ( iv==0 ) then
+! Positive definite constraint
+    do i=1,im
+       if( a4(1,i)<=0.) then
+          a4(2,i) = a4(1,i)
+          a4(3,i) = a4(1,i)
+          a4(4,i) = 0.
+       else
+          if( abs(a4(3,i)-a4(2,i)) < -a4(4,i) ) then
+             if( (a4(1,i)+0.25*(a4(3,i)-a4(2,i))**2/a4(4,i)+a4(4,i)*r12) < 0. ) then
+! local minimum is negative
+                if( a4(1,i)<a4(3,i) .and. a4(1,i)<a4(2,i) ) then
+                   a4(3,i) = a4(1,i)
+                   a4(2,i) = a4(1,i)
+                   a4(4,i) = 0.
+                elseif( a4(3,i) > a4(2,i) ) then
+                   a4(4,i) = 3.*(a4(2,i)-a4(1,i))
+                   a4(3,i) = a4(2,i) - a4(4,i)
+                else
+                   a4(4,i) = 3.*(a4(3,i)-a4(1,i))
+                   a4(2,i) = a4(3,i) - a4(4,i)
+                endif
+             endif
+          endif
+       endif
+    enddo
+ elseif ( iv==1 ) then
+    do i=1,im
+       if( (a4(1,i)-a4(2,i))*(a4(1,i)-a4(3,i))>=0.
) then + a4(2,i) = a4(1,i) + a4(3,i) = a4(1,i) + a4(4,i) = 0. + else + da1 = a4(3,i) - a4(2,i) + da2 = da1**2 + a6da = a4(4,i)*da1 + if(a6da < -da2) then + a4(4,i) = 3.*(a4(2,i)-a4(1,i)) + a4(3,i) = a4(2,i) - a4(4,i) + elseif(a6da > da2) then + a4(4,i) = 3.*(a4(3,i)-a4(1,i)) + a4(2,i) = a4(3,i) - a4(4,i) + endif + endif + enddo + else +! Standard PPM constraint + do i=1,im + if( extm(i) ) then + a4(2,i) = a4(1,i) + a4(3,i) = a4(1,i) + a4(4,i) = 0. + else + da1 = a4(3,i) - a4(2,i) + da2 = da1**2 + a6da = a4(4,i)*da1 + if(a6da < -da2) then + a4(4,i) = 3.*(a4(2,i)-a4(1,i)) + a4(3,i) = a4(2,i) - a4(4,i) + elseif(a6da > da2) then + a4(4,i) = 3.*(a4(3,i)-a4(1,i)) + a4(2,i) = a4(3,i) - a4(4,i) + endif + endif + enddo + endif + end subroutine cs_limiters + + + + subroutine ppm_profile(a4, delp, km, i1, i2, iv, kord) + +! INPUT PARAMETERS: + integer, intent(in):: iv !< iv =-1: winds iv = 0: positive definite scalars iv = 1: others iv = 2: temp (if remap_t) and w (iv=-2) + integer, intent(in):: i1 !< Starting longitude + integer, intent(in):: i2 !< Finishing longitude + integer, intent(in):: km !< Vertical dimension + integer, intent(in):: kord !< Order (or more accurately method no.): + ! + real , intent(in):: delp(i1:i2,km) !< Layer pressure thickness + +! !INPUT/OUTPUT PARAMETERS: + real , intent(inout):: a4(4,i1:i2,km) !< Interpolated values + +! DESCRIPTION: +! +! Perform the piecewise parabolic reconstruction +! +! !REVISION HISTORY: +! S.-J. Lin revised at GFDL 2007 +!----------------------------------------------------------------------- +! local arrays: + real dc(i1:i2,km) + real h2(i1:i2,km) + real delq(i1:i2,km) + real df2(i1:i2,km) + real d4(i1:i2,km) + +! local scalars: + integer i, k, km1, lmt, it + real fac + real a1, a2, c1, c2, c3, d1, d2 + real qm, dq, lac, qmp, pmp + + km1 = km - 1 + it = i2 - i1 + 1 + + do k=2,km + do i=i1,i2 + delq(i,k-1) = a4(1,i,k) - a4(1,i,k-1) + d4(i,k ) = delp(i,k-1) + delp(i,k) + enddo + enddo + + do k=2,km1 + do i=i1,i2 + c1 = (delp(i,k-1)+0.5*delp(i,k))/d4(i,k+1) + c2 = (delp(i,k+1)+0.5*delp(i,k))/d4(i,k) + df2(i,k) = delp(i,k)*(c1*delq(i,k) + c2*delq(i,k-1)) / & + (d4(i,k)+delp(i,k+1)) + dc(i,k) = sign( min(abs(df2(i,k)), & + max(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1))-a4(1,i,k), & + a4(1,i,k)-min(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1))), df2(i,k) ) + enddo + enddo + +!----------------------------------------------------------- +! 4th order interpolation of the provisional cell edge value +!----------------------------------------------------------- + + do k=3,km1 + do i=i1,i2 + c1 = delq(i,k-1)*delp(i,k-1) / d4(i,k) + a1 = d4(i,k-1) / (d4(i,k) + delp(i,k-1)) + a2 = d4(i,k+1) / (d4(i,k) + delp(i,k)) + a4(2,i,k) = a4(1,i,k-1) + c1 + 2./(d4(i,k-1)+d4(i,k+1)) * & + ( delp(i,k)*(c1*(a1 - a2)+a2*dc(i,k-1)) - & + delp(i,k-1)*a1*dc(i,k ) ) + enddo + enddo + +! if(km>8 .and. kord>4) call steepz(i1, i2, km, a4, df2, dc, delq, delp, d4) + +! Area preserving cubic with 2nd deriv. = 0 at the boundaries +! Top + do i=i1,i2 + d1 = delp(i,1) + d2 = delp(i,2) + qm = (d2*a4(1,i,1)+d1*a4(1,i,2)) / (d1+d2) + dq = 2.*(a4(1,i,2)-a4(1,i,1)) / (d1+d2) + c1 = 4.*(a4(2,i,3)-qm-d2*dq) / ( d2*(2.*d2*d2+d1*(d2+3.*d1)) ) + c3 = dq - 0.5*c1*(d2*(5.*d1+d2)-3.*d1*d1) + a4(2,i,2) = qm - 0.25*c1*d1*d2*(d2+3.*d1) +! Top edge: +!------------------------------------------------------- + a4(2,i,1) = d1*(2.*c1*d1**2-c3) + a4(2,i,2) +!------------------------------------------------------- +! 
a4(2,i,1) = (12./7.)*a4(1,i,1)-(13./14.)*a4(1,i,2)+(3./14.)*a4(1,i,3) +!------------------------------------------------------- +! No over- and undershoot condition + a4(2,i,2) = max( a4(2,i,2), min(a4(1,i,1), a4(1,i,2)) ) + a4(2,i,2) = min( a4(2,i,2), max(a4(1,i,1), a4(1,i,2)) ) + dc(i,1) = 0.5*(a4(2,i,2) - a4(1,i,1)) + enddo + +! Enforce monotonicity within the top layer + + if( iv==0 ) then + do i=i1,i2 + a4(2,i,1) = max(0., a4(2,i,1)) + a4(2,i,2) = max(0., a4(2,i,2)) + enddo + elseif( iv==-1 ) then + do i=i1,i2 + if ( a4(2,i,1)*a4(1,i,1) <= 0. ) a4(2,i,1) = 0. + enddo + elseif( abs(iv)==2 ) then + do i=i1,i2 + a4(2,i,1) = a4(1,i,1) + a4(3,i,1) = a4(1,i,1) + enddo + endif + +! Bottom +! Area preserving cubic with 2nd deriv. = 0 at the surface + do i=i1,i2 + d1 = delp(i,km) + d2 = delp(i,km1) + qm = (d2*a4(1,i,km)+d1*a4(1,i,km1)) / (d1+d2) + dq = 2.*(a4(1,i,km1)-a4(1,i,km)) / (d1+d2) + c1 = (a4(2,i,km1)-qm-d2*dq) / (d2*(2.*d2*d2+d1*(d2+3.*d1))) + c3 = dq - 2.0*c1*(d2*(5.*d1+d2)-3.*d1*d1) + a4(2,i,km) = qm - c1*d1*d2*(d2+3.*d1) +! Bottom edge: +!----------------------------------------------------- + a4(3,i,km) = d1*(8.*c1*d1**2-c3) + a4(2,i,km) +! dc(i,km) = 0.5*(a4(3,i,km) - a4(1,i,km)) +!----------------------------------------------------- +! a4(3,i,km) = (12./7.)*a4(1,i,km)-(13./14.)*a4(1,i,km-1)+(3./14.)*a4(1,i,km-2) +! No over- and under-shoot condition + a4(2,i,km) = max( a4(2,i,km), min(a4(1,i,km), a4(1,i,km1)) ) + a4(2,i,km) = min( a4(2,i,km), max(a4(1,i,km), a4(1,i,km1)) ) + dc(i,km) = 0.5*(a4(1,i,km) - a4(2,i,km)) + enddo + + +! Enforce constraint on the "slope" at the surface + +#ifdef BOT_MONO + do i=i1,i2 + a4(4,i,km) = 0 + if( a4(3,i,km) * a4(1,i,km) <= 0. ) a4(3,i,km) = 0. + d1 = a4(1,i,km) - a4(2,i,km) + d2 = a4(3,i,km) - a4(1,i,km) + if ( d1*d2 < 0. ) then + a4(2,i,km) = a4(1,i,km) + a4(3,i,km) = a4(1,i,km) + else + dq = sign(min(abs(d1),abs(d2),0.5*abs(delq(i,km-1))), d1) + a4(2,i,km) = a4(1,i,km) - dq + a4(3,i,km) = a4(1,i,km) + dq + endif + enddo +#else + if( iv==0 ) then + do i=i1,i2 + a4(2,i,km) = max(0.,a4(2,i,km)) + a4(3,i,km) = max(0.,a4(3,i,km)) + enddo + elseif( iv<0 ) then + do i=i1,i2 + if( a4(1,i,km)*a4(3,i,km) <= 0. ) a4(3,i,km) = 0. + enddo + endif +#endif + + do k=1,km1 + do i=i1,i2 + a4(3,i,k) = a4(2,i,k+1) + enddo + enddo + +!----------------------------------------------------------- +! f(s) = AL + s*[(AR-AL) + A6*(1-s)] ( 0 <= s <= 1 ) +!----------------------------------------------------------- +! Top 2 and bottom 2 layers always use monotonic mapping + do k=1,2 + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + call ppm_limiters(dc(i1,k), a4(1,i1,k), it, 0) + enddo + + if(kord >= 7) then +!----------------------- +! Huynh's 2nd constraint +!----------------------- + do k=2,km1 + do i=i1,i2 +! Method#1 +! h2(i,k) = delq(i,k) - delq(i,k-1) +! Method#2 - better + h2(i,k) = 2.*(dc(i,k+1)/delp(i,k+1) - dc(i,k-1)/delp(i,k-1)) & + / ( delp(i,k)+0.5*(delp(i,k-1)+delp(i,k+1)) ) & + * delp(i,k)**2 +! Method#3 +!!! h2(i,k) = dc(i,k+1) - dc(i,k-1) + enddo + enddo + + fac = 1.5 ! original quasi-monotone + + do k=3,km-2 + do i=i1,i2 +! Right edges +! qmp = a4(1,i,k) + 2.0*delq(i,k-1) +! lac = a4(1,i,k) + fac*h2(i,k-1) + 0.5*delq(i,k-1) +! + pmp = 2.*dc(i,k) + qmp = a4(1,i,k) + pmp + lac = a4(1,i,k) + fac*h2(i,k-1) + dc(i,k) + a4(3,i,k) = min(max(a4(3,i,k), min(a4(1,i,k), qmp, lac)), & + max(a4(1,i,k), qmp, lac) ) +! Left edges +! qmp = a4(1,i,k) - 2.0*delq(i,k) +! lac = a4(1,i,k) + fac*h2(i,k+1) - 0.5*delq(i,k) +! 
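+! The left edge is limited to the quasi-monotone interval spanned by the
+! cell mean, qmp and lac, mirroring the right-edge constraint above
+! (Huynh's 2nd constraint); A6 is then recomputed from the adjusted edges.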
+ qmp = a4(1,i,k) - pmp + lac = a4(1,i,k) + fac*h2(i,k+1) - dc(i,k) + a4(2,i,k) = min(max(a4(2,i,k), min(a4(1,i,k), qmp, lac)), & + max(a4(1,i,k), qmp, lac)) +!------------- +! Recompute A6 +!------------- + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo +! Additional constraint to ensure positivity when kord=7 + if (iv == 0 .and. kord >= 6 ) & + call ppm_limiters(dc(i1,k), a4(1,i1,k), it, 2) + enddo + + else + + lmt = kord - 3 + lmt = max(0, lmt) + if (iv == 0) lmt = min(2, lmt) + + do k=3,km-2 + if( kord /= 4) then + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + endif + if(kord/=6) call ppm_limiters(dc(i1,k), a4(1,i1,k), it, lmt) + enddo + endif + + do k=km1,km + do i=i1,i2 + a4(4,i,k) = 3.*(2.*a4(1,i,k) - (a4(2,i,k)+a4(3,i,k))) + enddo + call ppm_limiters(dc(i1,k), a4(1,i1,k), it, 0) + enddo + + end subroutine ppm_profile + + + subroutine ppm_limiters(dm, a4, itot, lmt) + +! INPUT PARAMETERS: + real , intent(in):: dm(*) !< Linear slope + integer, intent(in) :: itot !< Total Longitudes + integer, intent(in) :: lmt !< 0: Standard PPM constraint 1: Improved full monotonicity constraint + !< (Lin) 2: Positive definite constraint + !< 3: do nothing (return immediately) +! INPUT/OUTPUT PARAMETERS: + real , intent(inout) :: a4(4,*) !< PPM array AA <-- a4(1,i) AL <-- a4(2,i) AR <-- a4(3,i) A6 <-- a4(4,i) +! LOCAL VARIABLES: + real qmp + real da1, da2, a6da + real fmin + integer i + +! Developer: S.-J. Lin + + if ( lmt == 3 ) return + + if(lmt == 0) then +! Standard PPM constraint + do i=1,itot + if(dm(i) == 0.) then + a4(2,i) = a4(1,i) + a4(3,i) = a4(1,i) + a4(4,i) = 0. + else + da1 = a4(3,i) - a4(2,i) + da2 = da1**2 + a6da = a4(4,i)*da1 + if(a6da < -da2) then + a4(4,i) = 3.*(a4(2,i)-a4(1,i)) + a4(3,i) = a4(2,i) - a4(4,i) + elseif(a6da > da2) then + a4(4,i) = 3.*(a4(3,i)-a4(1,i)) + a4(2,i) = a4(3,i) - a4(4,i) + endif + endif + enddo + + elseif (lmt == 1) then + +! Improved full monotonicity constraint (Lin 2004) +! Note: no need to provide first guess of A6 <-- a4(4,i) + do i=1, itot + qmp = 2.*dm(i) + a4(2,i) = a4(1,i)-sign(min(abs(qmp),abs(a4(2,i)-a4(1,i))), qmp) + a4(3,i) = a4(1,i)+sign(min(abs(qmp),abs(a4(3,i)-a4(1,i))), qmp) + a4(4,i) = 3.*( 2.*a4(1,i) - (a4(2,i)+a4(3,i)) ) + enddo + + elseif (lmt == 2) then + +! Positive definite constraint + do i=1,itot + if( abs(a4(3,i)-a4(2,i)) < -a4(4,i) ) then + fmin = a4(1,i)+0.25*(a4(3,i)-a4(2,i))**2/a4(4,i)+a4(4,i)*r12 + if( fmin < 0. ) then + if(a4(1,i) a4(2,i)) then + a4(4,i) = 3.*(a4(2,i)-a4(1,i)) + a4(3,i) = a4(2,i) - a4(4,i) + else + a4(4,i) = 3.*(a4(3,i)-a4(1,i)) + a4(2,i) = a4(3,i) - a4(4,i) + endif + endif + endif + enddo + + endif + + end subroutine ppm_limiters + + + + subroutine steepz(i1, i2, km, a4, df2, dm, dq, dp, d4) + integer, intent(in) :: km, i1, i2 + real , intent(in) :: dp(i1:i2,km) !< Grid size + real , intent(in) :: dq(i1:i2,km) !< Backward diff of q + real , intent(in) :: d4(i1:i2,km) !< Backward sum: dp(k)+ dp(k-1) + real , intent(in) :: df2(i1:i2,km) !< First guess mismatch + real , intent(in) :: dm(i1:i2,km) !< Monotonic mismatch +! INPUT/OUTPUT PARAMETERS: + real , intent(inout) :: a4(4,i1:i2,km) !@brief The subroutine 'rst_remap' remaps all variables required for a restart. +!>@details npz_restart /= npz (i.e., when the number of vertical levels is +!! 
changed at restart) + subroutine rst_remap(km, kn, is,ie,js,je, isd,ied,jsd,jed, nq, ntp, & + delp_r, u_r, v_r, w_r, delz_r, pt_r, q_r, qdiag_r, & + delp, u, v, w, delz, pt, q, qdiag, & + ak_r, bk_r, ptop, ak, bk, hydrostatic, make_nh, & + domain, square_domain) +!------------------------------------ +! Assuming hybrid sigma-P coordinate: +!------------------------------------ +! INPUT PARAMETERS: + integer, intent(in):: km !< Restart z-dimension + integer, intent(in):: kn !< Run time dimension + integer, intent(in):: nq, ntp !< Number of tracers (including H2O) + integer, intent(in):: is,ie,isd,ied !< Starting & ending X-Dir index + integer, intent(in):: js,je,jsd,jed !< Starting & ending Y-Dir index + logical, intent(in):: hydrostatic, make_nh, square_domain + real, intent(IN) :: ptop + real, intent(in) :: ak_r(km+1) + real, intent(in) :: bk_r(km+1) + real, intent(in) :: ak(kn+1) + real, intent(in) :: bk(kn+1) + real, intent(in):: delp_r(is:ie,js:je,km) !< Pressure thickness + real, intent(in):: u_r(is:ie, js:je+1,km) !< u-wind (m/s) + real, intent(in):: v_r(is:ie+1,js:je ,km) !< v-wind (m/s) + real, intent(inout):: pt_r(is:ie,js:je,km) + real, intent(in):: w_r(is:ie,js:je,km) + real, intent(in):: q_r(is:ie,js:je,km,1:ntp) + real, intent(in):: qdiag_r(is:ie,js:je,km,ntp+1:nq) + real, intent(inout)::delz_r(is:ie,js:je,km) + type(domain2d), intent(INOUT) :: domain +! Output: + real, intent(out):: delp(isd:ied,jsd:jed,kn) !< Pressure thickness + real, intent(out):: u(isd:ied ,jsd:jed+1,kn) !< u-wind (m/s) + real, intent(out):: v(isd:ied+1,jsd:jed ,kn) !< v-wind (m/s) + real, intent(out):: w(isd: ,jsd: ,1:) !< Vertical velocity (m/s) + real, intent(out):: pt(isd:ied ,jsd:jed ,kn) !< Temperature + real, intent(out):: q(isd:ied,jsd:jed,kn,1:ntp) + real, intent(out):: qdiag(isd:ied,jsd:jed,kn,ntp+1:nq) + real, intent(out):: delz(isd:,jsd:,1:) !< Delta-height (m) +!----------------------------------------------------------------------- + real r_vir, rgrav + real ps(isd:ied,jsd:jed) !< Surface pressure + real pe1(is:ie,km+1) + real pe2(is:ie,kn+1) + real pv1(is:ie+1,km+1) + real pv2(is:ie+1,kn+1) + + integer i,j,k , iq + integer, parameter:: kord=4 + +#ifdef HYDRO_DELZ_REMAP + if (is_master() .and. .not. hydrostatic) then + print*, '' + print*, ' REMAPPING IC: INITIALIZING DELZ WITH HYDROSTATIC STATE ' + print*, '' + endif +#endif + +#ifdef HYDRO_DELZ_EXTRAP + if (is_master() .and. .not. hydrostatic) then + print*, '' + print*, ' REMAPPING IC: INITIALIZING DELZ WITH HYDROSTATIC STATE ABOVE INPUT MODEL TOP ' + print*, '' + endif +#endif + +#ifdef ZERO_W_EXTRAP + if (is_master() .and. .not. hydrostatic) then + print*, '' + print*, ' REMAPPING IC: INITIALIZING W TO ZERO ABOVE INPUT MODEL TOP ' + print*, '' + endif +#endif + + r_vir = rvgas/rdgas - 1. + rgrav = 1./grav + +!$OMP parallel do default(none) shared(is,ie,js,je,ps,ak_r) + do j=js,je + do i=is,ie + ps(i,j) = ak_r(1) + enddo + enddo + +! this OpenMP do-loop setup cannot work in it's current form.... +!$OMP parallel do default(none) shared(is,ie,js,je,km,ps,delp_r) + do j=js,je + do k=1,km + do i=is,ie + ps(i,j) = ps(i,j) + delp_r(i,j,k) + enddo + enddo + enddo + +! only one cell is needed + if ( square_domain ) then + call mpp_update_domains(ps, domain, whalo=1, ehalo=1, shalo=1, nhalo=1, complete=.true.) + else + call mpp_update_domains(ps, domain, complete=.true.) + endif + +! 
Compute virtual Temp +!$OMP parallel do default(none) shared(is,ie,js,je,km,pt_r,r_vir,q_r) + do k=1,km + do j=js,je + do i=is,ie + pt_r(i,j,k) = pt_r(i,j,k) * (1.+r_vir*q_r(i,j,k,1)) + enddo + enddo + enddo + +!$OMP parallel do default(none) shared(is,ie,js,je,km,ak_r,bk_r,ps,kn,ak,bk,u_r,u,delp, & +!$OMP ntp,nq,hydrostatic,make_nh,w_r,w,delz_r,delp_r,delz, & +!$OMP pt_r,pt,v_r,v,q,q_r,qdiag,qdiag_r) & +!$OMP private(pe1, pe2, pv1, pv2) + do 6000 j=js,je+1 +!------ +! map u +!------ + do k=1,km+1 + do i=is,ie + pe1(i,k) = ak_r(k) + 0.5*bk_r(k)*(ps(i,j-1)+ps(i,j)) + enddo + enddo + + do k=1,kn+1 + do i=is,ie + pe2(i,k) = ak(k) + 0.5*bk(k)*(ps(i,j-1)+ps(i,j)) + enddo + enddo + + call remap_2d(km, pe1, u_r(is:ie,j:j,1:km), & + kn, pe2, u(is:ie,j:j,1:kn), & + is, ie, -1, kord) + + if ( j /= (je+1) ) then + +!--------------- +! Hybrid sigma-p +!--------------- + do k=1,km+1 + do i=is,ie + pe1(i,k) = ak_r(k) + bk_r(k)*ps(i,j) + enddo + enddo + + do k=1,kn+1 + do i=is,ie + pe2(i,k) = ak(k) + bk(k)*ps(i,j) + enddo + enddo + +!------------- +! Compute delp +!------------- + do k=1,kn + do i=is,ie + delp(i,j,k) = pe2(i,k+1) - pe2(i,k) + enddo + enddo + +!---------------- +! Map constituents +!---------------- + if( nq /= 0 ) then + do iq=1,ntp + call remap_2d(km, pe1, q_r(is:ie,j:j,1:km,iq:iq), & + kn, pe2, q(is:ie,j:j,1:kn,iq:iq), & + is, ie, 0, kord) + enddo + do iq=ntp+1,nq + call remap_2d(km, pe1, qdiag_r(is:ie,j:j,1:km,iq:iq), & + kn, pe2, qdiag(is:ie,j:j,1:kn,iq:iq), & + is, ie, 0, kord) + enddo + endif + + if ( .not. hydrostatic .and. .not. make_nh) then +! Remap vertical wind: + call remap_2d(km, pe1, w_r(is:ie,j:j,1:km), & + kn, pe2, w(is:ie,j:j,1:kn), & + is, ie, -1, kord) + +#ifdef ZERO_W_EXTRAP + do k=1,kn + do i=is,ie + if (pe2(i,k) < pe1(i,1)) then + w(i,j,k) = 0. + endif + enddo + enddo +#endif + +#ifndef HYDRO_DELZ_REMAP +! Remap delz for hybrid sigma-p coordinate + do k=1,km + do i=is,ie + delz_r(i,j,k) = -delz_r(i,j,k)/delp_r(i,j,k) ! ="specific volume"/grav + enddo + enddo + call remap_2d(km, pe1, delz_r(is:ie,j:j,1:km), & + kn, pe2, delz(is:ie,j:j,1:kn), & + is, ie, 1, kord) + do k=1,kn + do i=is,ie + delz(i,j,k) = -delz(i,j,k)*delp(i,j,k) + enddo + enddo +#endif + endif + +! Geopotential conserving remap of virtual temperature: + do k=1,km+1 + do i=is,ie + pe1(i,k) = log(pe1(i,k)) + enddo + enddo + do k=1,kn+1 + do i=is,ie + pe2(i,k) = log(pe2(i,k)) + enddo + enddo + + call remap_2d(km, pe1, pt_r(is:ie,j:j,1:km), & + kn, pe2, pt(is:ie,j:j,1:kn), & + is, ie, 1, kord) + +#ifdef HYDRO_DELZ_REMAP + !initialize delz from the hydrostatic state + do k=1,kn + do i=is,ie + delz(i,j,k) = (rdgas*rgrav)*pt(i,j,k)*(pe2(i,k)-pe2(i,k+1)) + enddo + enddo +#endif +#ifdef HYDRO_DELZ_EXTRAP + !initialize delz from the hydrostatic state + do k=1,kn + do i=is,ie + if (pe2(i,k) < pe1(i,1)) then + delz(i,j,k) = (rdgas*rgrav)*pt(i,j,k)*(pe2(i,k)-pe2(i,k+1)) + endif + enddo + enddo +#endif +!------ +! 
map v +!------ + do k=1,km+1 + do i=is,ie+1 + pv1(i,k) = ak_r(k) + 0.5*bk_r(k)*(ps(i-1,j)+ps(i,j)) + enddo + enddo + do k=1,kn+1 + do i=is,ie+1 + pv2(i,k) = ak(k) + 0.5*bk(k)*(ps(i-1,j)+ps(i,j)) + enddo + enddo + + call remap_2d(km, pv1, v_r(is:ie+1,j:j,1:km), & + kn, pv2, v(is:ie+1,j:j,1:kn), & + is, ie+1, -1, kord) + + endif !(j < je+1) +6000 continue + +!$OMP parallel do default(none) shared(is,ie,js,je,kn,pt,r_vir,q) + do k=1,kn + do j=js,je + do i=is,ie + pt(i,j,k) = pt(i,j,k) / (1.+r_vir*q(i,j,k,1)) + enddo + enddo + enddo + + end subroutine rst_remap + +!>@brief The subroutine 'mappm' is a general-purpose routine for remapping +!! one set of vertical levels to another. + subroutine mappm(km, pe1, q1, kn, pe2, q2, i1, i2, iv, kord, ptop) + +! IV = 0: constituents +! IV = 1: potential temp +! IV =-1: winds + +! Mass flux preserving mapping: q1(im,km) -> q2(im,kn) + +! pe1: pressure at layer edges (from model top to bottom surface) +! in the original vertical coordinate +! pe2: pressure at layer edges (from model top to bottom surface) +! in the new vertical coordinate + + integer, intent(in):: i1, i2, km, kn, kord, iv + real, intent(in ):: pe1(i1:i2,km+1), pe2(i1:i2,kn+1) !< pe1: pressure at layer edges from model top to bottom + !! surface in the ORIGINAL vertical coordinate + !< pe2: pressure at layer edges from model top to bottom + !! surface in the NEW vertical coordinate +! Mass flux preserving mapping: q1(im,km) -> q2(im,kn) + real, intent(in ):: q1(i1:i2,km) + real, intent(out):: q2(i1:i2,kn) + real, intent(IN) :: ptop +! local + real qs(i1:i2) + real dp1(i1:i2,km) + real a4(4,i1:i2,km) + integer i, k, l + integer k0, k1 + real pl, pr, tt, delp, qsum, dpsum, esl + + do k=1,km + do i=i1,i2 + dp1(i,k) = pe1(i,k+1) - pe1(i,k) + a4(1,i,k) = q1(i,k) + enddo + enddo + + if ( kord >7 ) then + call cs_profile( qs, a4, dp1, km, i1, i2, iv, kord ) + else + call ppm_profile( a4, dp1, km, i1, i2, iv, kord ) + endif + +!------------------------------------ +! Lowest layer: constant distribution +!------------------------------------ +#ifdef NGGPS_SUBMITTED + do i=i1,i2 + a4(2,i,km) = q1(i,km) + a4(3,i,km) = q1(i,km) + a4(4,i,km) = 0. + enddo +#endif + + do 5555 i=i1,i2 + k0 = 1 + do 555 k=1,kn + + if(pe2(i,k) .le. pe1(i,1)) then +! above old ptop + q2(i,k) = q1(i,1) + elseif(pe2(i,k) .ge. pe1(i,km+1)) then +! Entire grid below old ps +#ifdef NGGPS_SUBMITTED + q2(i,k) = a4(3,i,km) ! this is not good. +#else + q2(i,k) = q1(i,km) +#endif + else + + do 45 L=k0,km +! locate the top edge at pe2(i,k) + if( pe2(i,k) .ge. pe1(i,L) .and. & + pe2(i,k) .le. pe1(i,L+1) ) then + k0 = L + PL = (pe2(i,k)-pe1(i,L)) / dp1(i,L) + if(pe2(i,k+1) .le. pe1(i,L+1)) then + +! entire new grid is within the original grid + PR = (pe2(i,k+1)-pe1(i,L)) / dp1(i,L) + TT = r3*(PR*(PR+PL)+PL**2) + q2(i,k) = a4(2,i,L) + 0.5*(a4(4,i,L)+a4(3,i,L) & + - a4(2,i,L))*(PR+PL) - a4(4,i,L)*TT + goto 555 + else +! Fractional area... + delp = pe1(i,L+1) - pe2(i,k) + TT = r3*(1.+PL*(1.+PL)) + qsum = delp*(a4(2,i,L)+0.5*(a4(4,i,L)+ & + a4(3,i,L)-a4(2,i,L))*(1.+PL)-a4(4,i,L)*TT) + dpsum = delp + k1 = L + 1 + goto 111 + endif + endif +45 continue + +111 continue + do 55 L=k1,km + if( pe2(i,k+1) .gt. pe1(i,L+1) ) then + +! Whole layer.. 
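Stepping out of the layer-accumulation loop for a moment: `mappm` walks each target layer `[pe2(i,k), pe2(i,k+1)]` across the source layers and accumulates `qsum`/`dpsum` over whole and fractional overlaps, so the remap conserves mass. The sketch below reproduces that bookkeeping in plain NumPy with a piecewise-constant value in each source layer. It is illustrative only: the function name is invented, and `mappm` itself integrates the PPM/cubic profiles built by `ppm_profile`/`cs_profile` rather than constant layer values.

```python
import numpy as np

def remap_column_piecewise_constant(pe1, q1, pe2):
    """Mass-conserving remap of one column, assuming piecewise-constant layers.

    pe1: source layer-edge pressures, shape (km+1,), monotonically increasing
    q1:  source layer means, shape (km,)
    pe2: target layer-edge pressures, shape (kn+1,), within [pe1[0], pe1[-1]]
    """
    kn = len(pe2) - 1
    q2 = np.zeros(kn)
    for k in range(kn):
        top, bot = pe2[k], pe2[k + 1]
        qsum = dpsum = 0.0
        for L in range(len(q1)):
            # Pressure-thickness overlap of target layer [top, bot] with source layer L
            dp = max(0.0, min(bot, pe1[L + 1]) - max(top, pe1[L]))
            qsum += dp * q1[L]
            dpsum += dp
        q2[k] = qsum / dpsum
    return q2

# Remap a 4-layer column onto 3 layers spanning the same pressure range.
pe1 = np.array([100.0, 300.0, 500.0, 700.0, 1000.0])
q1 = np.array([1.0, 2.0, 3.0, 4.0])
pe2 = np.array([100.0, 400.0, 700.0, 1000.0])
q2 = remap_column_piecewise_constant(pe1, q1, pe2)
# Column "mass" sum(q * dp) is preserved by construction.
assert np.isclose((q2 * np.diff(pe2)).sum(), (q1 * np.diff(pe1)).sum())
```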
+ + qsum = qsum + dp1(i,L)*q1(i,L) + dpsum = dpsum + dp1(i,L) + else + delp = pe2(i,k+1)-pe1(i,L) + esl = delp / dp1(i,L) + qsum = qsum + delp * (a4(2,i,L)+0.5*esl* & + (a4(3,i,L)-a4(2,i,L)+a4(4,i,L)*(1.-r23*esl)) ) + dpsum = dpsum + delp + k0 = L + goto 123 + endif +55 continue + delp = pe2(i,k+1) - pe1(i,km+1) + if(delp > 0.) then +! Extended below old ps +#ifdef NGGPS_SUBMITTED + qsum = qsum + delp * a4(3,i,km) ! not good. +#else + qsum = qsum + delp * q1(i,km) +#endif + dpsum = dpsum + delp + endif +123 q2(i,k) = qsum / dpsum + endif +555 continue +5555 continue + + end subroutine mappm + +!>@brief The subroutine 'moist_cv' computes the FV3-consistent moist heat capacity under constant volume, +!! including the heating capacity of water vapor and condensates. +!>@details See \cite emanuel1994atmospheric for information on variable heat capacities. + subroutine moist_cv(is,ie, isd,ied, jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, qd, cvm, t1) + integer, intent(in):: is, ie, isd,ied, jsd,jed, km, nwat, j, k + integer, intent(in):: sphum, liq_wat, rainwat, ice_wat, snowwat, graupel + real, intent(in), dimension(isd:ied,jsd:jed,km,nwat):: q + real, intent(out), dimension(is:ie):: cvm, qd + real, intent(in), optional:: t1(is:ie) +! + real, parameter:: t_i0 = 15. + real, dimension(is:ie):: qv, ql, qs + integer:: i + + select case (nwat) + + case(1) + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + cvm(i) = (1.-qv(i))*cv_air + qv(i)*cv_vap + enddo + case(2) + if ( present(t1) ) then ! Special case for GFS physics + do i=is,ie + qd(i) = max(0., q(i,j,k,liq_wat)) + if ( t1(i) > tice ) then + qs(i) = 0. + elseif ( t1(i) < tice-t_i0 ) then + qs(i) = qd(i) + else + qs(i) = qd(i)*(tice-t1(i))/t_i0 + endif + ql(i) = qd(i) - qs(i) + qv(i) = max(0.,q(i,j,k,sphum)) + cvm(i) = (1.-(qv(i)+qd(i)))*cv_air + qv(i)*cv_vap + ql(i)*c_liq + qs(i)*c_ice + enddo + else + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + qs(i) = max(q(i,j,k,liq_wat),0.0) + qd(i) = qs(i) + cvm(i) = (1.-qv(i))*cv_air + qv(i)*cv_vap + enddo + endif + case (3) + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + ql(i) = max(q(i,j,k,liq_wat),0.0) + qs(i) = max(q(i,j,k,ice_wat),0.0) + qd(i) = ql(i) + qs(i) + cvm(i) = (1.-(qv(i)+qd(i)))*cv_air + qv(i)*cv_vap + ql(i)*c_liq + qs(i)*c_ice + enddo + case(4) ! K_warm_rain with fake ice + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + ql(i) = max(q(i,j,k,liq_wat),0.0) + max(q(i,j,k,rainwat),0.0) + cvm(i) = (1.-(qv(i)+qd(i)))*cv_air + qv(i)*cv_vap + qd(i)*c_liq + enddo + case(5) + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + ql(i) = max(q(i,j,k,liq_wat),0.0) + max(q(i,j,k,rainwat),0.0) + qs(i) = max(q(i,j,k,ice_wat),0.0) + max(q(i,j,k,snowwat),0.0) + qd(i) = ql(i) + qs(i) + cvm(i) = (1.-(qv(i)+qd(i)))*cv_air + qv(i)*cv_vap + ql(i)*c_liq + qs(i)*c_ice + enddo + case(6:7) + do i=is,ie + qv(i) = max(q(i,j,k,sphum) ,0.0) + ql(i) = max(q(i,j,k,liq_wat),0.0) + max(q(i,j,k,rainwat),0.0) + qs(i) = max(q(i,j,k,ice_wat),0.0) + max(q(i,j,k,snowwat),0.0) + max(q(i,j,k,graupel),0.0) + qd(i) = ql(i) + qs(i) + cvm(i) = (1.-(qv(i)+qd(i)))*cv_air + qv(i)*cv_vap + ql(i)*c_liq + qs(i)*c_ice + enddo + case default + !call mpp_error (NOTE, 'fv_mapz::moist_cv - using default cv_air') + do i=is,ie + qd(i) = 0. + cvm(i) = cv_air + enddo + end select + + end subroutine moist_cv + +!>@brief The subroutine 'moist_cp' computes the FV3-consistent moist heat capacity under constant pressure, +!! including the heating capacity of water vapor and condensates. 
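As an aside on `moist_cv` above: every `nwat` case follows the same pattern, a mass-weighted average of dry-air, vapor, liquid, and solid-condensate heat capacities. The short Python sketch below mirrors the `nwat=6` branch numerically. The constants are approximate placeholder values (the model takes them from its own constants module), so treat this as an illustration rather than the model's implementation.

```python
# Approximate heat capacities (J/kg/K); placeholders, not the model's constants.
CV_AIR = 717.6    # dry air at constant volume
CV_VAP = 1384.5   # water vapor at constant volume
C_LIQ = 4185.5    # liquid water
C_ICE = 1972.0    # ice

def moist_cv_nwat6(qv, ql, qr, qi, qs, qg):
    """Mass-weighted moist heat capacity at constant volume, mirroring the
    nwat=6 branch of moist_cv: vapor + (liquid, rain) + (ice, snow, graupel)."""
    q_liq = max(ql, 0.0) + max(qr, 0.0)
    q_sol = max(qi, 0.0) + max(qs, 0.0) + max(qg, 0.0)
    q_cond = q_liq + q_sol
    qv = max(qv, 0.0)
    return (1.0 - (qv + q_cond)) * CV_AIR + qv * CV_VAP + q_liq * C_LIQ + q_sol * C_ICE

# A point with 10 g/kg vapor and 1 g/kg cloud water lies between the dry-air
# and liquid-water heat capacities, as expected.
cvm = moist_cv_nwat6(qv=0.010, ql=0.001, qr=0.0, qi=0.0, qs=0.0, qg=0.0)
assert CV_AIR < cvm < C_LIQ
```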
+ subroutine moist_cp(is,ie, isd,ied, jsd,jed, km, j, k, nwat, sphum, liq_wat, rainwat, & + ice_wat, snowwat, graupel, q, qd, cpm, t1) + + integer, intent(in):: is, ie, isd,ied, jsd,jed, km, nwat, j, k + integer, intent(in):: sphum, liq_wat, rainwat, ice_wat, snowwat, graupel + real, intent(in), dimension(isd:ied,jsd:jed,km,nwat):: q + real, intent(out), dimension(is:ie):: cpm, qd + real, intent(in), optional:: t1(is:ie) +! + real, parameter:: t_i0 = 15. + real, dimension(is:ie):: qv, ql, qs + integer:: i + + select case (nwat) + + case(2) + if ( present(t1) ) then ! Special case for GFS physics + do i=is,ie + qd(i) = max(0., q(i,j,k,liq_wat)) + if ( t1(i) > tice ) then + qs(i) = 0. + elseif ( t1(i) < tice-t_i0 ) then + qs(i) = qd(i) + else + qs(i) = qd(i)*(tice-t1(i))/t_i0 + endif + ql(i) = qd(i) - qs(i) + qv(i) = max(0.,q(i,j,k,sphum)) + cpm(i) = (1.-(qv(i)+qd(i)))*cp_air + qv(i)*cp_vapor + ql(i)*c_liq + qs(i)*c_ice + enddo + else + do i=is,ie + qv(i) = max(0.,q(i,j,k,sphum)) + qs(i) = max(0.,q(i,j,k,liq_wat)) + qd(i) = qs(i) + cpm(i) = (1.-qv(i))*cp_air + qv(i)*cp_vapor + enddo + endif + + case(3) + do i=is,ie + qv(i) = q(i,j,k,sphum) + ql(i) = q(i,j,k,liq_wat) + qs(i) = q(i,j,k,ice_wat) + qd(i) = ql(i) + qs(i) + cpm(i) = (1.-(qv(i)+qd(i)))*cp_air + qv(i)*cp_vapor + ql(i)*c_liq + qs(i)*c_ice + enddo + case(4) ! K_warm_rain scheme with fake ice + do i=is,ie + qv(i) = q(i,j,k,sphum) + qd(i) = q(i,j,k,liq_wat) + q(i,j,k,rainwat) + cpm(i) = (1.-(qv(i)+qd(i)))*cp_air + qv(i)*cp_vapor + qd(i)*c_liq + enddo + case(5) + do i=is,ie + qv(i) = q(i,j,k,sphum) + ql(i) = q(i,j,k,liq_wat) + q(i,j,k,rainwat) + qs(i) = q(i,j,k,ice_wat) + q(i,j,k,snowwat) + qd(i) = ql(i) + qs(i) + cpm(i) = (1.-(qv(i)+qd(i)))*cp_air + qv(i)*cp_vapor + ql(i)*c_liq + qs(i)*c_ice + enddo + case(6:7) + do i=is,ie + qv(i) = q(i,j,k,sphum) + ql(i) = q(i,j,k,liq_wat) + q(i,j,k,rainwat) + qs(i) = q(i,j,k,ice_wat) + q(i,j,k,snowwat) + q(i,j,k,graupel) + qd(i) = ql(i) + qs(i) + cpm(i) = (1.-(qv(i)+qd(i)))*cp_air + qv(i)*cp_vapor + ql(i)*c_liq + qs(i)*c_ice + enddo + case default + ! call mpp_error (NOTE, 'fv_mapz::moist_cp - using default cp_air') + do i=is,ie + qd(i) = 0. + cpm(i) = cp_air + enddo + end select + + end subroutine moist_cp + +!----------------------------------------------------------------------- +!BOP +! !ROUTINE: map1_gmao --- GMAO Interpolation for vertical re-mapping +! +! !INTERFACE: + subroutine map1_gmao( km, pe1, q1, & + kn, pe2, q2, i1, i2, & + j, ibeg, iend, jbeg, jend, akap, gmao_remap, P_MAP, conserv) + implicit none + +! !INPUT PARAMETERS: + integer, intent(in) :: i1 ! Starting longitude + integer, intent(in) :: i2 ! Finishing longitude + real, intent(in) :: akap + integer, intent(in) :: P_MAP ! Thermodynamic variable to remap + ! 1:TE-logP 2:PT-PK 3:T-logP + integer, intent(in) :: gmao_remap ! 3:cubic 2:quadratic 1:linear + logical, intent(in) :: conserv + integer, intent(in) :: j ! Current latitude + integer, intent(in) :: ibeg, iend, jbeg, jend + integer, intent(in) :: km ! Original vertical dimension + integer, intent(in) :: kn ! Target vertical dimension + + real, intent(in) :: pe1(i1:i2,km+1) ! pressure at layer edges + ! (from model top to bottom surface) + ! in the original vertical coordinate + real, intent(in) :: pe2(i1:i2,kn+1) ! pressure at layer edges + ! (from model top to bottom surface) + ! in the new vertical coordinate + + real, intent(in) :: q1(ibeg:iend,jbeg:jend,km) ! Field input +! !INPUT/OUTPUT PARAMETERS: + real, intent(inout):: q2(ibeg:iend,jbeg:jend,kn) ! Field output + +! 
!DESCRIPTION: +! +! Perform Cubic Interpolation a given latitude +! pe1: pressure at layer edges (from model top to bottom surface) +! in the original vertical coordinate +! pe2: pressure at layer edges (from model top to bottom surface) +! in the new vertical coordinate +! +! !REVISION HISTORY: +! 2005.11.14 Takacs Initial Code +! 2016.07.20 Putman Modified to make genaric for any thermodynamic variable +! +!EOP +!----------------------------------------------------------------------- +!BOC +! +! !LOCAL VARIABLES: + real qx(i1:i2,km) + real logpl1(i1:i2,km) + real logpl2(i1:i2,kn) + real dlogp1(i1:i2,km) + real vsum1(i1:i2) + real vsum2(i1:i2) + real am2,am1,ap0,ap1,P,PLP1,PLP0,PLM1,PLM2,DLP0,DLM1,DLM2 + + integer i, k, LM2,LM1,LP0,LP1 + +! Initialization +! -------------- + + select case (P_MAP) + case(1) + ! Remapping in Log(P) + do k=1,km + qx(:,k) = q1(i1:i2,j,k) + logpl1(:,k) = log( r2*(pe1(:,k)+pe1(:,k+1)) ) + enddo + do k=1,kn + logpl2(:,k) = log( r2*(pe2(:,k)+pe2(:,k+1)) ) + enddo + do k=1,km-1 + dlogp1(:,k) = logpl1(:,k+1)-logpl1(:,k) + enddo + case(2) + ! Remapping in P^KAPPA + do k=1,km + qx(:,k) = q1(i1:i2,j,k) + logpl1(:,k) = exp( akap*log( r2*(pe1(:,k)+pe1(:,k+1))) ) + enddo + do k=1,kn + logpl2(:,k) = exp( akap*log( r2*(pe2(:,k)+pe2(:,k+1))) ) + enddo + do k=1,km-1 + dlogp1(:,k) = logpl1(:,k+1)-logpl1(:,k) + enddo + end select + + if (conserv) then +! Compute vertical integral of Input field +! ---------------------------------------- + vsum1(:) = r0 + do i=i1,i2 + do k=1,km + vsum1(i) = vsum1(i) + qx(i,k)*( pe1(i,k+1)-pe1(i,k) ) + enddo + vsum1(i) = vsum1(i) / ( pe1(i,km+1)-pe1(i,1) ) + enddo + endif + +! Interpolate field onto target Pressures +! --------------------------------------- + do i=i1,i2 + do k=1,kn + LM1 = 1 + LP0 = 1 + do while( LP0.le.km ) + if (logpl1(i,LP0).lt.logpl2(i,k)) then + LP0 = LP0+1 + else + exit + endif + enddo + LM1 = max(LP0-1,1) + LP0 = min(LP0, km) + +! Extrapolate Linearly above first model level +! ---------------------------------------------------- + if( LM1.eq.1 .and. LP0.eq.1 ) then + q2(i,j,k) = qx(i,1) + ( qx(i,2)-qx(i,1) )*( logpl2(i,k)-logpl1(i,1) ) & + /( logpl1(i,2)-logpl1(i,1) ) + +! Extrapolate Linearly below last model level +! --------------------------------------------------- + else if( LM1.eq.km .and. LP0.eq.km ) then + q2(i,j,k) = qx(i,km) + ( qx(i,km)-qx(i,km-1) )*( logpl2(i,k )-logpl1(i,km ) ) & + /( logpl1(i,km)-logpl1(i,km-1) ) + +! Interpolate Linearly between levels 1 => 2 and km-1 => km +! ----------------------------------------------------------------- + else if( LM1.eq.1 .or. LP0.eq.km ) then + q2(i,j,k) = qx(i,LP0) + ( qx(i,LM1)-qx(i,LP0) )*( logpl2(i,k )-logpl1(i,LP0) ) & + /( logpl1(i,LM1)-logpl1(i,LP0) ) +! Interpolate Cubicly between other model levels +! ------------------------------------------------------ + else + LP1 = LP0+1 + LM2 = LM1-1 + P = logpl2(i,k) + PLP1 = logpl1(i,LP1) + PLP0 = logpl1(i,LP0) + PLM1 = logpl1(i,LM1) + PLM2 = logpl1(i,LM2) + DLP0 = dlogp1(i,LP0) + DLM1 = dlogp1(i,LM1) + DLM2 = dlogp1(i,LM2) + + ! Cubic Coefficients + ! ------------------ + if( gmao_remap .eq. 3 ) then + ap1 = (P-PLP0)*(P-PLM1)*(P-PLM2)/( DLP0*(DLP0+DLM1)*(DLP0+DLM1+DLM2) ) + ap0 = (PLP1-P)*(P-PLM1)*(P-PLM2)/( DLP0* DLM1 *( DLM1+DLM2) ) + am1 = (PLP1-P)*(PLP0-P)*(P-PLM2)/( DLM1* DLM2 *(DLP0+DLM1 ) ) + am2 = (PLP1-P)*(PLP0-P)*(PLM1-P)/( DLM2*(DLM1+DLM2)*(DLP0+DLM1+DLM2) ) + q2(i,j,k) = ap1*qx(i,LP1) + ap0*qx(i,LP0) + am1*qx(i,LM1) + am2*qx(i,LM2) + endif + + ! Quadratic Coefficients + ! 
---------------------- + if( gmao_remap .eq. 2 ) then + ap1 = (P-PLP0)*(P-PLM1)/( (PLP1-PLP0)*(PLP1-PLM1) ) + ap0 = (PLP1-P)*(P-PLM1)/( (PLP1-PLP0)*(PLP0-PLM1) ) + am1 = (PLP1-P)*(PLP0-P)/( (PLP1-PLM1)*(PLP0-PLM1) ) + q2(i,j,k) = ap1*qx(i,LP1) + ap0*qx(i,LP0) + am1*qx(i,LM1) + endif + + ! Linear Coefficients + ! ------------------- + if( gmao_remap .eq. 1 ) then + q2(i,j,k) = qx(i,LP0) + ( qx(i,LM1)-qx(i,LP0) )*( logpl2(i,k )-logpl1(i,LP0) ) & + /( logpl1(i,LM1)-logpl1(i,LP0) ) + endif + + endif + + enddo + enddo + + if (conserv) then +! Compute vertical integral of Output field +! ----------------------------------------- + vsum2(:) = r0 + do i=i1,i2 + do k=1,kn + vsum2(i) = vsum2(i) + q2(i,j,k)*( pe2(i,k+1)-pe2(i,k) ) + enddo + vsum2(i) = vsum2(i) / ( pe2(i,kn+1)-pe2(i,1) ) + enddo +! Adjust Final field to conserve +! ------------------------------ + do i=i1,i2 + do k=1,kn + q2(i,j,k) = q2(i,j,k) + vsum1(i)-vsum2(i) + enddo + enddo + endif + + return +!EOC + end subroutine map1_gmao +!----------------------------------------------------------------------- + + +end module fv_mapz_mod diff --git a/pyFV3/__init__.py b/pyFV3/__init__.py index 4024a8f1..c8081971 100644 --- a/pyFV3/__init__.py +++ b/pyFV3/__init__.py @@ -11,3 +11,10 @@ """ __version__ = "0.2.0" + +__all__ = [ + "DynamicalCoreConfig", + "DycoreState", + "DryConvectiveAdjustment", + "DynamicalCore", +] diff --git a/pyFV3/_config.py b/pyFV3/_config.py index ce7fb2a6..0f226979 100644 --- a/pyFV3/_config.py +++ b/pyFV3/_config.py @@ -1,18 +1,25 @@ +from __future__ import annotations + import dataclasses from datetime import timedelta from math import floor -from typing import Optional, Tuple import f90nml import yaml +from dacite import Config, from_dict -from ndsl.namelist import Namelist, NamelistDefaults +from ndsl.utils import f90nml_as_dict DEFAULT_INT = 0 DEFAULT_STR = "" DEFAULT_FLOAT = 0.0 DEFAULT_BOOL = False +DEFAULT_DYCORE_NML_GROUPS = ( + "main_nml", + "coupler_nml", + "fv_core_nml", +) @dataclasses.dataclass(frozen=True) @@ -195,95 +202,101 @@ class DynamicalCoreConfig: vtdm4: float = DEFAULT_FLOAT z_tracer: bool = DEFAULT_BOOL do_qa: bool = DEFAULT_BOOL - layout: Tuple[int, int] = NamelistDefaults.layout - grid_type: int = NamelistDefaults.grid_type - u_max: float = NamelistDefaults.u_max # max windspeed for dp config - do_f3d: bool = NamelistDefaults.do_f3d - inline_q: bool = NamelistDefaults.inline_q - do_skeb: bool = NamelistDefaults.do_skeb # save dissipation estimate - use_logp: bool = NamelistDefaults.use_logp - moist_phys: bool = NamelistDefaults.moist_phys - check_negative: bool = NamelistDefaults.check_negative + layout: tuple[int, int] = (1, 1) + grid_type: int = 0 + u_max: float = 350.0 + """max windspeed for dp config""" + do_f3d: bool = False + inline_q: bool = False + do_skeb: bool = False + """save dissipation estimate""" + use_logp: bool = False + moist_phys: bool = True + check_negative: bool = False # gfdl_cloud_microphys.F90 - tau_r2g: float = NamelistDefaults.tau_r2g # rain freezing during fast_sat - tau_smlt: float = NamelistDefaults.tau_smlt # snow melting - tau_g2r: float = NamelistDefaults.tau_g2r # graupel melting to rain - tau_imlt: float = NamelistDefaults.tau_imlt # cloud ice melting - tau_i2s: float = NamelistDefaults.tau_i2s # cloud ice to snow auto - conversion - tau_l2r: float = NamelistDefaults.tau_l2r # cloud water to rain auto - conversion - tau_g2v: float = NamelistDefaults.tau_g2v # graupel sublimation - tau_v2g: float = ( - NamelistDefaults.tau_v2g - ) # graupel deposition -- make 
it a slow process - sat_adj0: float = ( - NamelistDefaults.sat_adj0 - ) # adjustment factor (0: no 1: full) during fast_sat_adj + tau_r2g: float = 900.0 + """rain freezing during fast_sat""" + tau_smlt: float = 900.0 + """snow melting""" + tau_g2r: float = 600.0 + """graupel melting to rain""" + tau_imlt: float = 600.0 + """cloud ice melting""" + tau_i2s: float = 1000.0 + """cloud ice to snow auto - conversion""" + tau_l2r: float = 900.0 + """cloud water to rain auto - conversion""" + tau_g2v: float = 1200.0 + """graupel sublimation""" + tau_v2g: float = 21600.0 + """graupel deposition -- make it a slow process""" + sat_adj0: float = 0.90 + """adjustment factor (0: no 1: full) during fast_sat_adj""" ql_gen: float = ( 1.0e-3 # max new cloud water during remapping step if fast_sat_adj = .t. ) - ql_mlt: float = ( - NamelistDefaults.ql_mlt - ) # max value of cloud water allowed from melted cloud ice - qs_mlt: float = NamelistDefaults.qs_mlt # max cloud water due to snow melt - ql0_max: float = ( - NamelistDefaults.ql0_max - ) # max cloud water value (auto converted to rain) - t_sub: float = NamelistDefaults.t_sub # min temp for sublimation of cloud ice - qi_gen: float = ( - NamelistDefaults.qi_gen - ) # max cloud ice generation during remapping step - qi_lim: float = ( - NamelistDefaults.qi_lim - ) # cloud ice limiter to prevent large ice build up - qi0_max: float = NamelistDefaults.qi0_max # max cloud ice value (by other sources) - rad_snow: bool = ( - NamelistDefaults.rad_snow - ) # consider snow in cloud fraction calculation - rad_rain: bool = ( - NamelistDefaults.rad_rain - ) # consider rain in cloud fraction calculation - rad_graupel: bool = ( - NamelistDefaults.rad_graupel - ) # consider graupel in cloud fraction calculation - tintqs: bool = ( - NamelistDefaults.tintqs - ) # use temperature in the saturation mixing in PDF - dw_ocean: float = NamelistDefaults.dw_ocean # base value for ocean - dw_land: float = ( - NamelistDefaults.dw_land - ) # base value for subgrid deviation / variability over land + ql_mlt: float = 2.0e-3 + """max value of cloud water allowed from melted cloud ice""" + qs_mlt: float = 1.0e-6 + """max cloud water due to snow melt""" + ql0_max: float = 2.0e-3 + """max cloud water value (auto converted to rain)""" + t_sub: float = 184.0 + """min temp for sublimation of cloud ice""" + qi_gen: float = 1.82e-6 + """max cloud ice generation during remapping step""" + qi_lim: float = 1.0 + """cloud ice limiter to prevent large ice build up""" + qi0_max: float = 1.0e-4 + """max cloud ice value (by other sources)""" + rad_snow: bool = True + """consider snow in cloud fraction calculation""" + rad_rain: bool = True + """consider rain in cloud fraction calculation""" + rad_graupel: bool = True + """consider graupel in cloud fraction calculation""" + tintqs: bool = False + """use temperature in the saturation mixing in PDF""" + dw_ocean: float = 0.10 + """base value for ocean""" + dw_land: float = 0.15 + """base value for subgrid deviation / variability over land""" # cloud scheme 0 - ? 
# 1: old fvgfs gfdl) mp implementation # 2: binary cloud scheme (0 / 1) - icloud_f: int = NamelistDefaults.icloud_f - cld_min: float = NamelistDefaults.cld_min # !< minimum cloud fraction - tau_l2v: float = ( - NamelistDefaults.tau_l2v - ) # cloud water to water vapor (evaporation) - tau_v2l: float = ( - NamelistDefaults.tau_v2l - ) # water vapor to cloud water (condensation) - c2l_ord: int = NamelistDefaults.c2l_ord - regional: bool = NamelistDefaults.regional - m_split: int = NamelistDefaults.m_split - convert_ke: bool = NamelistDefaults.convert_ke - breed_vortex_inline: bool = NamelistDefaults.breed_vortex_inline - use_old_omega: bool = NamelistDefaults.use_old_omega - rf_fast: bool = NamelistDefaults.rf_fast - adiabatic: bool = NamelistDefaults.adiabatic - nf_omega: int = NamelistDefaults.nf_omega - fv_sg_adj: int = NamelistDefaults.fv_sg_adj - n_sponge: int = NamelistDefaults.n_sponge - namelist_override: Optional[str] = None - - def __post_init__(self): + icloud_f: int = 0 + cld_min: float = 0.05 + """!< minimum cloud fraction""" + tau_l2v: float = 300.0 + """cloud water to water vapor (evaporation)""" + tau_v2l: float = 90.0 + """water vapor to cloud water (condensation)""" + c2l_ord: int = 4 + regional: bool = False + m_split: int = 0 + convert_ke: bool = False + breed_vortex_inline: bool = False + use_old_omega: bool = True + rf_fast: bool = False + adiabatic: bool = False + nf_omega: int = 1 + fv_sg_adj: int = -1 + n_sponge: int = 1 + sw_dynamics: bool = False + """shallow water conditions""" + namelist_override: str | None = None + target_nml_groups: tuple[str, ...] | None = DEFAULT_DYCORE_NML_GROUPS + + def __post_init__(self) -> None: if self.namelist_override is not None: try: f90_nml = f90nml.read(self.namelist_override) except FileNotFoundError: print(f"{self.namelist_override} does not exist") raise - dycore_config = self.from_f90nml(f90_nml) + # TODO: Find a better way to do below. Passing self.* as an argument + # to a class function of the same class is always a bit fishy. + dycore_config = self.from_f90nml(f90_nml, self.target_nml_groups) for var in dycore_config.__dict__.keys(): setattr(self, var, dycore_config.__dict__[var]) # Single tile cartesian grids @@ -291,102 +304,52 @@ def __post_init__(self): self.nf_omega = 0 @classmethod - def from_f90nml(cls, f90_namelist: f90nml.Namelist) -> "DynamicalCoreConfig": - namelist = Namelist.from_f90nml(f90_namelist) - return cls.from_namelist(namelist) + def from_f90nml( + cls, + nml: f90nml.Namelist, + target_groups: tuple[str, ...] | None = DEFAULT_DYCORE_NML_GROUPS, + ) -> DynamicalCoreConfig: + """Uses the nml to create a DynamicalCoreConfig. + + Args: + nml: f90nml.Namelist + target_groups: tuple[str,...] | None + This list will be used to specify which groups in the nml to + use when initializing the DynamicalCoreConfig. If None, all + groups will be used. 
(Default: DEFAULT_DYCORE_NML_GROUPS) + """ + groups = list(target_groups) if target_groups is not None else None + nml_dict = f90nml_as_dict(nml, flatten=True, target_groups=groups) + nml_dict["target_nml_groups"] = target_groups + return cls.from_dict(nml_dict) @classmethod - def from_namelist(cls, namelist: Namelist) -> "DynamicalCoreConfig": - return cls( - dt_atmos=namelist.dt_atmos, - a_imp=namelist.a_imp, - beta=namelist.beta, - consv_te=namelist.consv_te, - d2_bg=namelist.d2_bg, - d2_bg_k1=namelist.d2_bg_k1, - d2_bg_k2=namelist.d2_bg_k2, - d4_bg=namelist.d4_bg, - d_con=namelist.d_con, - d_ext=namelist.d_ext, - dddmp=namelist.dddmp, - delt_max=namelist.delt_max, - do_sat_adj=namelist.do_sat_adj, - do_vort_damp=namelist.do_vort_damp, - fill=namelist.fill, - hord_dp=namelist.hord_dp, - hord_mt=namelist.hord_mt, - hord_tm=namelist.hord_tm, - hord_tr=namelist.hord_tr, - hord_vt=namelist.hord_vt, - hydrostatic=namelist.hydrostatic, - k_split=namelist.k_split, - ke_bg=namelist.ke_bg, - kord_mt=namelist.kord_mt, - kord_tm=namelist.kord_tm, - kord_tr=namelist.kord_tr, - kord_wz=namelist.kord_wz, - n_split=namelist.n_split, - nord=namelist.nord, - npx=namelist.npx, - npy=namelist.npy, - npz=namelist.npz, - ntiles=namelist.ntiles, - nwat=namelist.nwat, - p_fac=namelist.p_fac, - rf_cutoff=namelist.rf_cutoff, - tau=namelist.tau, - vtdm4=namelist.vtdm4, - z_tracer=namelist.z_tracer, - do_qa=namelist.do_qa, - layout=namelist.layout, - grid_type=namelist.grid_type, - u_max=namelist.u_max, - do_f3d=namelist.do_f3d, - inline_q=namelist.inline_q, - do_skeb=namelist.do_skeb, - check_negative=namelist.check_negative, - tau_r2g=namelist.tau_r2g, - tau_smlt=namelist.tau_smlt, - tau_g2r=namelist.tau_g2r, - tau_imlt=namelist.tau_imlt, - tau_i2s=namelist.tau_i2s, - tau_l2r=namelist.tau_l2r, - tau_g2v=namelist.tau_g2v, - tau_v2g=namelist.tau_v2g, - sat_adj0=namelist.sat_adj0, - ql_gen=namelist.ql_gen, - ql_mlt=namelist.ql_mlt, - qs_mlt=namelist.qs_mlt, - ql0_max=namelist.ql0_max, - t_sub=namelist.t_sub, - qi_gen=namelist.qi_gen, - qi_lim=namelist.qi_lim, - qi0_max=namelist.qi0_max, - rad_snow=namelist.rad_snow, - rad_rain=namelist.rad_rain, - rad_graupel=namelist.rad_graupel, - tintqs=namelist.tintqs, - dw_ocean=namelist.dw_ocean, - dw_land=namelist.dw_land, - icloud_f=namelist.icloud_f, - cld_min=namelist.cld_min, - tau_l2v=namelist.tau_l2v, - tau_v2l=namelist.tau_v2l, - c2l_ord=namelist.c2l_ord, - regional=namelist.regional, - m_split=namelist.m_split, - convert_ke=namelist.convert_ke, - breed_vortex_inline=namelist.breed_vortex_inline, - use_old_omega=namelist.use_old_omega, - rf_fast=namelist.rf_fast, - adiabatic=namelist.adiabatic, - nf_omega=namelist.nf_omega, - fv_sg_adj=namelist.fv_sg_adj, - n_sponge=namelist.n_sponge, + def from_dict( + cls, + data: dict, + ) -> DynamicalCoreConfig: + """Create a DynamicalCoreConfig from the given data. + + Args: + data: "flattened" dictionary where the keys match the class member variables + """ + # NOTE: We're setting strict to False so that extra keys in the data are + # ignored. Eventually, we'd like to turn this to True once we move away from + # expecting dicts that are basically flattened yamls and f90nml files. 
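For readers following the config refactor, the intended call pattern is: read a namelist with `f90nml`, let `from_f90nml` flatten the selected groups, and use `from_dict` directly for an already-flattened mapping. A hedged usage sketch is below; the file name and values are invented, and it assumes `DynamicalCoreConfig` is re-exported from `pyFV3` as the `__all__` change above suggests (importing from `pyFV3._config` would also work).

```python
import f90nml
from pyFV3 import DynamicalCoreConfig  # assumed re-export; see __all__ above

# Hypothetical namelist path containing main_nml / coupler_nml / fv_core_nml groups.
nml = f90nml.read("input.nml")

# Default behavior: only the dycore-relevant groups are flattened into the config.
config = DynamicalCoreConfig.from_f90nml(nml)

# target_groups=None flattens every group instead; unknown keys are ignored
# because from_dict builds the dataclass with dacite's strict=False.
config_all = DynamicalCoreConfig.from_f90nml(nml, target_groups=None)

# from_dict accepts an already-flattened dict, e.g. loaded from YAML; the
# type hook converts the list into the tuple[int, int] layout field.
config_min = DynamicalCoreConfig.from_dict(
    {"npx": 13, "npy": 13, "npz": 79, "layout": [1, 1]}
)
```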
+ dacite_config = Config( + strict=False, + type_hooks={ + tuple[int, int]: lambda x: tuple(x), + tuple[str, ...]: lambda x: tuple(x) if x is not None else None, + }, + ) + dycore_config = from_dict( + data_class=DynamicalCoreConfig, data=data, config=dacite_config ) + return dycore_config @classmethod - def from_yaml(cls, yaml_config: str) -> "DynamicalCoreConfig": + def from_yaml(cls, yaml_config: str) -> DynamicalCoreConfig: config = cls() with open(yaml_config, "r") as f: raw_config = yaml.safe_load(f) diff --git a/pyFV3/dycore_state.py b/pyFV3/dycore_state.py index 9058e4d5..2dab7dac 100644 --- a/pyFV3/dycore_state.py +++ b/pyFV3/dycore_state.py @@ -1,10 +1,10 @@ from dataclasses import asdict, dataclass, field, fields -from typing import Any, Dict, Mapping, Union +from typing import Any, Dict, List, Mapping, Optional, Union import xarray as xr import ndsl.dsl.gt4py_utils as gt_utils -from ndsl import GridSizer, Quantity, QuantityFactory +from ndsl import Quantity, QuantityFactory from ndsl.constants import ( X_DIM, X_INTERFACE_DIM, @@ -16,6 +16,66 @@ from ndsl.dsl.typing import Float from ndsl.restart._legacy_restart import open_restart from ndsl.typing import Communicator +from pyFV3.tracers import TracersType, setup_tracers +from ndsl.quantity.field_bundle import FieldBundle + + +DEFAULT_TRACER_PROPERTIES = { + "specific_humidity": { + "pyFV3_key": "vapor", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "sphum", + "units": "g/kg", + }, + "cloud_liquid_water_mixing_ratio": { + "pyFV3_key": "liquid", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "liq_wat", + "units": "g/kg", + }, + "cloud_ice_mixing_ratio": { + "pyFV3_key": "ice", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "ice_wat", + "units": "g/kg", + }, + "rain_mixing_ratio": { + "pyFV3_key": "rain", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "rainwat", + "units": "g/kg", + }, + "snow_mixing_ratio": { + "pyFV3_key": "snow", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "snowwat", + "units": "g/kg", + }, + "graupel_mixing_ratio": { + "pyFV3_key": "graupel", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "graupel", + "units": "g/kg", + }, + "ozone_mixing_ratio": { + "pyFV3_key": "o3mr", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "o3mr", + "units": "g/kg", + }, + "turbulent_kinetic_energy": { + "pyFV3_key": "sgs_tke", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "sgs_tke", + "units": "g/kg", + }, + "cloud_fraction": { + "pyFV3_key": "cloud", + "dims": [Z_DIM, Y_DIM, X_DIM], + "restart_name": "cld_amt", + "units": "g/kg", + }, +} @dataclass() @@ -148,74 +208,10 @@ class DycoreState: "intent": "inout", } ) - qvapor: Quantity = field( - metadata={ - "name": "specific_humidity", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - } - ) - qliquid: Quantity = field( - metadata={ - "name": "cloud_water_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - "intent": "inout", - } - ) - qice: Quantity = field( - metadata={ - "name": "cloud_ice_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - "intent": "inout", - } - ) - qrain: Quantity = field( - metadata={ - "name": "rain_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - "intent": "inout", - } - ) - qsnow: Quantity = field( + tracers: TracersType = field( metadata={ - "name": "snow_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - "intent": "inout", - } - ) - qgraupel: Quantity = field( - metadata={ - "name": "graupel_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], 
- "units": "kg/kg", - "intent": "inout", - } - ) - qo3mr: Quantity = field( - metadata={ - "name": "ozone_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - "intent": "inout", - } - ) - qsgs_tke: Quantity = field( - metadata={ - "name": "turbulent_kinetic_energy", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "m**2/s**2", - "intent": "inout", - } - ) - qcld: Quantity = field( - metadata={ - "name": "cloud_fraction", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "", + "name": "tracers", + "units": "g/kg", "intent": "inout", } ) @@ -297,6 +293,8 @@ class DycoreState: def __post_init__(self): for _field in fields(self): + if _field.name == "tracers": + continue for check_name in ["units", "dims"]: if check_name in _field.metadata: required = _field.metadata[check_name] @@ -310,22 +308,46 @@ def __post_init__(self): ) @classmethod - def init_zeros(cls, quantity_factory: QuantityFactory): + def init_zeros( + cls, + quantity_factory: QuantityFactory, + tracer_count: int, + dtype_dict: Optional[Dict[str, type]] = None, + allow_mismatch_float_precision=False, + ): initial_storages = {} for _field in fields(cls): if "dims" in _field.metadata.keys(): initial_storages[_field.name] = quantity_factory.zeros( _field.metadata["dims"], _field.metadata["units"], - dtype=Float, + dtype=dtype_dict[_field.name] + if dtype_dict and _field.name in dtype_dict.keys() + else Float, # type: ignore + allow_mismatch_float_precision=allow_mismatch_float_precision, + ).data + elif _field.name == "tracers": + qty_factory_tracers = FieldBundle.extend_3D_quantity_factory( + quantity_factory, {"tracers": tracer_count} + ) + initial_storages[_field.name] = qty_factory_tracers.zeros( + [X_DIM, Y_DIM, Z_DIM, "tracers"], + _field.metadata["units"], + dtype=Float, # type: ignore ).data return cls.init_from_storages( - storages=initial_storages, sizer=quantity_factory.sizer + storages=initial_storages, + quantity_factory=quantity_factory, + allow_mismatch_float_precision=allow_mismatch_float_precision, ) @classmethod def init_from_numpy_arrays( - cls, dict_of_numpy_arrays, sizer: GridSizer, backend: str + cls, + dict_of_numpy_arrays, + quantity_factory: QuantityFactory, + backend: str, + tracer_list: List[str], ): field_names = [_field.name for _field in fields(cls)] for variable_name in dict_of_numpy_arrays.keys(): @@ -341,10 +363,22 @@ def init_from_numpy_arrays( dict_of_numpy_arrays[_field.name], dims, _field.metadata["units"], - origin=sizer.get_origin(dims), - extent=sizer.get_extent(dims), + origin=quantity_factory.sizer.get_origin(dims), + extent=quantity_factory.sizer.get_extent(dims), gt4py_backend=backend, ) + elif issubclass(_field.type, Tracers): + if len(dict_of_numpy_arrays[_field.name]) != len(tracer_list): + raise ValueError( + "[pyFV3] DycoreState init:" + f" tracer list size ({len(tracer_list)})" + " doesn't match the inputs size" + f" ({len(dict_of_numpy_arrays[_field.name])})" + ) + dict_state[_field.name] = Tracers.make( + quantity_factory=quantity_factory, + tracer_mapping=tracer_list, + ) state = cls(**dict_state) # type: ignore return state @@ -352,9 +386,10 @@ def init_from_numpy_arrays( def init_from_storages( cls, storages: Mapping[str, Any], - sizer: GridSizer, + quantity_factory: QuantityFactory, bdt: float = 0.0, mdt: float = 0.0, + allow_mismatch_float_precision=False, ): inputs = {} for _field in fields(cls): @@ -364,10 +399,16 @@ def init_from_storages( storages[_field.name], dims, _field.metadata["units"], - origin=sizer.get_origin(dims), - extent=sizer.get_extent(dims), + 
origin=quantity_factory.sizer.get_origin(dims), + extent=quantity_factory.sizer.get_extent(dims), + allow_mismatch_float_precision=allow_mismatch_float_precision, ) inputs[_field.name] = quantity + elif "tracers" == _field.name: + tracers = setup_tracers(storages["tracers"].shape[3], quantity_factory) + tracers.quantity.data[:] = storages["tracers"][:] + inputs[_field.name] = tracers + return cls(**inputs, bdt=bdt, mdt=mdt) @classmethod @@ -381,10 +422,12 @@ def from_fortran_restart( state_dict: Mapping[str, Quantity] = open_restart( dirname=path, communicator=communicator, - tracer_properties=TRACER_PROPERTIES, + tracer_properties=DEFAULT_TRACER_PROPERTIES, + ) + new = cls.init_zeros( + quantity_factory=quantity_factory, + tracer_count=len(DEFAULT_TRACER_PROPERTIES), ) - - new = cls.init_zeros(quantity_factory=quantity_factory) new.pt.view[:] = new.pt.np.asarray( state_dict["air_temperature"].transpose(new.pt.dims).view[:] ) @@ -405,31 +448,33 @@ def from_fortran_restart( new.v.view[:] = new.v.np.asarray( state_dict["y_wind"].transpose(new.v.dims).view[:] ) - new.qvapor.view[:] = new.qvapor.np.asarray( - state_dict["specific_humidity"].transpose(new.qvapor.dims).view[:] + new.tracers.vapor.view[:] = new.tracers.vapor.np.asarray( + state_dict["specific_humidity"].transpose(new.tracers.vapor.dims).view[:] ) - new.qliquid.view[:] = new.qliquid.np.asarray( + new.tracers.liquid.view[:] = new.tracers.liquid.np.asarray( state_dict["cloud_liquid_water_mixing_ratio"] - .transpose(new.qliquid.dims) + .transpose(new.tracers.liquid.dims) .view[:] ) - new.qice.view[:] = new.qice.np.asarray( - state_dict["cloud_ice_mixing_ratio"].transpose(new.qice.dims).view[:] + new.tracers.ice.view[:] = new.tracers.ice.np.asarray( + state_dict["cloud_ice_mixing_ratio"].transpose(new.tracers.ice.dims).view[:] ) - new.qrain.view[:] = new.qrain.np.asarray( - state_dict["rain_mixing_ratio"].transpose(new.qrain.dims).view[:] + new.tracers.rain.view[:] = new.tracers.rain.np.asarray( + state_dict["rain_mixing_ratio"].transpose(new.tracers.rain.dims).view[:] ) - new.qsnow.view[:] = new.qsnow.np.asarray( - state_dict["snow_mixing_ratio"].transpose(new.qsnow.dims).view[:] + new.tracers.snow.view[:] = new.tracers.snow.np.asarray( + state_dict["snow_mixing_ratio"].transpose(new.tracers.snow.dims).view[:] ) - new.qgraupel.view[:] = new.qgraupel.np.asarray( - state_dict["graupel_mixing_ratio"].transpose(new.qgraupel.dims).view[:] + new.tracers.graupel.view[:] = new.tracers.graupel.np.asarray( + state_dict["graupel_mixing_ratio"] + .transpose(new.tracers.graupel.dims) + .view[:] ) - new.qo3mr.view[:] = new.qo3mr.np.asarray( - state_dict["ozone_mixing_ratio"].transpose(new.qo3mr.dims).view[:] + new.tracers.o3mr.view[:] = new.tracers.o3mr.np.asarray( + state_dict["ozone_mixing_ratio"].transpose(new.tracers.o3mr.dims).view[:] ) - new.qcld.view[:] = new.qcld.np.asarray( - state_dict["cloud_fraction"].transpose(new.qcld.dims).view[:] + new.tracers.cloud.view[:] = new.tracers.cld.np.asarray( + state_dict["cloud_fraction"].transpose(new.tracers.cld.dims).view[:] ) new.delz.view[:] = new.delz.np.asarray( state_dict["vertical_thickness_of_atmospheric_layer"] @@ -439,21 +484,32 @@ def from_fortran_restart( return new + def _xr_dataarray_from_quantity(self, name: str, metadata: Dict[str, Any], data): + dims = [f"{dim_name}_{name}" for dim_name in metadata["dims"]] + return xr.DataArray( + gt_utils.asarray(data), + dims=dims, + attrs={ + "long_name": metadata["name"], + "units": metadata.get("units", "unknown"), + }, + ) + @property def 
xr_dataset(self): data_vars = {} for name, field_info in self.__dataclass_fields__.items(): if issubclass(field_info.type, Quantity): - dims = [ - f"{dim_name}_{name}" for dim_name in field_info.metadata["dims"] - ] - data_vars[name] = xr.DataArray( - gt_utils.asarray(getattr(self, name).data), - dims=dims, - attrs={ - "long_name": field_info.metadata["name"], - "units": field_info.metadata.get("units", "unknown"), - }, + data_vars[name] = self._xr_dataarray_from_quantity( + name=name, + metadata=field_info.metadata, + data=getattr(self, name).data, + ) + if isinstance(field_info.type, FieldBundle): + data_vars[name] = self._xr_dataarray_from_quantity( + name=name, + metadata=field_info.metadata, + data=getattr(self, name).quantity.data, ) return xr.Dataset(data_vars=data_vars) @@ -465,52 +521,3 @@ def as_dict(self, quantity_only=True) -> Dict[str, Union[Quantity, int]]: return {k: v for k, v in asdict(self).items() if isinstance(v, Quantity)} else: return {k: v for k, v in asdict(self).items()} - - -TRACER_PROPERTIES = { - "specific_humidity": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "sphum", - "units": "g/kg", - }, - "cloud_liquid_water_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "liq_wat", - "units": "g/kg", - }, - "cloud_ice_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "ice_wat", - "units": "g/kg", - }, - "rain_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "rainwat", - "units": "g/kg", - }, - "snow_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "snowwat", - "units": "g/kg", - }, - "graupel_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "graupel", - "units": "g/kg", - }, - "ozone_mixing_ratio": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "o3mr", - "units": "g/kg", - }, - "turbulent_kinetic_energy": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "sgs_tke", - "units": "g/kg", - }, - "cloud_fraction": { - "dims": [Z_DIM, Y_DIM, X_DIM], - "restart_name": "cld_amt", - "units": "g/kg", - }, -} diff --git a/pyFV3/initialization/test_cases/initialize_baroclinic.py b/pyFV3/initialization/test_cases/initialize_baroclinic.py index 17196057..1a4a92eb 100644 --- a/pyFV3/initialization/test_cases/initialize_baroclinic.py +++ b/pyFV3/initialization/test_cases/initialize_baroclinic.py @@ -339,9 +339,11 @@ def init_baroclinic_state( ) state = DycoreState.init_from_numpy_arrays( numpy_state.__dict__, - sizer=quantity_factory.sizer, + quantity_factory=quantity_factory, backend=sample_quantity.metadata.gt4py_backend, + tracer_list=["vapor", "liquid", "rain", "snow", "ice", "graupel", "cloud"], ) + state.tracers["vapor"].view[:] = numpy_state.qvapor[slice_3d] comm.halo_update(state.phis, n_points=NHALO) diff --git a/pyFV3/initialization/test_cases/initialize_tc.py b/pyFV3/initialization/test_cases/initialize_tc.py index 38a4a46c..a402bd48 100644 --- a/pyFV3/initialization/test_cases/initialize_tc.py +++ b/pyFV3/initialization/test_cases/initialize_tc.py @@ -561,7 +561,6 @@ def init_tc_state( numpy_state.pkz[:] = pkz numpy_state.ps[:] = pe[:, :, -1] numpy_state.pt[:] = pt - numpy_state.qvapor[:] = qvapor numpy_state.u[:] = ud numpy_state.ua[:] = ua numpy_state.v[:] = vd @@ -569,8 +568,10 @@ def init_tc_state( numpy_state.w[:] = w state = DycoreState.init_from_numpy_arrays( numpy_state.__dict__, - sizer=quantity_factory.sizer, + quantity_factory=quantity_factory, backend=sample_quantity.metadata.gt4py_backend, + tracer_list=["vapor", "liquid", "rain", "snow", "ice", 
"graupel", "cloud"], ) + state.tracers["vapor"].view[:] = qvapor return state diff --git a/pyFV3/mpi/mpp_sum.py b/pyFV3/mpi/mpp_sum.py new file mode 100644 index 00000000..0a0ea019 --- /dev/null +++ b/pyFV3/mpi/mpp_sum.py @@ -0,0 +1,127 @@ +import numpy as np +from ndsl import Quantity, StencilFactory +from ndsl.dsl.typing import Float +from ndsl.comm.communicator import Communicator, ReductionOperator + + +def _increment_ints_faster(int_sum, pr, I_pr, r, max_mag_term): + if (r >= 1e30) == r < 1e30: + print("NaN_error") + return + sgn = 1 + if r < 0.0: + sgn = -1 + + rs = abs(r) + if rs > abs(max_mag_term): + max_mag_term = r + + for i in range(len(I_pr)): + ival = int(rs * I_pr[i]) + rs = rs - ival * pr[i] + int_sum[i] = int_sum[i] + sgn * ival + + +def _carry_overflow(int_sum, prec, I_prec, prec_error): + for i in range(len(int_sum) - 1, 0, -1): + if abs(int_sum[i]) > prec: + num_carry = int(int_sum[i] * I_prec) + int_sum[i] = int_sum[i] - num_carry * prec + int_sum[i - 1] = int_sum[i - 1] + num_carry + if abs(int_sum[0]) > prec_error: + overflow_error = True + + +def _regularize_ints(int_sum, prec, I_prec): + for i in range(len(int_sum) - 1, 0, -1): + if abs(int_sum[i]) > prec: + num_carry = int(int_sum[i] * I_prec) + int_sum[i] = int_sum[i] - num_carry * prec + int_sum[i - 1] = int_sum[i - 1] + num_carry + + positive = True + + for i in range(len(int_sum)): + if abs(int_sum[i]) > 0: + if int_sum[i] < 0: + positive = False + break + + if positive: + for i in range(len(int_sum) - 1, 0, -1): + if int_sum[i] < 0: + int_sum[i] = int_sum[i] + prec + int_sum[i - 1] = int_sum[i - 1] - 1 + + else: + for i in range(len(int_sum) - 1, 0, -1): + if int_sum[i] > 0: + int_sum[i] = int_sum[i] - prec + int_sum[i - 1] = int_sum[i - 1] + 1 + + +def _ints_to_real(ints, pr): + r = 0.0 + + for i in range(len(ints)): + r = r + pr[i] * ints[i] + + return r + + +class MPPGlobalSum: + def __init__( + self, stencil_factory: StencilFactory, communicator: Communicator + ) -> None: + NUMINT = 6 + self._comm = communicator + self._ints_sum = Quantity( + data=np.zeros((NUMINT), dtype=Float), + dims=["K"], + units="dunno", + gt4py_backend=stencil_factory.backend, + ) + + self._ints_sum_reduce = Quantity( + data=np.zeros((NUMINT), dtype=Float), + dims=["K"], + units="dunno", + gt4py_backend=stencil_factory.backend, + ) + + def __call__(self, qty_to_sum: Quantity) -> Float: + NUMBIT = 46 + r_prec = 2.0**NUMBIT + prec = 2**NUMBIT + I_prec = 1.0 / (2.0**NUMBIT) + pr = [ + r_prec**2, + r_prec, + 1.0, + 1.0 / r_prec, + 1.0 / r_prec**2, + 1.0 / r_prec**3, + ] + I_pr = [1.0 / r_prec**2, 1.0 / r_prec, 1.0, r_prec, r_prec**2, r_prec**3] + prec_error = (2**62 + (2**62 - 1)) / 6 + mag_max_term = 0.0 + + # Note: This loop range in i and j are for the TBC test case. 
+ self._ints_sum.data[:] = 0 + for j in range(qty_to_sum.field.shape[1]): + for i in range(qty_to_sum.field.shape[0]): + _increment_ints_faster( + self._ints_sum.data, pr, I_pr, qty_to_sum.field[i, j], mag_max_term + ) + + _carry_overflow(self._ints_sum.data, prec, I_prec, prec_error) + + self._comm.all_reduce( + self._ints_sum, + ReductionOperator.SUM, + self._ints_sum_reduce, + ) + + _regularize_ints(self._ints_sum_reduce.data, prec, I_prec) + + return _ints_to_real(self._ints_sum_reduce.data, pr) diff --git a/pyFV3/mpi/sum.py b/pyFV3/mpi/sum.py new file mode 100644 index 00000000..e5697326 --- /dev/null +++ b/pyFV3/mpi/sum.py @@ -0,0 +1,44 @@ +import numpy as np + +from ndsl import Quantity, QuantityFactory +from ndsl.comm.communicator import Communicator, ReductionOperator +from ndsl.constants import X_DIM, Y_DIM +from ndsl.dsl.dace.orchestration import dace_inhibitor +from ndsl.dsl.stencil import GridIndexing +from ndsl.dsl.typing import Float +from ndsl.optional_imports import cupy as cp + + +class GlobalSum: + def __init__( + self, + quantity_factory: QuantityFactory, + communicator: Communicator, + grid_indexing: GridIndexing = None, + ) -> None: + self._comm = communicator + # self._tmp_reduce = quantity_factory.empty(dims=[X_DIM, Y_DIM], units="n/a") + self._tmp_reduce = quantity_factory.zeros(dims=[X_DIM, Y_DIM], units="n/a") + self._isc = grid_indexing.isc + self._iec = grid_indexing.iec + self._jsc = grid_indexing.jsc + self._jec = grid_indexing.jec + + @dace_inhibitor + def __call__(self, qty_to_sum: Quantity) -> Float: + assert len(qty_to_sum.field.shape) == 2 # Code handle only 2D quantity + self._comm.all_reduce(qty_to_sum, ReductionOperator.SUM, self._tmp_reduce) + if isinstance(self._tmp_reduce.data, np.ndarray): + return np.sum( + self._tmp_reduce.data[ + self._isc : self._iec + 1, self._jsc : self._jec + 1 + ] + ) + elif isinstance(self._tmp_reduce.data, cp.ndarray) and cp is not None: + return cp.sum( + self._tmp_reduce.data[ + self._isc : self._iec + 1, self._jsc : self._jec + 1 + ] + ) + else: + raise TypeError("Unsupported array type for reduction result.") diff --git a/pyFV3/stencils/a2b_ord4.py b/pyFV3/stencils/a2b_ord4.py index 328d3142..5f9a9fe6 100644 --- a/pyFV3/stencils/a2b_ord4.py +++ b/pyFV3/stencils/a2b_ord4.py @@ -1,3 +1,4 @@ +import xarray as xr import gt4py.cartesian.gtscript as gtscript from gt4py.cartesian.gtscript import ( PARALLEL, @@ -13,29 +14,43 @@ from ndsl import GridIndexing, QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM -from ndsl.dsl.typing import Float, FloatField, FloatFieldI, FloatFieldIJ +from ndsl.dsl.typing import ( + Float, + FloatField, + FloatFieldI64, + FloatFieldIJ, + FloatFieldIJ64, +) from ndsl.grid import GridData from ndsl.stencils.basic_operations import copy_defn # comact 4-pt cubic interpolation -c1 = 2.0 / 3.0 -c2 = -1.0 / 6.0 -d1 = 0.375 -d2 = -1.0 / 24.0 +c1 = Float(2.0) / Float(3.0) +c2 = Float(-1.0) / Float(6.0) +d1 = Float(0.375) +d2 = Float(-1.0) / Float(24.0) # PPM volume mean form -b1 = 7.0 / 12.0 -b2 = -1.0 / 12.0 +b1 = Float(7.0) / Float(12.0) # 0.58333333 +b2 = Float(-1.0) / Float(12.0) # 4-pt Lagrange interpolation -a1 = 9.0 / 16.0 -a2 = -1.0 / 16.0 +a1 = Float(0.5625) # 9/16 +a2 = Float(-0.0625) # -1/16 + +r3 = Float(1.0 / 3.0) @gtscript.function def great_circle_dist(p1a, p1b, p2a, p2b): - tb = sin((p1b - p2b) / 2.0) ** 2.0 - ta = sin((p1a - p2a) / 2.0) ** 2.0 - return asin(sqrt(tb + cos(p1b) * cos(p2b) * ta)) * 2.0 + return ( + 
asin(
+            sqrt(
+                sin((p1b - p2b) / 2.0) ** 2.0
+                + cos(p1b) * cos(p2b) * sin((p1a - p2a) / 2.0) ** 2.0
+            )
+        )
+        * 2.0
+    )
@@ -105,7 +120,7 @@ def _sw_corner(
         qin[1, -2, 0],
     )
-    qout = (ec1 + ec2 + ec3) * (1.0 / 3.0)
+    qout = (ec1 + ec2 + ec3) * r3
     tmp_qout_edges = qout
@@ -159,7 +174,7 @@ def _nw_corner(
         qin[0, 0, 0],
         qin[1, 1, 0],
     )
-    qout = (ec1 + ec2 + ec3) * (1.0 / 3.0)
+    qout = (ec1 + ec2 + ec3) * r3
     tmp_qout_edges = qout
@@ -213,7 +228,7 @@ def _ne_corner(
         qin[-1, 0, 0],
         qin[-2, 1, 0],
     )
-    qout = (ec1 + ec2 + ec3) * (1.0 / 3.0)
+    qout = (ec1 + ec2 + ec3) * r3
     tmp_qout_edges = qout
@@ -267,7 +282,7 @@ def _se_corner(
         qin[0, 0, 0],
         qin[1, 1, 0],
     )
-    qout = (ec1 + ec2 + ec3) * (1.0 / 3.0)
+    qout = (ec1 + ec2 + ec3) * r3
     tmp_qout_edges = qout
@@ -284,7 +299,7 @@ def lagrange_x_func(qy):
 def qout_x_edge(
     qin: FloatField,
     dxa: FloatFieldIJ,
-    edge_w: FloatFieldIJ,
+    edge_w: FloatFieldIJ64,
     qout: FloatField,
     tmp_qout_edges: FloatField,
 ):
@@ -305,7 +320,7 @@ def qout_x_edge(
 def qout_y_edge(
     qin: FloatField,
     dya: FloatFieldIJ,
-    edge_s: FloatFieldI,
+    edge_s: FloatFieldI64,
     qout: FloatField,
     tmp_qout_edges: FloatField,
 ):
@@ -510,11 +525,11 @@ def doubly_periodic_a2b_ord4(qin):
     Grid conversion is much simpler on a doubly-periodic, orthogonal grid
     so we can bypass most of the above code
     """
-    qx = b1 * (qin[-1, 0, 0] + qin) + b2 * (qin[-2, 0, 0] + qin[1, 0, 0])
-    qy = b1 * (qin[0, -1, 0] + qin) + b2 * (qin[0, -2, 0] + qin[0, 1, 0])
+    qx = b2 * (qin[-2, 0, 0] + qin[1, 0, 0]) + b1 * (qin[-1, 0, 0] + qin)
+    qy = b2 * (qin[0, -2, 0] + qin[0, 1, 0]) + b1 * (qin[0, -1, 0] + qin)
     qout = 0.5 * (
-        a1 * (qx[0, -1, 0] + qx + qy[-1, 0, 0] + qy)
-        + a2 * (qx[0, -2, 0] + qx[0, 1, 0] + qy[-2, 0, 0] + qy[1, 0, 0])
+        a2 * (qx[0, -2, 0] + qx[0, 1, 0] + qy[-2, 0, 0] + qy[1, 0, 0])
+        + a1 * (qx[0, -1, 0] + qx + qy[-1, 0, 0] + qy)
     )
     return qout
diff --git a/pyFV3/stencils/c_sw.py b/pyFV3/stencils/c_sw.py
index 9208e472..39b58969 100644
--- a/pyFV3/stencils/c_sw.py
+++ b/pyFV3/stencils/c_sw.py
@@ -5,6 +5,8 @@
     horizontal,
     interval,
     region,
+    I,
+    J,
 )
 from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate
@@ -159,7 +161,8 @@ def divergence_corner(
         )
         vf0 = v * dxc * 0.5 * (sin_sg3[-1, 0] + sin_sg1)
         uf0 = u * dyc * 0.5 * (sin_sg4[0, -1] + sin_sg2)
-        divg_d = (-vf0 + uf1 - uf0) * rarea_c
+        divg_d = vf1 - vf0 + uf1 - uf0
+        divg_d = rarea_c * (divg_d - vf1)
     with horizontal(region[i_end + 1, j_end + 1], region[i_start, j_end + 1]):
         vf1 = (
@@ -169,8 +172,8 @@
             u[-1, 0, 0] * dyc[-1, 0] * 0.5 * (sin_sg4[-1, -1] + sin_sg2[-1, 0])
         )
         uf0 = u * dyc * 0.5 * (sin_sg4[0, -1] + sin_sg2)
-        divg_d = (vf1 + uf1 - uf0) * rarea_c
-
+        divg_d = vf1 - vf0 + uf1 - uf0
+        divg_d = rarea_c * (divg_d + vf0)
     # ---------
@@ -378,7 +381,8 @@ def transportdelp_update_vorticity_and_kineticenergy(
     with horizontal(region[i_end + 1, :], region[i_start, :]):
         ke = ke * sin_sg1 + v * cos_sg1 if ua > 0.0 else ke
-    ke = 0.5 * dt2 * (ua * ke + va * vort)
+    dt4 = 0.5 * dt2
+    ke = dt4 * (ua * ke + va * vort)
 def circulation_cgrid(
@@ -400,18 +404,18 @@ def circulation_cgrid(
     from __externals__ import i_end, i_start, j_end, j_start
     with computation(PARALLEL), interval(...):
-        fx = dxc * uc
-        fy = dyc * vc
-        # fx1 and fy1 are the shifted versions of fx and fy and are defined
-        # because temporaries are not allowed to be accessed with offsets in regions.
- fx1 = dxc[0, -1] * uc[0, -1, 0] - fy1 = dyc[-1, 0] * vc[-1, 0, 0] - - vort_c = fx1 - fx - fy1 + fy + fx = uc * dxc + fy = vc * dyc + + vort_c = fx[J - 1] - fx - fy[I - 1] + fy + + # Remove the extra term at the corners + # WEST with horizontal(region[i_start, j_start], region[i_start, j_end + 1]): - vort_c = fx1 - fx + fy + vort_c = vort_c + (vc[I - 1] * dyc[I - 1]) + # EAST with horizontal(region[i_end + 1, j_start], region[i_end + 1, j_end + 1]): - vort_c = fx1 - fx - fy1 + vort_c = vort_c - fy def absolute_vorticity(vort: FloatField, fC: FloatFieldIJ, rarea_c: FloatFieldIJ): diff --git a/pyFV3/stencils/compute_total_energy.py b/pyFV3/stencils/compute_total_energy.py new file mode 100644 index 00000000..bb4f0283 --- /dev/null +++ b/pyFV3/stencils/compute_total_energy.py @@ -0,0 +1,165 @@ +from ndsl import StencilFactory, QuantityFactory, orchestrate +from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ +from ndsl.constants import X_DIM, Y_DIM, Z_DIM, Z_INTERFACE_DIM, GRAV +from pyFV3._config import DynamicalCoreConfig +from pyFV3.tracers import TracersType +from pyFV3.stencils.moist_cv import moist_cv_nwat6_fn +from gt4py.cartesian.gtscript import ( + BACKWARD, + FORWARD, + interval, + computation, + K, +) +from ndsl.grid import GridData + + +def _compute_total_energy__stencil( + hs: FloatFieldIJ, # type: ignore + delp: FloatField, # type: ignore + delz: FloatField, # type: ignore + qc: FloatField, # type:ignore + pt: FloatField, # type: ignore + u: FloatField, # type: ignore + v: FloatField, # type: ignore + w: FloatField, # type: ignore + qvapor: FloatField, # type: ignore + qliquid: FloatField, # type: ignore + qrain: FloatField, # type: ignore + qsnow: FloatField, # type: ignore + qice: FloatField, # type: ignore + qgraupel: FloatField, # type: ignore + rsin2: FloatFieldIJ, # type: ignore + cosa_s: FloatFieldIJ, # type: ignore + phyz: FloatField, # type: ignore + te_2d: FloatFieldIJ, # type: ignore +): + """ + Dev Note: this is _very_ close to moist_cv.moist_te. The only numerical differences + is that the te/te_2d computation as an extra (1.+qc(i,j,k))*(1.-qd(i)) + + Args: + hs(in): + delp(in): + delz(in): + pt(in): + qc(in): + u(in): + v(in): + w(in): + qvapor(in): + qliquid(in): + qrain(in): + qsnow(in): + qice(in): + qgraupel(in): + rsin2(in): + cosa_s(in): + phyz(inout): + te_2d(out): + """ + + with computation(BACKWARD), interval(-1, None): + te_2d = 0.0 + phis = hs + with computation(BACKWARD), interval(0, -1): + phis = phis[K + 1] - GRAV * delz + with computation(FORWARD), interval(0, -1): + cvm, qd = moist_cv_nwat6_fn(qvapor, qliquid, qrain, qsnow, qice, qgraupel) + + te_2d = te_2d + delp * ( + cvm * pt * (1.0 + qc) * (1.0 - qd) + + 0.5 + * ( + phis + + phis[0, 0, 1] + + w**2.0 + + 0.5 + * rsin2 + * ( + u**2.0 + + u[0, 1, 0] ** 2.0 + + v**2.0 + + v[1, 0, 0] ** 2.0 + - (u + u[0, 1, 0]) * (v + v[1, 0, 0]) * cosa_s + ) + ) + ) + + +class ComputeTotalEnergy: + """Compute total energy performs the FV3-consistent + computation of the global total energy. + + It includes the potential, internal (latent and sensible heat), kinetic terms.""" + + def __init__( + self, + config: DynamicalCoreConfig, + stencil_factory: StencilFactory, + quantity_factory: QuantityFactory, + grid_data: GridData, + ) -> None: + orchestrate( + obj=self, + config=stencil_factory.config.dace_config, + dace_compiletime_args=["tracers"], + ) + if config.hydrostatic: + raise NotImplementedError( + "Dynamics (Compute Total Energy): " + " hydrostatic option is not implemented." 
+ ) + + if not config.moist_phys: + raise NotImplementedError( + "Dynamics (Compute Total Energy): " + " moist_phys=False option is not implemented." + ) + + self._phyz = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="Unknown", + dtype=Float, + ) + + self._compute_total_energy = stencil_factory.from_dims_halo( + func=_compute_total_energy__stencil, + compute_dims=[X_DIM, Y_DIM, Z_INTERFACE_DIM], + ) + self._rsin2 = grid_data.rsin2 + self._cosa_s = grid_data.cosa_s + + def __call__( + self, + hs: FloatFieldIJ, # type: ignore + delp: FloatField, # type: ignore + delz: FloatField, # type: ignore + qc: FloatField, # type:ignore + pt: FloatField, # type: ignore + u: FloatField, # type: ignore + v: FloatField, # type: ignore + w: FloatField, # type: ignore + tracers: TracersType, + te_2d: FloatFieldIJ, # type: ignore + ) -> None: + self._compute_total_energy( + hs=hs, + delp=delp, + delz=delz, + qc=qc, + pt=pt, + u=u, + v=v, + w=w, + qvapor=tracers.vapor, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + rsin2=self._rsin2, + cosa_s=self._cosa_s, + phyz=self._phyz, + te_2d=te_2d, + ) diff --git a/pyFV3/stencils/corners.py b/pyFV3/stencils/corners.py new file mode 100644 index 00000000..266627df --- /dev/null +++ b/pyFV3/stencils/corners.py @@ -0,0 +1,169 @@ +from ndsl import StencilFactory, orchestrate +from ndsl.dsl.typing import FloatField + + +def corner_copy_x(field_to_copy): + """Equivalent to the copy_corners_x functions in fortran. + + This is written to operate on plain ndarrarys and not use the GT4Py framework. + This choice was made because we've seen a lot of performance left on the table using + orchestration without explicitly describing the operations but rather have full 3d- + sweeps with conditionals. + Since DaCe can handle (simple) operations on ndarrays directly this gives us a more + explicit entrypoint to the language and more optimization-potential. + + Args: + field_to_copy (ndarray): field to apply the corner copy on. 
+ This is explicitly not type-hinted for orchestration + """ + field_to_copy[0, 0] = field_to_copy[0, 5] + field_to_copy[0, 1] = field_to_copy[1, 5] + field_to_copy[0, 2] = field_to_copy[2, 5] + + field_to_copy[1, 0] = field_to_copy[0, 4] + field_to_copy[1, 1] = field_to_copy[1, 4] + field_to_copy[1, 2] = field_to_copy[2, 4] + + field_to_copy[2, 0] = field_to_copy[0, 3] + field_to_copy[2, 1] = field_to_copy[1, 3] + field_to_copy[2, 2] = field_to_copy[2, 3] + + field_to_copy[0, -4] = field_to_copy[2, -7] + field_to_copy[0, -3] = field_to_copy[1, -7] + field_to_copy[0, -2] = field_to_copy[0, -7] + + field_to_copy[1, -4] = field_to_copy[2, -6] + field_to_copy[1, -3] = field_to_copy[1, -6] + field_to_copy[1, -2] = field_to_copy[0, -6] + + field_to_copy[2, -4] = field_to_copy[2, -5] + field_to_copy[2, -3] = field_to_copy[1, -5] + field_to_copy[2, -2] = field_to_copy[0, -5] + + field_to_copy[-4, 0] = field_to_copy[-2, 3] + field_to_copy[-4, 1] = field_to_copy[-3, 3] + field_to_copy[-4, 2] = field_to_copy[-4, 3] + + field_to_copy[-3, 0] = field_to_copy[-2, 4] + field_to_copy[-3, 1] = field_to_copy[-3, 4] + field_to_copy[-3, 2] = field_to_copy[-4, 4] + + field_to_copy[-2, 0] = field_to_copy[-2, 5] + field_to_copy[-2, 1] = field_to_copy[-3, 5] + field_to_copy[-2, 2] = field_to_copy[-4, 5] + + field_to_copy[-4, -2] = field_to_copy[-2, -5] + field_to_copy[-4, -3] = field_to_copy[-3, -5] + field_to_copy[-4, -4] = field_to_copy[-4, -5] + + field_to_copy[-3, -2] = field_to_copy[-2, -6] + field_to_copy[-3, -3] = field_to_copy[-3, -6] + field_to_copy[-3, -4] = field_to_copy[-4, -6] + + field_to_copy[-2, -2] = field_to_copy[-2, -7] + field_to_copy[-2, -3] = field_to_copy[-3, -7] + field_to_copy[-2, -4] = field_to_copy[-4, -7] + + +def corner_copy_y(field_to_copy): + """Equivalent to the copy_corners_y functions in fortran. + + This is written to operate on plain ndarrarys and not use the GT4Py framework. + This choice was made because we've seen a lot of performance left on the table using + orchestration without explicitly describing the operations but rather have full 3d- + sweeps with conditionals. + Since DaCe can handle (simple) operations on ndarrays directly this gives us a more + explicit entrypoint to the language and more optimization-potential. + + Args: + field_to_copy (ndarray): field to apply the corner copy on. 
+ This is explicitly not type-hinted for orchestration + """ + field_to_copy[0, 0] = field_to_copy[5, 0] + field_to_copy[1, 0] = field_to_copy[5, 1] + field_to_copy[2, 0] = field_to_copy[5, 2] + + field_to_copy[0, 1] = field_to_copy[4, 0] + field_to_copy[1, 1] = field_to_copy[4, 1] + field_to_copy[2, 1] = field_to_copy[4, 2] + + field_to_copy[0, 2] = field_to_copy[3, 0] + field_to_copy[1, 2] = field_to_copy[3, 1] + field_to_copy[2, 2] = field_to_copy[3, 2] + + field_to_copy[-4, 0] = field_to_copy[-7, 2] + field_to_copy[-3, 0] = field_to_copy[-7, 1] + field_to_copy[-2, 0] = field_to_copy[-7, 0] + + field_to_copy[-4, 1] = field_to_copy[-6, 2] + field_to_copy[-3, 1] = field_to_copy[-6, 1] + field_to_copy[-2, 1] = field_to_copy[-6, 0] + + field_to_copy[-4, 2] = field_to_copy[-5, 2] + field_to_copy[-3, 2] = field_to_copy[-5, 1] + field_to_copy[-2, 2] = field_to_copy[-5, 0] + + field_to_copy[0, -2] = field_to_copy[5, -2] + field_to_copy[0, -3] = field_to_copy[4, -2] + field_to_copy[0, -4] = field_to_copy[3, -2] + + field_to_copy[1, -2] = field_to_copy[5, -3] + field_to_copy[1, -3] = field_to_copy[4, -3] + field_to_copy[1, -4] = field_to_copy[3, -3] + + field_to_copy[2, -2] = field_to_copy[5, -4] + field_to_copy[2, -3] = field_to_copy[4, -4] + field_to_copy[2, -4] = field_to_copy[3, -4] + + field_to_copy[-2, -4] = field_to_copy[-5, -2] + field_to_copy[-2, -3] = field_to_copy[-6, -2] + field_to_copy[-2, -2] = field_to_copy[-7, -2] + + field_to_copy[-3, -4] = field_to_copy[-5, -3] + field_to_copy[-3, -3] = field_to_copy[-6, -3] + field_to_copy[-3, -2] = field_to_copy[-7, -3] + + field_to_copy[-4, -4] = field_to_copy[-5, -4] + field_to_copy[-4, -3] = field_to_copy[-6, -4] + field_to_copy[-4, -2] = field_to_copy[-7, -4] + + +class CopyCornersX: + """ + Helper-class to copy corners corresponding to the fortran function copy_corners_x + """ + + def __init__(self, stencil_factory: StencilFactory) -> None: + orchestrate( + obj=self, + config=stencil_factory.config.dace_config, + ) + + if stencil_factory.grid_indexing.n_halo != 3: + raise NotImplementedError( + "Corner-Copy only implemented for exactly 3 Halo-Points" + ) + + def __call__(self, field: FloatField): + corner_copy_x(field) + + +class CopyCornersY: + """ + Helper-class to copy corners corresponding to the fortran function + copy_corners_y + """ + + def __init__(self, stencil_factory: StencilFactory) -> None: + orchestrate( + obj=self, + config=stencil_factory.config.dace_config, + ) + + if stencil_factory.grid_indexing.n_halo != 3: + raise NotImplementedError( + "Corner-Copy only implemented for exactly 3 Halo-Points" + ) + + def __call__(self, field: FloatField): + corner_copy_y(field) diff --git a/pyFV3/stencils/d2a2c_vect.py b/pyFV3/stencils/d2a2c_vect.py index 1d622082..6436fdf9 100644 --- a/pyFV3/stencils/d2a2c_vect.py +++ b/pyFV3/stencils/d2a2c_vect.py @@ -3,15 +3,15 @@ from ndsl import QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, Y_DIM, Z_DIM -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ +from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, get_precision from ndsl.grid import GridData from ndsl.stencils import corners from pyFV3.stencils.a2b_ord4 import a1, a2, lagrange_x_func, lagrange_y_func -c1 = -2.0 / 14.0 -c2 = 11.0 / 14.0 -c3 = 5.0 / 14.0 +c1 = Float(-2.0) / Float(14.0) +c2 = Float(11.0) / Float(14.0) +c3 = Float(5.0) / Float(14.0) OFFSET = 2 @@ -140,7 +140,7 @@ def east_west_edges( uc = utc * sin_sg3[-1, 0] if utc > 0 else utc * sin_sg1 with horizontal(region[i_end + 2, 
local_js - 1 : local_je + 2]): - uc = vol_conserv_cubic_interp_func_x_rev(utmp) + uc = vol_conserv_cubic_interp_func_x_rev_2(utmp) with horizontal(region[i_end, local_js - 1 : local_je + 2]): utc = contravariant(uc, v, cosa_u, rsin_u) @@ -300,6 +300,13 @@ def vol_conserv_cubic_interp_func_x_rev(u): return c1 * u[1, 0, 0] + c2 * u + c3 * u[-1, 0, 0] +@gtscript.function +def vol_conserv_cubic_interp_func_x_rev_2(u): + """Series order is reversed compared to original + vol_conserv_cubic_interp_func_x_rev to match Fortran""" + return c3 * u[-1, 0, 0] + c2 * u + c1 * u[1, 0, 0] + + @gtscript.function def vol_conserv_cubic_interp_func_y(v): return c1 * v[0, -2, 0] + c2 * v[0, -1, 0] + c3 * v @@ -361,18 +368,20 @@ def vc_y_edge1( def edge_interpolate4_x(ua, dxa): t1 = dxa[-2, 0] + dxa[-1, 0] t2 = dxa[0, 0] + dxa[1, 0] - n1 = (t1 + dxa[-1, 0]) * ua[-1, 0, 0] - dxa[-1, 0] * ua[-2, 0, 0] - n2 = (t1 + dxa[0, 0]) * ua[0, 0, 0] - dxa[0, 0] * ua[1, 0, 0] - return 0.5 * (n1 / t1 + n2 / t2) + return 0.5 * ( + ((t1 + dxa[-1, 0]) * ua[-1, 0, 0] - dxa[-1, 0] * ua[-2, 0, 0]) / t1 + + ((t1 + dxa[0, 0]) * ua[0, 0, 0] - dxa[0, 0] * ua[1, 0, 0]) / t2 + ) @gtscript.function def edge_interpolate4_y(va, dya): t1 = dya[0, -2] + dya[0, -1] t2 = dya[0, 0] + dya[0, 1] - n1 = (t1 + dya[0, -1]) * va[0, -1, 0] - dya[0, -1] * va[0, -2, 0] - n2 = (t1 + dya[0, 0]) * va[0, 0, 0] - dya[0, 0] * va[0, 1, 0] - return 0.5 * (n1 / t1 + n2 / t2) + return 0.5 * ( + ((t1 + dya[0, -1]) * va[0, -1, 0] - dya[0, -1] * va[0, -2, 0]) / t1 + + ((t1 + dya[0, 0]) * va[0, 0, 0] - dya[0, 0] * va[0, 1, 0]) / t2 + ) class DGrid2AGrid2CGridVectors: @@ -409,7 +418,7 @@ def __init__( self._sin_sg4 = grid_data.sin_sg4 self._grid_type = grid_type - self._big_number = 1e30 # 1e8 if 32 bit + self._big_number = Float(1e30) if get_precision() == 64 else Float(1e8) nx = grid_indexing.iec + 1 # grid.npx + 2 ny = grid_indexing.jec + 1 # grid.npy + 2 i1 = grid_indexing.isc - 1 @@ -640,15 +649,6 @@ def __call__(self, uc, vc, u, v, ua, va, utc, vtc): va, ) - self._ut_main( - self._utmp, - uc, - v, - self._cosa_u, - self._rsin_u, - utc, - ) - if self._grid_type < 3: self._east_west_edges( u, @@ -664,6 +664,15 @@ def __call__(self, uc, vc, u, v, ua, va, utc, vtc): self._dxa, ) + self._ut_main( + self._utmp, + uc, + v, + self._cosa_u, + self._rsin_u, + utc, + ) + # Ydir: self._fill_corners_y( self._utmp, diff --git a/pyFV3/stencils/d_sw.py b/pyFV3/stencils/d_sw.py index 28940049..24f73591 100644 --- a/pyFV3/stencils/d_sw.py +++ b/pyFV3/stencils/d_sw.py @@ -8,15 +8,16 @@ horizontal, interval, region, + I, + J, ) from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, FloatFieldK +from ndsl.dsl.typing import Float, FloatField, FloatField64, FloatFieldIJ, FloatFieldK from ndsl.grid import DampingCoefficients, GridData from pyFV3._config import DGridShallowWaterLagrangianDynamicsConfig from pyFV3.stencils import delnflux -from pyFV3.stencils.d2a2c_vect import contravariant from pyFV3.stencils.delnflux import DelnFluxNoSG from pyFV3.stencils.divergence_damping import DivergenceDamping from pyFV3.stencils.fvtp2d import FiniteVolumeTransport @@ -26,14 +27,14 @@ from pyFV3.version import IS_GEOS -dcon_threshold = 1e-5 +dcon_threshold = Float(1e-5) def flux_capacitor( - cx: FloatField, - cy: FloatField, - xflux: FloatField, - yflux: FloatField, + cx: FloatField64, + cy: FloatField64, + xflux: FloatField64, + yflux: 
FloatField64, crx_adv: FloatField, cry_adv: FloatField, fx: FloatField, @@ -92,6 +93,8 @@ def heat_diss( damp_w (in): ke_bg (in): """ + from __externals__ import do_stochastic_ke_backscatter + with computation(PARALLEL), interval(...): heat_source = 0.0 diss_est = 0.0 @@ -99,7 +102,8 @@ def heat_diss( dd8 = ke_bg * abs(dt) dw = (fx2 - fx2[1, 0, 0] + fy2 - fy2[0, 1, 0]) * rarea heat_source = dd8 - dw * (w + 0.5 * dw) - diss_est = heat_source + if __INLINED(do_stochastic_ke_backscatter): + diss_est = heat_source @gtscript.function @@ -200,6 +204,16 @@ def apply_pt_delp_fluxes_stencil_defn( pt, delp = apply_pt_delp_fluxes(gx, gy, rarea, fx, fy, pt, delp) +def delp_increment_accumulation( + dpx: FloatField64, + fx: FloatField, + fy: FloatField, + rarea: FloatFieldIJ, +): + with computation(PARALLEL), interval(...): + dpx = dpx + ((fx - fx[1, 0, 0]) + (fy - fy[0, 1, 0])) * rarea + + def compute_kinetic_energy( vc: FloatField, uc: FloatField, @@ -241,20 +255,22 @@ def compute_kinetic_energy( from __externals__ import grid_type with computation(PARALLEL), interval(...): + dt4 = 0.25 * dt + dt5 = 0.5 * dt if __INLINED(grid_type < 3): ub_contra, vb_contra = interpolate_uc_vc_to_cell_corners( - uc, vc, cosa, rsina, uc_contra, vc_contra + uc, vc, cosa, rsina, uc_contra, vc_contra, dt4, dt5 ) else: - ub_contra = 0.5 * (uc[0, -1, 0] + uc) - vb_contra = 0.5 * (vc[-1, 0, 0] + vc) + ub_contra = dt5 * (uc[0, -1, 0] + uc) + vb_contra = dt5 * (vc[-1, 0, 0] + vc) advected_v = advect_v_along_y(v, vb_contra, rdy=rdy, dy=dy, dya=dya, dt=dt) advected_u = advect_u_along_x(u, ub_contra, rdx=rdx, dx=dx, dxa=dxa, dt=dt) # makes sure the kinetic energy part of the governing equation is computed # the same way as the vorticity flux part (in terms of time splitting) # to avoid a Hollingsworth-Kallberg instability - dt_kinetic_energy_on_cell_corners = ( - 0.5 * dt * (ub_contra * advected_u + vb_contra * advected_v) + dt_kinetic_energy_on_cell_corners = 0.5 * ( + ub_contra * advected_u + vb_contra * advected_v ) dt_kinetic_energy_on_cell_corners = all_corners_ke( dt_kinetic_energy_on_cell_corners, u, v, uc_contra, vc_contra, dt @@ -326,11 +342,9 @@ def compute_vorticity( # cell-mean vorticity is equal to the circulation around the gridcell # divided by the area of the gridcell. It isn't exactly true that # area = dx * dy, so the form below is necessary to get an exact result. - rdy_tmp = rarea * dx - rdx_tmp = rarea * dy - vorticity = (u - u[0, 1, 0] * dx[0, 1] / dx) * rdy_tmp + ( - v[1, 0, 0] * dy[1, 0] / dy - v - ) * rdx_tmp + ut = v * dy + vt = u * dx + vorticity = rarea * (vt - vt[J + 1] - ut + ut[I + 1]) def adjust_w_and_qcon( @@ -373,9 +387,7 @@ def vort_differencing( from __externals__ import local_ie, local_is, local_je, local_js with computation(PARALLEL), interval(...): - # TODO: this should likely be dcon[k] rather than dcon[0] so that this - # can be turned on and off per-layer - if dcon[0] > dcon_threshold: + if dcon > dcon_threshold: # Creating a gtscript function for the ub/vb computation # results in an "NotImplementedError" error for Jenkins # Inlining the ub/vb computation in this stencil resolves the Jenkins error @@ -537,7 +549,6 @@ def heat_source_from_vorticity_damping( to explicitly damp and convert into heat. 
""" from __externals__ import ( # noqa (see below) - d_con, do_stochastic_ke_backscatter, local_ie, local_is, @@ -631,24 +642,15 @@ def set_low_kvals(col: Mapping[str, Quantity], k): # For the column namelist at a specific k-level # set the vorticity parameters if do_vort_damp is true -def vorticity_damping_option_FV3GFS(column, k, do_vort_damp): +def vorticity_damping_option(column, k, do_vort_damp): if do_vort_damp: column["nord_v"].view[k] = 0 column["damp_vt"].view[k] = 0.5 * column["d2_divg"].view[k] -def vorticity_damping_option_GEOS(column, k, do_vort_damp): - # GEOS does not set damp_vt - if do_vort_damp: - column["nord_v"].view[k] = 0 - - def lowest_kvals(column, k, do_vort_damp): set_low_kvals(column, k) - if IS_GEOS: - vorticity_damping_option_GEOS(column, k, do_vort_damp) - else: - vorticity_damping_option_FV3GFS(column, k, do_vort_damp) + vorticity_damping_option(column, k, do_vort_damp) def get_column_namelist( @@ -741,42 +743,39 @@ def get_column_namelist( @gtscript.function def interpolate_uc_vc_to_cell_corners( - uc_cov, vc_cov, cosa, rsina, uc_contra, vc_contra + uc_cov, vc_cov, cosa, rsina, uc_contra, vc_contra, dt4, dt5 ): """ Convert covariant C-grid winds to contravariant B-grid (cell-corner) winds. """ from __externals__ import i_end, i_start, j_end, j_start - # In the original Fortran, this routine was given dt4 (0.25 * dt) - # and dt5 (0.5 * dt), and its outputs were wind times timestep. This has - # been refactored so the timestep is later explicitly multiplied, when - # the wind is integrated forward in time. - # TODO: ask Lucas why we interpolate then convert to contravariant in tile center, - # but convert to contravariant and then interpolate on tile edges. - ub_cov = 0.5 * (uc_cov[0, -1, 0] + uc_cov) - vb_cov = 0.5 * (vc_cov[-1, 0, 0] + vc_cov) - ub_contra = contravariant(ub_cov, vb_cov, cosa, rsina) - vb_contra = contravariant(vb_cov, ub_cov, cosa, rsina) - # ASSUME : if __INLINED(namelist.grid_type < 3): + # Orders matter because corners take the last edge computation values + + # Center domain + ub = dt5 * (uc_cov[J - 1] + uc_cov - (vc_cov[I - 1] + vc_cov) * cosa) * rsina + vb = dt5 * (vc_cov[I - 1] + vc_cov - (uc_cov[J - 1] + uc_cov) * cosa) * rsina + # UB - Orders matter because corners take the last edge computation values + # North/South edge with horizontal(region[:, j_start], region[:, j_end + 1]): - ub_contra = 0.25 * ( - -uc_contra[0, -2, 0] - + 3.0 * (uc_contra[0, -1, 0] + uc_contra) - - uc_contra[0, 1, 0] + ub = dt4 * ( + -uc_contra[J - 2] + 3.0 * (uc_contra[J - 1] + uc_contra) - uc_contra[J + 1] ) + # East/West with horizontal(region[i_start, :], region[i_end + 1, :]): - ub_contra = 0.5 * (uc_contra[0, -1, 0] + uc_contra) + ub = dt5 * (uc_contra[J - 1] + uc_contra) + + # VB - Orders matter because corners take the last edge computation values + # North/South edge with horizontal(region[i_start, :], region[i_end + 1, :]): - vb_contra = 0.25 * ( - -vc_contra[-2, 0, 0] - + 3.0 * (vc_contra[-1, 0, 0] + vc_contra) - - vc_contra[1, 0, 0] + vb = dt4 * ( + -vc_contra[I - 2] + 3.0 * (vc_contra[I - 1] + vc_contra) - vc_contra[I + 1] ) + # East/West with horizontal(region[:, j_start], region[:, j_end + 1]): - vb_contra = 0.5 * (vc_contra[-1, 0, 0] + vc_contra) + vb = dt5 * (vc_contra[I - 1] + vc_contra) - return ub_contra, vb_contra + return ub, vb class DGridShallowWaterLagrangianDynamics: @@ -993,11 +992,14 @@ def make_quantity(): self._heat_diss_stencil = stencil_factory.from_dims_halo( func=heat_diss, compute_dims=[X_DIM, Y_DIM, Z_DIM], + externals={ + 
"do_stochastic_ke_backscatter": config.do_skeb, + }, ) self._heat_source_from_vorticity_damping_stencil = ( stencil_factory.from_dims_halo( func=heat_source_from_vorticity_damping, - compute_dims=[X_INTERFACE_DIM, Y_INTERFACE_DIM, Z_DIM], + compute_dims=[X_DIM, Y_DIM, Z_DIM], externals={ "do_stochastic_ke_backscatter": config.do_skeb, "d_con": config.d_con, @@ -1031,33 +1033,38 @@ def make_quantity(): da_min=damping_coefficients.da_min_c, nord=self._column_namelist["nord_w"], ) + self._accumulate_delp = stencil_factory.from_dims_halo( + func=delp_increment_accumulation, + compute_dims=[X_DIM, Y_DIM, Z_DIM], + ) def __call__( self, - delpc, - delp, - pt, - u, - v, - w, - uc, - vc, - ua, - va, - divgd, - mfx, - mfy, - cx, - cy, - crx, - cry, - xfx, - yfx, - q_con, - zh, - heat_source, - diss_est, - dt, + delpc: FloatField, + delp: FloatField, + pt: FloatField, + u: FloatField, + v: FloatField, + w: FloatField, + uc: FloatField, + vc: FloatField, + ua: FloatField, + va: FloatField, + divgd: FloatField, + mfx: FloatField64, + mfy: FloatField64, + cx: FloatField64, + cy: FloatField64, + dpx: FloatField64, + crx: FloatField, + cry: FloatField, + xfx: FloatField, + yfx: FloatField, + q_con: FloatField, + zh: FloatField, + heat_source: FloatField, + diss_est: FloatField, + dt: Float, ): """ D-Grid shallow water routine, peforms a full-timestep advance @@ -1085,6 +1092,7 @@ def __call__( mfy (inout): accumulated y mass flux cx (inout): accumulated Courant number in the x direction cy (inout): accumulated Courant number in the y direction + dpx (inout): accumulated delp export for Dry Mass Roundoff Control crx (out): local courant number in the x direction cry (out): local courant number in the y direction xfx (out): flux of area in x-direction, in units of m^2 @@ -1223,6 +1231,13 @@ def __call__( self._adjust_w_and_qcon_stencil( w, delp, self._tmp_dw, q_con, self._column_namelist["damp_w"] ) + + self._accumulate_delp( + dpx=dpx, + fx=self._tmp_fx, + fy=self._tmp_fy, + rarea=self.grid_data.rarea, + ) # at this point, pt, delp, w and q_con have been stepped forward in time # the rest of this function updates the winds self._compute_kinetic_energy( diff --git a/pyFV3/stencils/del2cubed.py b/pyFV3/stencils/del2cubed.py index 24e1976e..6a887469 100644 --- a/pyFV3/stencils/del2cubed.py +++ b/pyFV3/stencils/del2cubed.py @@ -1,3 +1,4 @@ +import numpy as np from gt4py.cartesian.gtscript import PARALLEL, computation, horizontal, interval, region import ndsl.stencils.corners as corners @@ -68,7 +69,7 @@ def corner_fill(q_in: FloatField, q_out: FloatField): # Q update stencil # ------------------ def update_q( - q: FloatField, rarea: FloatFieldIJ, fx: FloatField, fy: FloatField, cd: Float + q: FloatField, rarea: FloatFieldIJ, fx: FloatField, fy: FloatField, cd: np.float64 ): with computation(PARALLEL), interval(...): q += cd * rarea * (fx - fx[1, 0, 0] + fy - fy[0, 1, 0]) @@ -169,7 +170,7 @@ def __init__( update_q, origins, domains, stencil_factory=stencil_factory ) - def __call__(self, qdel: FloatField, cd: Float): + def __call__(self, qdel: FloatField, cd: np.float64): """ Perform hyperdiffusion damping/filtering. 
diff --git a/pyFV3/stencils/delnflux.py b/pyFV3/stencils/delnflux.py index 58607d6a..8b89f8e4 100644 --- a/pyFV3/stencils/delnflux.py +++ b/pyFV3/stencils/delnflux.py @@ -1,6 +1,7 @@ from typing import Optional import gt4py.cartesian.gtscript as gtscript +import numpy as np from gt4py.cartesian.gtscript import PARALLEL, computation, horizontal, interval, region from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate @@ -16,7 +17,12 @@ def calc_damp(damp_c: Quantity, da_min: Float, nord: Quantity) -> Quantity: "current implementation requires damp_c and nord to have " "identical data shape and dims" ) - data = (damp_c.data * da_min) ** (nord.data + 1) + # `da_min` is a 64 bit float and we have to cast the array to deal + # with downcasting behavior of array * scalar in numpy + # We then reproduce the proper casting so `calc_damp` is a 32-bit float + data = np.power( + (damp_c.data.astype(np.float64) * da_min), (nord.data + 1), dtype=np.float64 + ).astype(Float) return Quantity( data=data, dims=damp_c.dims, @@ -97,7 +103,7 @@ def fx_calculation(q: FloatField, del6_v: FloatField): @gtscript.function def fx_calculation_neg(q: FloatField, del6_v: FloatField): - return -del6_v * (q[-1, 0, 0] - q) + return del6_v * (q - q[-1, 0, 0]) @gtscript.function @@ -107,7 +113,7 @@ def fy_calculation(q: FloatField, del6_u: FloatField): @gtscript.function def fy_calculation_neg(q: FloatField, del6_u: FloatField): - return -del6_u * (q[0, -1, 0] - q) + return del6_u * (q - q[0, -1, 0]) def d2_highorder_stencil( @@ -120,7 +126,7 @@ def d2_highorder_stencil( ): with computation(PARALLEL), interval(...): if nord > current_nord: - d2 = ((fx - fx[1, 0, 0]) + (fy - fy[0, 1, 0])) * rarea + d2 = (fx - fx[1, 0, 0] + fy - fy[0, 1, 0]) * rarea def d2_damp_interval( @@ -178,8 +184,8 @@ def diffusive_damp( damp: FloatFieldK, ): with computation(PARALLEL), interval(...): - fx = fx + 0.5 * damp * (mass[-1, 0, 0] + mass) * fx2 - fy = fy + 0.5 * damp * (mass[0, -1, 0] + mass) * fy2 + fx = fx + (0.5 * damp) * (mass[-1, 0, 0] + mass) * fx2 + fy = fy + (0.5 * damp) * (mass[0, -1, 0] + mass) * fy2 def copy_corners_y_nord( @@ -423,7 +429,11 @@ def __init__( ) self.delnflux_nosg = DelnFluxNoSG( - stencil_factory, damping_coefficients, rarea, nord_col, nk=nk + stencil_factory, + damping_coefficients, + rarea, + nord_col, + nk=nk, ) def __call__( @@ -437,11 +447,11 @@ def __call__( """ Del-n damping for fluxes, where n = 2 * nord + 2 Args: - q: Field for which to calculate damped fluxes (in) - fx: x-flux on A-grid (inout) - fy: y-flux on A-grid (inout) - d2: A damped copy of the q field (in) - mass: Mass to weight the diffusive flux by (in) + q (in): Field for which to calculate damped fluxes + fx (inout): x-flux on A-grid + fy (inout): y-flux on A-grid + d2 (in): A damped copy of the q field + mass (in): Mass to weight the diffusive flux by """ if self._no_compute is True: return fx, fy @@ -611,14 +621,12 @@ def __init__( externals={**corner_axis_offsets}, origin=corner_origin, domain=corner_domain, - skip_passes=("UnreachableStmtPruning",), ) self._copy_corners_y_nord = stencil_factory.from_origin_domain( copy_corners_y_nord, externals={**corner_axis_offsets}, origin=corner_origin, domain=corner_domain, - skip_passes=("UnreachableStmtPruning",), ) def __call__(self, q, fx2, fy2, damp_c, d2, mass=None): @@ -639,17 +647,46 @@ def __call__(self, q, fx2, fy2, damp_c, d2, mass=None): """ if mass is None: - self._d2_damp(q=q, d2=d2, damp=damp_c, nord=self._nord) + self._d2_damp( + q=q, + d2=d2, + damp=damp_c, + 
nord=self._nord, + ) else: - self._copy_stencil_interval(q_in=q, q_out=d2, nord=self._nord) + self._copy_stencil_interval( + q_in=q, + q_out=d2, + nord=self._nord, + ) - self._copy_corners_x_nord(q_in=d2, q_out=d2, nord=self._nord, current_nord=0) + self._copy_corners_x_nord( + q_in=d2, + q_out=d2, + nord=self._nord, + current_nord=0, + ) - self._fx_calc_stencil(q=d2, del6_v=self._del6_v, fx=fx2, nord=self._nord) + self._fx_calc_stencil( + q=d2, + del6_v=self._del6_v, + fx=fx2, + nord=self._nord, + ) - self._copy_corners_y_nord(q_in=d2, q_out=d2, nord=self._nord, current_nord=0) + self._copy_corners_y_nord( + q_in=d2, + q_out=d2, + nord=self._nord, + current_nord=0, + ) - self._fy_calc_stencil(q=d2, del6_u=self._del6_u, fy=fy2, nord=self._nord) + self._fy_calc_stencil( + q=d2, + del6_u=self._del6_u, + fy=fy2, + nord=self._nord, + ) for n in range(self._nmax): self._d2_stencil[n]( @@ -662,17 +699,31 @@ def __call__(self, q, fx2, fy2, damp_c, d2, mass=None): ) self._copy_corners_x_nord( - q_in=d2, q_out=d2, nord=self._nord, current_nord=n + q_in=d2, + q_out=d2, + nord=self._nord, + current_nord=n, ) self._column_conditional_fx_calculation[n]( - q=d2, del6_v=self._del6_v, fx=fx2, nord=self._nord, current_nord=n + q=d2, + del6_v=self._del6_v, + fx=fx2, + nord=self._nord, + current_nord=n, ) self._copy_corners_y_nord( - q_in=d2, q_out=d2, nord=self._nord, current_nord=n + q_in=d2, + q_out=d2, + nord=self._nord, + current_nord=n, ) self._column_conditional_fy_calculation[n]( - q=d2, del6_u=self._del6_u, fy=fy2, nord=self._nord, current_nord=n + q=d2, + del6_u=self._del6_u, + fy=fy2, + nord=self._nord, + current_nord=n, ) diff --git a/pyFV3/stencils/divergence_damping.py b/pyFV3/stencils/divergence_damping.py index 46a949ce..480ff85f 100644 --- a/pyFV3/stencils/divergence_damping.py +++ b/pyFV3/stencils/divergence_damping.py @@ -1,8 +1,10 @@ import gt4py.cartesian.gtscript as gtscript +import numpy as np from gt4py.cartesian.gtscript import ( __INLINED, PARALLEL, computation, + float32, horizontal, interval, region, @@ -23,8 +25,13 @@ @gtscript.function def damp_tmp(q, da_min_c, d2_bg, dddmp): - mintmp = min(0.2, dddmp * abs(q)) - damp = da_min_c * max(d2_bg, mintmp) + damp: float32 = da_min_c * max(d2_bg, min(0.2, dddmp * abs(q))) + return damp + + +@gtscript.function +def damp_tmp2(q, da_min_c, d2_bg, dddmp): + damp: float32 = da_min_c * max(d2_bg, min(0.2, dddmp * q)) return damp @@ -51,6 +58,21 @@ def compute_u_contra_dyc( sin_sg2 (in): sin_sg4 (in): u_contra_dyc (out): contravariant u-wind on d-grid + + Porting Notes + * The compute_u_contra_dyc and compute_v_contra_dxc functions have + the dyc and dxc values incorporated earlier in the calcuation rather than later, + and this enables the u_contra_dyc and v_contra_dxc values + to match with the Fortran. + As a result, the delpc computation matches the Fortran value of delpc. + + Ex : Previous implementation of compute_u_contra_dyc + ================================================================= + u_contra = contravariant(u, vc_from_va, cosa_v, sina_v) + with horizontal(region[:, j_start], region[:, j_end + 1]): + u_contra = u * sin_sg4[0, -1] if vc > 0 else u * sin_sg2 + u_contra_dyc = u_contra * dyc + ================================================================= """ from __externals__ import j_end, j_start @@ -58,10 +80,10 @@ def compute_u_contra_dyc( # TODO: why does vc_from_va sometimes have different sign than vc? vc_from_va = 0.5 * (va[0, -1, 0] + va) # TODO: why do we use vc_from_va and not just vc? 
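The porting note above comes down to floating-point associativity: folding dyc into the contravariant expression instead of applying it afterwards is algebraically identical, but the two orderings can round differently, and the ported ordering is the one that reproduces the Fortran. A toy sketch, assuming contravariant(a, b, cosa, w) expands to (a - b * cosa) * w and using arbitrary numbers:

```python
import numpy as np

f32 = np.float32
u, v = f32(1.7), f32(0.3)
cosa, sina, dyc = f32(0.1234567), f32(0.9876543), f32(3333.33)

late_dyc = ((u - v * cosa) * sina) * dyc   # previous ordering: multiply by dyc last
early_dyc = ((u - v * cosa) * dyc) * sina  # ported ordering: fold dyc in first

print(late_dyc == early_dyc)               # not guaranteed to be True in 32-bit arithmetic
print(float(late_dyc) - float(early_dyc))  # any difference is at rounding level
```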
- u_contra = contravariant(u, vc_from_va, cosa_v, sina_v) + u_contra_dyc = contravariant(u, vc_from_va, cosa_v, dyc) + u_contra_dyc = u_contra_dyc * sina_v with horizontal(region[:, j_start], region[:, j_end + 1]): - u_contra = u * sin_sg4[0, -1] if vc > 0 else u * sin_sg2 - u_contra_dyc = u_contra * dyc + u_contra_dyc = u * dyc * sin_sg4[0, -1] if vc > 0 else u * dyc * sin_sg2 def compute_v_contra_dxc( @@ -86,6 +108,20 @@ def compute_v_contra_dxc( uc (in): sin_sg3 (in): sin_sg1 (in): + + Porting Notes + * The compute_u_contra_dyc and compute_v_contra_dxc functions + have the dyc and dxc values incorporated earlier in the calcuation + rather than later, and this enables the u_contra_dyc and v_contra_dxc + values to match with the Fortran. As a result, the delpc computation + matches the Fortran value of delpc. + + Ex : Previous implementation of compute_v_contra_dxc + ================================================================= + v_contra = contravariant(v, uc_from_ua, cosa_u, sina_u) + with horizontal(region[i_start, :], region[i_end + 1, :]): + v_contra = v * sin_sg3[-1, 0] if uc > 0 else v * sin_sg1 + v_contra_dxc = v_contra * dxc """ from __externals__ import i_end, i_start @@ -93,10 +129,10 @@ def compute_v_contra_dxc( # TODO: why does uc_from_ua sometimes have different sign than uc? uc_from_ua = 0.5 * (ua[-1, 0, 0] + ua) # TODO: why do we use uc_from_ua and not just uc? - v_contra = contravariant(v, uc_from_ua, cosa_u, sina_u) + v_contra_dxc = contravariant(v, uc_from_ua, cosa_u, dxc) + v_contra_dxc = v_contra_dxc * sina_u with horizontal(region[i_start, :], region[i_end + 1, :]): - v_contra = v * sin_sg3[-1, 0] if uc > 0 else v * sin_sg1 - v_contra_dxc = v_contra * dxc + v_contra_dxc = v * dxc * sin_sg3[-1, 0] if uc > 0 else v * dxc * sin_sg1 def delpc_computation( @@ -141,7 +177,7 @@ def damping( vort: FloatField, ke: FloatField, d2_bg: FloatFieldK, - da_min_c: Float, + da_min_c: np.float64, dddmp: Float, dt: Float, ): @@ -165,7 +201,7 @@ def damping_nord_highorder_stencil( delpc: FloatField, divg_d: FloatField, d2_bg: FloatFieldK, - da_min_c: Float, + da_min_c: np.float64, dddmp: Float, dd8: Float, ): @@ -181,7 +217,7 @@ def damping_nord_highorder_stencil( """ # TODO: propagate variable renaming into this routine with computation(PARALLEL), interval(...): - damp = damp_tmp(vort, da_min_c, d2_bg, dddmp) + damp = damp_tmp2(vort, da_min_c, d2_bg, dddmp) vort = damp * delpc + dd8 * divg_d ke = ke + vort @@ -249,7 +285,7 @@ def smagorinsky_diffusion_approx(delpc: FloatField, vort: FloatField, absdt: Flo absdt (in): abs(dt) """ with computation(PARALLEL), interval(...): - vort = absdt * (delpc ** 2.0 + vort ** 2.0) ** 0.5 + vort = absdt * sqrt(delpc ** 2 + vort ** 2) def smag_corner( @@ -309,7 +345,7 @@ def __init__( damping_coefficients: DampingCoefficients, nested: bool, stretched_grid: bool, - dddmp, + dddmp: Float, d4_bg, nord: int, grid_type, @@ -324,11 +360,11 @@ def __init__( if nested: raise NotImplementedError("Divergence Dampoing: nested not implemented.") # TODO: make dddmp a compile-time external, instead of runtime scalar - self._dddmp = dddmp + self._dddmp = Float(dddmp) # TODO: make da_min_c a compile-time external, instead of runtime scalar self._damping_coefficients = damping_coefficients self._stretched_grid = stretched_grid - self._d4_bg = d4_bg + self._d4_bg = Float(d4_bg) self._grid_type = grid_type self._nord_column = nord_col self._d2_bg_column = d2_bg @@ -465,7 +501,6 @@ def __init__( func=corners.fill_corners_dgrid_defn, compute_dims=[X_INTERFACE_DIM, 
Y_INTERFACE_DIM, Z_DIM], compute_halos=(self.grid_indexing.n_halo, self.grid_indexing.n_halo), - skip_passes=("UnreachableStmtPruning",), ) self._redo_divg_d_stencils = get_stencils_with_varied_bounds( @@ -539,11 +574,11 @@ def __init__( # odd and adds a lot of boilerplate throughout the model code. @dace_inhibitor - def _get_da_min_c(self) -> float: + def _get_da_min_c(self) -> np.float64: return self._damping_coefficients.da_min_c @dace_inhibitor - def _get_da_min(self) -> float: + def _get_da_min(self) -> np.float64: return self._damping_coefficients.da_min def __call__( @@ -632,7 +667,7 @@ def __call__( self.v_contra_dxc, ) - da_min_c: Float = self._get_da_min_c() + da_min_c = self._damping_coefficients.da_min_c self._damping( delpc, damped_rel_vort_bgrid, @@ -694,12 +729,14 @@ def __call__( abs(dt), ) - da_min: Float = self._get_da_min() + da_min = self._damping_coefficients.da_min if self._stretched_grid: # reference https://github.com/NOAA-GFDL/GFDL_atmos_cubed_sphere/blob/main/model/sw_core.F90#L1422 # noqa: E501 - dd8 = da_min * self._d4_bg ** (self._nonzero_nord + 1) + dd8 = Float(da_min * np.power(self._d4_bg, (self._nonzero_nord + 1))) else: - dd8 = (da_min_c * self._d4_bg) ** (self._nonzero_nord + 1) + dd8 = np.power((da_min_c * self._d4_bg), (self._nonzero_nord + 1)).astype( + Float + ) self._damping_nord_highorder_stencil( damped_rel_vort_bgrid, diff --git a/pyFV3/stencils/dyn_core.py b/pyFV3/stencils/dyn_core.py index 5de8a5c9..a3266cdd 100644 --- a/pyFV3/stencils/dyn_core.py +++ b/pyFV3/stencils/dyn_core.py @@ -1,4 +1,4 @@ -from typing import Dict, Mapping, Optional +from typing import Dict, Mapping import numpy as np from dace.frontend.python.interface import nounroll as dace_nounroll @@ -30,7 +30,6 @@ WrappedHaloUpdater, orchestrate, ) -from ndsl.checkpointer import NullCheckpointer from ndsl.constants import ( X_DIM, X_INTERFACE_DIM, @@ -40,9 +39,9 @@ Z_INTERFACE_DIM, ) from ndsl.dsl.dace.orchestration import dace_inhibitor -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ +from ndsl.dsl.typing import Float, FloatField, FloatField64, FloatFieldIJ from ndsl.grid import DampingCoefficients, GridData -from ndsl.typing import Checkpointer, Communicator +from ndsl.typing import Communicator from pyFV3._config import AcousticDynamicsConfig from pyFV3.dycore_state import DycoreState from pyFV3.stencils.c_sw import CGridShallowWaterDynamics @@ -59,10 +58,10 @@ def zero_data( - mfxd: FloatField, - mfyd: FloatField, - cxd: FloatField, - cyd: FloatField, + mfxd: FloatField64, + mfyd: FloatField64, + cxd: FloatField64, + cyd: FloatField64, heat_source: FloatField, diss_estd: FloatField, first_timestep: bool, @@ -127,11 +126,9 @@ def compute_geopotential(zh: FloatField, gz: FloatField): gz = zh * constants.GRAV -def p_grad_c_stencil( +def p_grad_c_stencil_x( rdxc: FloatFieldIJ, - rdyc: FloatFieldIJ, uc: FloatField, - vc: FloatField, delpc: FloatField, pkc: FloatField, gz: FloatField, @@ -148,11 +145,8 @@ def p_grad_c_stencil( Args: rdxc (in): - rdyc (in): uc (inout): x-velocity on the C-grid, has been updated due to advection but not yet due to pressure gradient force - vc (inout): y-velocity on the C-grid, has been updated due to advection - but not yet due to pressure gradient force delpc (in): vertical delta in pressure pkc (in): pressure if non-hydrostatic, (edge pressure)**(moist kappa) if hydrostatic @@ -175,6 +169,26 @@ def p_grad_c_stencil( + (gz[-1, 0, 0] - gz[0, 0, 1]) * (pkc[-1, 0, 1] - pkc) ) + +def p_grad_c_stencil_y( + rdyc: FloatFieldIJ, + vc: FloatField, + 
delpc: FloatField, + pkc: FloatField, + gz: FloatField, + dt2: Float, +): + """ + See p_grad_c_stencil_y + """ + from __externals__ import hydrostatic + + with computation(PARALLEL), interval(...): + if __INLINED(hydrostatic): + wk = pkc[0, 0, 1] - pkc + else: + wk = delpc + # wk is pressure gradient vc = vc + dt2 * rdyc / (wk[0, -1, 0] + wk) * ( (gz[0, -1, 1] - gz) * (pkc[0, 0, 1] - pkc[0, -1, 0]) + (gz[0, -1, 0] - gz[0, 0, 1]) * (pkc[0, -1, 1] - pkc) @@ -203,7 +217,18 @@ def dyncore_temporaries( quantity_factory: QuantityFactory, ) -> Mapping[str, Quantity]: temporaries: Dict[str, Quantity] = {} - for name in ["ut", "vt", "gz", "zh", "pem", "pkc", "pk3", "heat_source", "cappa"]: + for name in [ + "ut", + "vt", + "gz", + "zh", + "pem", + "pkc", + "pk3", + "heat_source", + "cappa", + "dpx", + ]: # TODO: the dimensions of ut and vt may not be correct, # because they are not used. double-check and correct as needed. temporaries[name] = quantity_factory.zeros( @@ -386,7 +411,6 @@ def __init__( phis: FloatFieldIJ, wsd: FloatFieldIJ, state, # [DaCe] hack to get around quantity as parameters for halo updates - checkpointer: Optional[Checkpointer] = None, ): """ Args: @@ -401,9 +425,6 @@ def __init__( config: configuration settings pfull: atmospheric Eulerian grid reference pressure (Pa) phis: surface geopotential height - checkpointer: if given, used to perform operations on model data - at specific points in model execution, such as testing against - reference data """ orchestrate( obj=self, @@ -411,38 +432,19 @@ def __init__( dace_compiletime_args=["state"], ) - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_csw", - dace_compiletime_args=["state", "tag"], - ) - - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_dsw_in", - dace_compiletime_args=["state", "tag"], - ) - - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_dsw_out", - dace_compiletime_args=["state", "tag"], - ) - - self.call_checkpointer = checkpointer is not None - if checkpointer is None: - self.checkpointer: Checkpointer = NullCheckpointer() - else: - self.checkpointer = checkpointer grid_indexing = stencil_factory.grid_indexing self.config = config if config.d_ext != 0: raise RuntimeError("Acoustics (dyn_core): d_ext != 0 is not implemented") if config.beta != 0: - raise RuntimeError("Acoustics (dyn_core): beta != 0 is not implemented") + raise RuntimeError( + "Acoustics (dyn_core): beta != 0 is not implemented" + " (split_p_grad, etc.)" + ) + if config.beta < -0.1: + raise RuntimeError( + "Acoustics (dyn_core): beta < 0.1 is not implemented (one_grad_p, etc.)" + ) if config.use_logp: raise RuntimeError("Acoustics (dyn_core): use_logp=True is not implemented") self._da_min = damping_coefficients.da_min @@ -481,9 +483,11 @@ def __init__( self._xfx = temporaries["xfx"] self._yfx = temporaries["yfx"] self._ws3 = temporaries["ws3"] + self._dpx = temporaries["dpx"] if not config.hydrostatic: self._pk3.data[:] = HUGE_R + self._gz.data[:] = HUGE_R column_namelist = d_sw.get_column_namelist( config.d_grid_shallow_water, quantity_factory=quantity_factory @@ -498,7 +502,7 @@ def __init__( dtype=Float, ) self._zs.data[:] = self._zs.np.asarray( - phis.data / constants.GRAV, dtype=self._zs.data.dtype + phis.data * constants.RGRAV, dtype=self._zs.data.dtype ) self.update_height_on_d_grid = updatedzd.UpdateHeightOnDGrid( @@ -509,6 +513,7 @@ def __init__( 
grid_type=grid_type, hord_tm=config.hord_tm, column_namelist=column_namelist, + dz_min=Float(config.dz_min), ) self.vertical_solver = NonhydrostaticVerticalSolver( stencil_factory, @@ -561,10 +566,16 @@ def __init__( ) ) - self._p_grad_c = stencil_factory.from_origin_domain( - p_grad_c_stencil, + self._p_grad_c_x = stencil_factory.from_origin_domain( + p_grad_c_stencil_x, origin=grid_indexing.origin_compute(), - domain=grid_indexing.domain_compute(add=(1, 1, 0)), + domain=grid_indexing.domain_compute(add=(1, 0, 0)), + externals={"hydrostatic": config.hydrostatic}, + ) + self._p_grad_c_y = stencil_factory.from_origin_domain( + p_grad_c_stencil_y, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 1, 0)), externals={"hydrostatic": config.hydrostatic}, ) @@ -575,13 +586,14 @@ def __init__( area=grid_data.area, dp_ref=grid_data.dp_ref, grid_type=config.grid_type, + dz_min=Float(config.dz_min), ) ) self._zero_data = stencil_factory.from_origin_domain( zero_data, origin=grid_indexing.origin_full(), - domain=grid_indexing.domain_full(), + domain=grid_indexing.domain_full(add=(1, 1, 0)), ) ax_offsets_pe = grid_indexing.axis_offsets( grid_indexing.origin_full(), @@ -642,87 +654,14 @@ def __init__( pkc=self._pkc, ) - # See divergence_damping.py, _get_da_min for explanation of this function - @dace_inhibitor - def _get_da_min(self) -> float: - return self._da_min - - def _checkpoint_csw(self, state: DycoreState, tag: str): - if self.call_checkpointer: - self.checkpointer( - f"C_SW-{tag}", - delpd=state.delp, - ptd=state.pt, - ud=state.u, - vd=state.v, - wd=state.w, - ucd=state.uc, - vcd=state.vc, - uad=state.ua, - vad=state.va, - utd=self._ut, - vtd=self._vt, - divgdd=self._divgd, - ) - - def _checkpoint_dsw_in(self, state: DycoreState): - if self.call_checkpointer: - self.checkpointer( - "D_SW-In", - ucd=state.uc, - vcd=state.vc, - wd=state.w, - # delpc is a temporary and not a variable in D_SW savepoint - delpcd=self._vt, - delpd=state.delp, - ud=state.u, - vd=state.v, - ptd=state.pt, - uad=state.ua, - vad=state.va, - zhd=self._zh, - divgdd=self._divgd, - xfxd=self._xfx, - yfxd=self._yfx, - mfxd=state.mfxd, - mfyd=state.mfyd, - ) - - def _checkpoint_dsw_out(self, state: DycoreState): - if self.call_checkpointer: - self.checkpointer( - "D_SW-Out", - ucd=state.uc, - vcd=state.vc, - wd=state.w, - delpcd=self._vt, - delpd=state.delp, - ud=state.u, - vd=state.v, - ptd=state.pt, - uad=state.ua, - vad=state.va, - divgdd=self._divgd, - xfxd=self._xfx, - yfxd=self._yfx, - mfxd=state.mfxd, - mfyd=state.mfyd, - ) - - # TODO: fix me - we shouldn't need a function here, Dace is fudging the types - # See https://github.com/GEOS-ESM/pace/issues/9 - @dace_inhibitor - def dt_acoustic_substep(self, timestep: Float) -> Float: - return timestep / self.config.n_split - - # TODO: Same as above - @dace_inhibitor - def dt2(self, dt_acoustic_substep: Float) -> Float: - return 0.5 * dt_acoustic_substep - def __call__( self, state: DycoreState, + mfxd, + mfyd, + cxd, + cyd, + dpx, timestep: Float, # time to step forward by in seconds n_map=1, # [DaCe] replaces state.n_map ): @@ -731,8 +670,8 @@ def __call__( # akap, ptop, n_map, comm): end_step = n_map == self.config.k_split # dt = state.mdt / self.config.n_split - dt_acoustic_substep: Float = self.dt_acoustic_substep(timestep) - dt2: Float = self.dt2(dt_acoustic_substep) + dt_acoustic_substep = Float(timestep / self.config.n_split) + dt2 = Float(0.5) * dt_acoustic_substep n_split = self.config.n_split # NOTE: In Fortran model the halo 
update starts happens in fv_dynamics, not here self._halo_updaters.q_con__cappa.start() @@ -741,10 +680,10 @@ def __call__( self._halo_updaters.q_con__cappa.wait() self._zero_data( - state.mfxd, - state.mfyd, - state.cxd, - state.cyd, + mfxd, + mfyd, + cxd, + cyd, self._heat_source, state.diss_estd, n_map == 1, @@ -793,7 +732,6 @@ def __call__( self._halo_updaters.w.wait() # compute the c-grid winds at t + 1/2 timestep - self._checkpoint_csw(state, tag="In") self.cgrid_shallow_water_lagrangian_dynamics( state.delp, state.pt, @@ -810,7 +748,6 @@ def __call__( state.omga, dt2, ) - self._checkpoint_csw(state, tag="Out") # TODO: Computing the pressure gradient outside of C_SW was originally done # so that we could transpose into a vertical-first memory ordering for the @@ -833,7 +770,12 @@ def __call__( ) if not self.config.hydrostatic: self.update_geopotential_height_on_c_grid( - self._zs, self._ut, self._vt, self._gz, self._ws3, dt2 + zs=self._zs, + ut=self._ut, + vt=self._vt, + gz=self._gz, + ws=self._ws3, + dt=dt2, ) # TODO (floriand): Due to DaCe VRAM pooling creating a memory # leak with the usage pattern of those two fields @@ -843,63 +785,69 @@ def __call__( # DaCe has already a fix on their side and it awaits release # issue self.vertical_solver_cgrid( - dt2, - self.cappa, - self._ptop, - state.phis, - self._ws3, - self.cgrid_shallow_water_lagrangian_dynamics.ptc, - state.q_con, - self.cgrid_shallow_water_lagrangian_dynamics.delpc, - self._gz, - self._pkc, - state.omga, + dt2=dt2, + cappa=self.cappa, + ptop=self._ptop, + hs=state.phis, + ws=self._ws3, + ptc=self.cgrid_shallow_water_lagrangian_dynamics.ptc, + q_con=state.q_con, + delpc=self.cgrid_shallow_water_lagrangian_dynamics.delpc, + gz=self._gz, + pef=self._pkc, + w3=state.omga, ) - self._p_grad_c( - self.grid_data.rdxc, - self.grid_data.rdyc, - state.uc, - state.vc, - self.cgrid_shallow_water_lagrangian_dynamics.delpc, - self._pkc, - self._gz, - dt2, + self._p_grad_c_x( + rdxc=self.grid_data.rdxc, + uc=state.uc, + delpc=self.cgrid_shallow_water_lagrangian_dynamics.delpc, + pkc=self._pkc, + gz=self._gz, + dt2=dt2, ) + self._p_grad_c_y( + rdyc=self.grid_data.rdyc, + vc=state.vc, + delpc=self.cgrid_shallow_water_lagrangian_dynamics.delpc, + pkc=self._pkc, + gz=self._gz, + dt2=dt2, + ) + self._halo_updaters.uc__vc.start() if self.config.nord > 0: self._halo_updaters.divgd.wait() self._halo_updaters.uc__vc.wait() # use the computed c-grid winds to evolve the d-grid winds forward # by 1 timestep - self._checkpoint_dsw_in(state) self.dgrid_shallow_water_lagrangian_dynamics( - self._vt, - state.delp, - state.pt, - state.u, - state.v, - state.w, - state.uc, - state.vc, - state.ua, - state.va, - self._divgd, - state.mfxd, - state.mfyd, - state.cxd, - state.cyd, - self._crx, - self._cry, - self._xfx, - self._yfx, - state.q_con, - self._zh, - self._heat_source, - state.diss_estd, - dt_acoustic_substep, - ) - self._checkpoint_dsw_out(state) + delpc=self._vt, + delp=state.delp, + pt=state.pt, + u=state.u, + v=state.v, + w=state.w, + uc=state.uc, + vc=state.vc, + ua=state.ua, + va=state.va, + divgd=self._divgd, + mfx=mfxd, + mfy=mfyd, + cx=cxd, + cy=cyd, + dpx=dpx, + crx=self._crx, + cry=self._cry, + xfx=self._xfx, + yfx=self._yfx, + q_con=state.q_con, + zh=self._zh, + heat_source=self._heat_source, + diss_est=state.diss_estd, + dt=dt_acoustic_substep, + ) # note that uc and vc are not needed at all past this point. # they will be re-computed from scratch on the next acoustic timestep. 
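The new dpx field threaded through this call (and filled by delp_increment_accumulation in d_sw.py) is declared 64-bit even in an otherwise 32-bit build. A toy illustration of why an accumulator used for dry-mass roundoff control wants the extra precision; the magnitudes are invented:

```python
import numpy as np

acc32 = np.float32(1.0e6)   # a large running value (magnitude is made up)
acc64 = np.float64(1.0e6)
for _ in range(10_000):
    acc32 += np.float32(0.01)   # 0.01 is below half an ulp of 1e6 in float32: it vanishes
    acc64 += np.float64(0.01)   # float64 keeps the low-order bits

print(acc32 - np.float32(1.0e6))  # 0.0 -- the accumulated increment is lost entirely
print(acc64 - 1.0e6)              # ~100.0, as expected for 10,000 * 0.01
```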
@@ -924,23 +872,23 @@ def __call__( dt=dt_acoustic_substep, ) self.vertical_solver( - remap_step, - dt_acoustic_substep, - self.cappa, - self._ptop, - self._zs, - self._wsd, - state.delz, - state.q_con, - state.delp, - state.pt, - self._zh, - state.pe, - self._pkc, - self._pk3, - state.pk, - state.peln, - state.w, + last_call=remap_step, + dt=dt_acoustic_substep, + cappa=self.cappa, + ptop=self._ptop, + zs=self._zs, + ws=self._wsd, + delz=state.delz, + q_con=state.q_con, + delp=state.delp, + pt=state.pt, + zh=self._zh, + p=state.pe, + ppe=self._pkc, + pk3=self._pk3, + pk=state.pk, + log_p_interface=state.peln, + w=state.w, ) self._halo_updaters.zh.start() @@ -963,15 +911,15 @@ def __call__( self._halo_updaters.pkc.wait() self.nonhydrostatic_pressure_gradient( - state.u, - state.v, - self._pkc, - self._gz, - self._pk3, - state.delp, - dt_acoustic_substep, - self._ptop, - self._akap, + u=state.u, + v=state.v, + pp=self._pkc, + gz=self._gz, + pk3=self._pk3, + delp=state.delp, + dt=dt_acoustic_substep, + ptop=self._ptop, + akap=self._akap, ) if self.config.rf_fast: @@ -996,19 +944,19 @@ def __call__( if self.config.grid_type < 4: self._halo_updaters.interface_uc__vc.interface() - # we are here - if self._do_del2cubed: self._halo_updaters.heat_source.update() # TODO: move dependence on da_min into init of hyperdiffusion class - da_min: Float = self._get_da_min() - cd = constants.CNST_0P20 * da_min + cd = constants.CNST_0P20 * self._da_min # we want to diffuse the heat source from damping before we apply it, # so that we don't reinforce the same grid-scale patterns we're trying # to damp self._hyperdiffusion(self._heat_source, cd) if not self.config.hydrostatic: - delt_time_factor = abs(dt_acoustic_substep * self.config.delt_max) + delt_time_factor = np.abs( + dt_acoustic_substep * Float(self.config.delt_max), + dtype=Float, + ) # TODO: it looks like state.pkz is being used as a temporary here, # and overwritten at the start of remapping. See if we can make it # an internal temporary of this stencil. diff --git a/pyFV3/stencils/fillz.py b/pyFV3/stencils/fillz.py index 5cd5c239..36a166e1 100644 --- a/pyFV3/stencils/fillz.py +++ b/pyFV3/stencils/fillz.py @@ -1,15 +1,15 @@ -import typing -from typing import Dict +from typing import no_type_check from gt4py.cartesian.gtscript import BACKWARD, FORWARD, PARALLEL, computation, interval -import ndsl.dsl.gt4py_utils as utils -from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate +from ndsl import QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, Y_DIM, Z_DIM -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, IntFieldIJ +from ndsl.dsl.typing import Float, Int, FloatField, FloatFieldIJ, IntFieldIJ +from pyFV3.tracers import TracersType +import dace -@typing.no_type_check +@no_type_check def fix_tracer( q: FloatField, dp: FloatField, @@ -117,15 +117,12 @@ def __init__( self, stencil_factory: StencilFactory, quantity_factory: QuantityFactory, - nq: int, - tracers: Dict[str, Quantity], ): orchestrate( obj=self, config=stencil_factory.config.dace_config, dace_compiletime_args=["tracers"], ) - self._nq = int(nq) self._fix_tracer_stencil = stencil_factory.from_dims_halo( fix_tracer, compute_dims=[X_DIM, Y_DIM, Z_DIM], @@ -133,7 +130,7 @@ def __init__( # Setting initial value of upper_fix to zero is only needed for validation. # The values in the compute domain are set to zero in the stencil. 
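The delt_time_factor computation in dyn_core above wraps the config scalar in Float and pins np.abs to dtype=Float. A tiny sketch of the promotion this guards against (Float is assumed to be 32-bit here; ndsl selects it from the build precision):

```python
import numpy as np

Float = np.float32   # assumption: the build's model precision

dt_acoustic_substep = Float(75.0)
delt_max = 0.002     # config scalar, a plain Python (64-bit) float

promoted = abs(dt_acoustic_substep * np.float64(delt_max))           # float32 * float64 -> float64
pinned = np.abs(dt_acoustic_substep * Float(delt_max), dtype=Float)  # kept at model precision

print(type(promoted).__name__, pinned.dtype)   # float64 float32
```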
- self._zfix = quantity_factory.zeros([X_DIM, Y_DIM], units="unknown", dtype=int) + self._zfix = quantity_factory.zeros([X_DIM, Y_DIM], units="unknown", dtype=Int) self._sum0 = quantity_factory.zeros( [X_DIM, Y_DIM], units="unknown", @@ -145,23 +142,19 @@ def __init__( dtype=Float, ) - self._filtered_tracer_dict = { - name: tracers[name] for name in utils.tracer_variables[0 : self._nq] - } - def __call__( self, dp2: FloatField, - tracers: Dict[str, Quantity], + tracers: TracersType, ): """ Args: dp2 (in): pressure thickness of atmospheric layer tracers (inout): tracers to fix negative masses in """ - for tracer_name in self._filtered_tracer_dict.keys(): + for i_tracer in dace.nounroll(range(tracers.shape[3])): self._fix_tracer_stencil( - tracers[tracer_name], + tracers.quantity.data[:, :, :, i_tracer], dp2, self._zfix, self._sum0, diff --git a/pyFV3/stencils/fv_dynamics.py b/pyFV3/stencils/fv_dynamics.py index 50ffba27..2aedcca5 100644 --- a/pyFV3/stencils/fv_dynamics.py +++ b/pyFV3/stencils/fv_dynamics.py @@ -1,34 +1,171 @@ from datetime import timedelta -from typing import Mapping, Optional +from typing import List, Mapping from dace.frontend.python.interface import nounroll as dace_no_unroll -from gt4py.cartesian.gtscript import PARALLEL, computation, interval +from gt4py.cartesian.gtscript import FORWARD, PARALLEL, computation, interval -import ndsl.dsl.gt4py_utils as utils import pyFV3.stencils.moist_cv as moist_cv from ndsl import Quantity, QuantityFactory, StencilFactory, WrappedHaloUpdater -from ndsl.checkpointer import NullCheckpointer from ndsl.comm.mpi import MPI -from ndsl.constants import KAPPA, NQ, X_DIM, Y_DIM, Z_DIM, Z_INTERFACE_DIM, ZVIR +from ndsl.constants import ( + KAPPA, + NQ, + X_DIM, + X_INTERFACE_DIM, + Y_DIM, + Y_INTERFACE_DIM, + Z_DIM, + Z_INTERFACE_DIM, + ZVIR, +) from ndsl.dsl.dace.orchestration import dace_inhibitor, orchestrate -from ndsl.dsl.typing import Float, FloatField +from ndsl.dsl.typing import ( + NDSL_64BIT_FLOAT_TYPE, + Float, + FloatField, + FloatField64, + FloatFieldIJ64, + get_precision, +) from ndsl.grid import DampingCoefficients, GridData from ndsl.logging import ndsl_log from ndsl.performance import NullTimer, Timer -from ndsl.stencils.basic_operations import copy_defn +from ndsl.stencils.basic_operations import copy_defn, set_value_defn from ndsl.stencils.c2l_ord import CubedToLatLon -from ndsl.typing import Checkpointer, Communicator +from ndsl.typing import Communicator from pyFV3._config import DynamicalCoreConfig from pyFV3.dycore_state import DycoreState from pyFV3.stencils import fvtp2d, tracer_2d_1l +from pyFV3.stencils.compute_total_energy import ComputeTotalEnergy from pyFV3.stencils.del2cubed import HyperdiffusionDamping from pyFV3.stencils.dyn_core import AcousticDynamics from pyFV3.stencils.neg_adj3 import AdjustNegativeTracerMixingRatio from pyFV3.stencils.remapping import LagrangianToEulerian +from pyFV3.stencils.remapping_GEOS import LagrangianToEulerian_GEOS +from pyFV3.version import IS_GEOS + + +class DryMassRoundOff: + def __init__( + self, + comm: Communicator, + quantity_factory: QuantityFactory, + stencil_factory: StencilFactory, + state: DycoreState, + hydrostatic: bool, + ) -> None: + self.psx_2d = quantity_factory.zeros( + dims=[X_DIM, Y_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, + ) + self.dpx = quantity_factory.zeros( + dims=[X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, + ) + self.dpx0_2d = 
quantity_factory.zeros( + dims=[X_DIM, Y_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, + ) + + self._reset = stencil_factory.from_origin_domain( + DryMassRoundOff._reset_stencil, + origin=stencil_factory.grid_indexing.origin_compute(), + domain=stencil_factory.grid_indexing.domain_compute(), + ) + self._apply_psx_to_pe = stencil_factory.from_origin_domain( + DryMassRoundOff._apply_psx_to_pe_stencil, + origin=stencil_factory.grid_indexing.origin_compute(), + domain=stencil_factory.grid_indexing.domain_compute(), + ) + self._apply_dpx_to_psx = stencil_factory.from_origin_domain( + DryMassRoundOff._apply_dpx_to_psx_stencil, + origin=stencil_factory.grid_indexing.origin_compute(), + domain=stencil_factory.grid_indexing.domain_compute(), + ) + + halo_spec = quantity_factory.get_quantity_halo_spec( + dims=[X_DIM, Y_DIM, Z_INTERFACE_DIM], + n_halo=stencil_factory.grid_indexing.n_halo, + dtype=Float, + ) + self._pe_halo_updater = WrappedHaloUpdater( + comm.get_scalar_halo_updater([halo_spec]), + state, + ["pe"], + ) + + self._hydrostatic = hydrostatic + + @staticmethod + def _reset_stencil( + dpx: FloatField64, # type:ignore + psx_2d: FloatFieldIJ64, # type:ignore + pe: FloatField, # type:ignore + ): + with computation(PARALLEL), interval(...): + dpx = 0.0 + with computation(FORWARD), interval(-1, None): + psx_2d = pe[0, 0, 1] + + @staticmethod + def _apply_dpx_to_psx_stencil( + dpx: FloatField64, # type:ignore + dpx0_2d: FloatFieldIJ64, # type:ignore + psx_2d: FloatFieldIJ64, # type:ignore + ): + with computation(FORWARD), interval(0, 1): + dpx0_2d = dpx + + with computation(FORWARD), interval(1, None): + dpx0_2d += dpx + + with computation(FORWARD), interval(0, 1): + psx_2d += psx_2d + dpx0_2d + + @staticmethod + def _apply_psx_to_pe_stencil( + psx_2d: FloatFieldIJ64, # type:ignore + pe: FloatField, # type:ignore + ): + with computation(FORWARD), interval(-1, None): + pe[0, 0, 1] = psx_2d + + def reset(self, pe: FloatField): # type:ignore + self._reset(dpx=self.dpx, psx_2d=self.psx_2d, pe=pe) + + def apply(self, pe: FloatField): # type:ignore + self._apply_dpx_to_psx(self.dpx, self.dpx0_2d, self.psx_2d) + self._pe_halo_updater.update() + self._apply_psx_to_pe(self.psx_2d, pe) + + +def _increment_stencil( + value: FloatField, # type:ignore + increment: FloatField, # type:ignore +): + with computation(PARALLEL), interval(...): + value += increment + + +def _copy_cast_defn( + q_in_64: FloatField64, # type:ignore + q_out: FloatField, # type:ignore +): + with computation(PARALLEL), interval(...): + q_out = q_in_64 def pt_to_potential_density_pt( - pkz: FloatField, dp_initial: FloatField, q_con: FloatField, pt: FloatField + pkz: FloatField, # type: ignore + dp_initial: FloatField, # type: ignore + q_con: FloatField, # type: ignore + pt: FloatField, # type: ignore ): """ Args: @@ -43,7 +180,12 @@ def pt_to_potential_density_pt( pt = pt * (1.0 + dp_initial) * (1.0 - q_con) / pkz -def omega_from_w(delp: FloatField, delz: FloatField, w: FloatField, omega: FloatField): +def omega_from_w( + delp: FloatField, # type: ignore + delz: FloatField, # type: ignore + w: FloatField, # type: ignore + omega: FloatField, # type: ignore +): """ Args: delp (in): vertical layer thickness in Pa @@ -59,7 +201,7 @@ def fvdyn_temporaries( quantity_factory: QuantityFactory, ) -> Mapping[str, Quantity]: tmps = {} - for name in ["te_2d", "te0_2d", "wsd"]: + for name in ["te0_2d", "wsd"]: quantity = quantity_factory.zeros( dims=[X_DIM, Y_DIM], units="unknown", @@ -98,8 +240,8 @@ 
def __init__( config: DynamicalCoreConfig, phis: Quantity, state: DycoreState, + exclude_tracers: List[str], timestep: timedelta, - checkpointer: Optional[Checkpointer] = None, ): """ Args: @@ -111,10 +253,9 @@ def __init__( the namelist in the Fortran model phis: surface geopotential height state: model state + exclude_tracer: List of named tracer to be excluded from the Advection, + and Remapping schemes timestep: model timestep - checkpointer: if given, used to perform operations on model data - at specific points in model execution, such as testing against - reference data """ orchestrate( obj=self, @@ -137,41 +278,6 @@ def __init__( dace_compiletime_args=["state", "timer"], ) - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_fvdynamics", - dace_compiletime_args=["state", "tag"], - ) - - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_remapping_in", - dace_compiletime_args=[ - "state", - ], - ) - - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_remapping_out", - dace_compiletime_args=["state"], - ) - - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_tracer_advection_in", - dace_compiletime_args=["state"], - ) - orchestrate( - obj=self, - config=stencil_factory.config.dace_config, - method_to_orchestrate="_checkpoint_tracer_advection_out", - dace_compiletime_args=["state"], - ) if timestep == timedelta(seconds=0): raise RuntimeError( "Bad dynamical core configuration:" @@ -179,11 +285,6 @@ def __init__( ) # nested and stretched_grid are options in the Fortran code which we # have not implemented, so they are hard-coded here. - self.call_checkpointer = checkpointer is not None - if checkpointer is None: - self.checkpointer: Checkpointer = NullCheckpointer() - else: - self.checkpointer = checkpointer nested = False stretched_grid = False grid_indexing = stencil_factory.grid_indexing @@ -198,12 +299,43 @@ def __init__( f" nwat=={config.nwat} is not implemented." " Only nwat=6 has been implemented." ) + + # Implemented dynamics options require those tracers to be present at minima + # this is a more granular list than carried by the `nwat` single integer + # but cover the same topic + required_tracers = [ + "vapor", + "liquid", + "rain", + "snow", + "ice", + "graupel", + "cloud", + ] + if not all(n in state.tracers._indexer.keys() for n in required_tracers): + raise NotImplementedError( + "Dynamical core (fv_dynamics):" + " missing required tracers. 
Dynamics requires:\n" + f" {required_tracers}\n" + "but only the following where given:\n" + f" {state.tracers._indexer.keys()}" + ) + + self._comm = comm self.comm_rank = comm.rank self.grid_data = grid_data self.grid_indexing = grid_indexing self._da_min = damping_coefficients.da_min self.config = config + self.dry_mass_control = DryMassRoundOff( + comm=comm, + quantity_factory=quantity_factory, + stencil_factory=stencil_factory, + state=state, + hydrostatic=self.config.hydrostatic, + ) + tracer_transport = fvtp2d.FiniteVolumeTransport( stencil_factory=stencil_factory, quantity_factory=quantity_factory, @@ -213,12 +345,7 @@ def __init__( hord=config.hord_tr, ) - self.tracers = {} - for name in utils.tracer_variables[0:NQ]: - self.tracers[name] = state.__dict__[name] - temporaries = fvdyn_temporaries(quantity_factory) - self._te_2d = temporaries["te_2d"] self._te0_2d = temporaries["te0_2d"] self._wsd = temporaries["wsd"] self._dp_initial = temporaries["dp1"] @@ -231,7 +358,7 @@ def __init__( tracer_transport, self.grid_data, comm, - self.tracers, + state.tracers, ) self._ak = grid_data.ak self._bk = grid_data.bk @@ -275,7 +402,6 @@ def __init__( phis=self._phis, wsd=self._wsd, state=state, - checkpointer=checkpointer, ) self._hyperdiffusion = HyperdiffusionDamping( stencil_factory, @@ -307,17 +433,37 @@ def __init__( hydrostatic=self.config.hydrostatic, ) - self._lagrangian_to_eulerian_obj = LagrangianToEulerian( + self._compute_total_energy = ComputeTotalEnergy( + config=config, stencil_factory=stencil_factory, quantity_factory=quantity_factory, - config=config.remapping, - area_64=grid_data.area_64, - nq=NQ, - pfull=self._pfull, - tracers=self.tracers, - checkpointer=checkpointer, + grid_data=grid_data, ) + if IS_GEOS: + self._lagrangian_to_eulerian_GEOS = LagrangianToEulerian_GEOS( + stencil_factory=stencil_factory, + quantity_factory=quantity_factory, + config=config.remapping, + comm=comm, + grid_data=grid_data, + nq=NQ, + pfull=self._pfull, + tracers=state.tracers, + adiabatic=config.adiabatic, + ) + + else: + self._lagrangian_to_eulerian_obj = LagrangianToEulerian( + stencil_factory=stencil_factory, + quantity_factory=quantity_factory, + config=config.remapping, + area_64=grid_data.area_64, + pfull=self._pfull, + tracers=state.tracers, + exclude_tracers=exclude_tracers, + ) + full_xyz_spec = quantity_factory.get_quantity_halo_spec( dims=[X_DIM, Y_DIM, Z_DIM], n_halo=grid_indexing.n_halo, @@ -331,113 +477,68 @@ def __init__( self._conserve_total_energy = config.consv_te self._timestep = timestep.total_seconds() - # See divergence_damping.py, _get_da_min for explanation of this function - @dace_inhibitor - def _get_da_min(self) -> float: - return self._da_min - - def _checkpoint_fvdynamics(self, state: DycoreState, tag: str): - if self.call_checkpointer: - self.checkpointer( - f"FVDynamics-{tag}", - u=state.u, - v=state.v, - w=state.w, - delz=state.delz, - # ua is not checked as its halo values differ from Fortran, - # this can be re-enabled if no longer comparing to Fortran, if the - # Fortran is updated to match the Python, or if the checkpointer - # can check only the compute domain values - # ua=state.ua, - va=state.va, - uc=state.uc, - vc=state.vc, - qvapor=state.qvapor, - ) - - def _checkpoint_remapping_in( - self, - state: DycoreState, - ): - if self.call_checkpointer: - self.checkpointer( - "Remapping-In", - pt=state.pt, - delp=state.delp, - delz=state.delz, - peln=state.peln.transpose( - [X_DIM, Z_INTERFACE_DIM, Y_DIM] - ), # [x, z, y] fortran data - u=state.u, - v=state.v, 
- w=state.w, - ua=state.ua, - va=state.va, - cappa=self._cappa, - pk=state.pk, - pe=state.pe.transpose( - [X_DIM, Z_INTERFACE_DIM, Y_DIM] - ), # [x, z, y] fortran data - phis=state.phis, - te_2d=self._te0_2d, - ps=state.ps, - wsd=self._wsd, - omga=state.omga, - dp1=self._dp_initial, + # At 32-bit precision we still need + self._f32_correction = get_precision() == 32 + if self._f32_correction: + self._mfx_f64 = quantity_factory.zeros( + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, ) - - def _checkpoint_remapping_out( - self, - state: DycoreState, - ): - if self.call_checkpointer: - self.checkpointer( - "Remapping-Out", - pt=state.pt, - delp=state.delp, - delz=state.delz, - peln=state.peln.transpose( - [X_DIM, Z_INTERFACE_DIM, Y_DIM] - ), # [x, z, y] fortran data - u=state.u, - v=state.v, - w=state.w, - cappa=self._cappa, - pkz=state.pkz, - pk=state.pk, - pe=state.pe.transpose( - [X_DIM, Z_INTERFACE_DIM, Y_DIM] - ), # [x, z, y] fortran data - dp1=self._dp_initial, + self._mfy_f64 = quantity_factory.zeros( + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, ) - - def _checkpoint_tracer_advection_in( - self, - state: DycoreState, - ): - if self.call_checkpointer: - self.checkpointer( - "Tracer2D1L-In", - dp1=self._dp_initial, - mfxd=state.mfxd, - mfyd=state.mfyd, - cxd=state.cxd, - cyd=state.cyd, + self._cx_f64 = quantity_factory.zeros( + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, ) - - def _checkpoint_tracer_advection_out( - self, - state: DycoreState, - ): - if self.call_checkpointer: - self.checkpointer( - "Tracer2D1L-Out", - dp1=self._dp_initial, - mfxd=state.mfxd, - mfyd=state.mfyd, - cxd=state.cxd, - cyd=state.cyd, + self._cy_f64 = quantity_factory.zeros( + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + dtype=NDSL_64BIT_FLOAT_TYPE, + allow_mismatch_float_precision=True, ) + self._mfx_local = quantity_factory.zeros( + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._mfy_local = quantity_factory.zeros( + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._cx_local = quantity_factory.zeros( + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._cy_local = quantity_factory.zeros( + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._set_value = stencil_factory.from_origin_domain( + func=set_value_defn, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(1, 1, 0)), + ) + self._increment = stencil_factory.from_origin_domain( + func=_increment_stencil, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(1, 1, 0)), + ) + self._copy_cast = stencil_factory.from_origin_domain( + func=_copy_cast_defn, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(1, 1, 0)), + ) def step_dynamics( self, @@ -451,9 +552,7 @@ def step_dynamics( timer: keep time of model sections state: model prognostic state and inputs """ - self._checkpoint_fvdynamics(state=state, tag="In") self._compute(state, timer) - self._checkpoint_fvdynamics(state=state, tag="Out") def compute_preamble(self, state: DycoreState, is_root_rank: bool): if self.config.hydrostatic: @@ -461,13 +560,19 @@ def compute_preamble(self, state: DycoreState, is_root_rank: bool): if 
__debug__: log_on_rank_0("FV Setup") + # Reset fluxes + self._set_value(state.mfxd, Float(0.0)) + self._set_value(state.mfyd, Float(0.0)) + self._set_value(state.cxd, Float(0.0)) + self._set_value(state.cyd, Float(0.0)) + self._fv_setup_stencil( - state.qvapor, - state.qliquid, - state.qrain, - state.qsnow, - state.qice, - state.qgraupel, + state.tracers.vapor, + state.tracers.liquid, + state.tracers.rain, + state.tracers.snow, + state.tracers.ice, + state.tracers.graupel, state.q_con, self._cvm, state.pkz, @@ -478,31 +583,51 @@ def compute_preamble(self, state: DycoreState, is_root_rank: bool): self._dp_initial, ) - if self._conserve_total_energy > 0: - raise NotImplementedError( - "Dynamical Core (fv_dynamics): compute total energy is not implemented" + # Compute total energy + if self.config.consv_te > 0.0: + self._compute_total_energy( + hs=state.phis, + delp=state.delp, + delz=state.delz, + qc=self._dp_initial, + pt=state.pt, + u=state.u, + v=state.v, + w=state.w, + tracers=state.tracers, + te_2d=self._te0_2d, ) - if (not self.config.rf_fast) and self.config.tau != 0: + # Rayleigh fast + if ( + not self.config.hydrostatic + and not self.config.acoustic_dynamics.rf_fast + and self.config.acoustic_dynamics.tau > 0 + ): raise NotImplementedError( - "Dynamical Core (fv_dynamics): Rayleigh_Super," - " called when rf_fast=False and tau !=0, is not implemented" + "Dynamical Core (fv_dynamics): Rayleigh Friction is not implemented." ) - if self.config.adiabatic and self.config.kord_tm > 0: + # Adjust pt + if self.config.adiabatic: raise NotImplementedError( - "Dynamical Core (fv_dynamics): Adiabatic with positive kord_tm" - " is not implemented." + "Dynamical Core (fv_dynamics): Adiabatic pt adjust is not implemented." ) else: - if __debug__: - log_on_rank_0("Adjust pt") - self._pt_to_potential_density_pt( - state.pkz, - self._dp_initial, - state.q_con, - state.pt, - ) + if self.config.hydrostatic: + raise NotImplementedError( + "Dynamical Core (fv_dynamics): Hydrostatic pt adjust" + " is not implemented." 
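The new `consv_te > 0` branch calls `ComputeTotalEnergy`, which conceptually integrates internal, kinetic and potential energy over each column, mass-weighted by `delp`. A deliberately simplified single-column sketch (plain NumPy, not the stencil; it ignores the moist heat capacity and the D-grid metric terms `rsin2` / `cosa_s` that the real code uses):

```python
import numpy as np

CV_AIR = 717.55   # J kg^-1 K^-1, cv of dry air (approximate)

def column_total_energy(delp, t, u, v, w, gz_interface):
    """Mass-weighted column total energy: internal + kinetic + potential.
    Simplified sketch of what ComputeTotalEnergy / moist_te measure."""
    gz_mid = 0.5 * (gz_interface[:-1] + gz_interface[1:])   # layer-mean geopotential
    return float(np.sum(delp * (CV_AIR * t + 0.5 * (u**2 + v**2 + w**2) + gz_mid)))

nz = 4
te = column_total_energy(
    delp=np.full(nz, 250.0e2),                  # Pa
    t=np.full(nz, 260.0),                       # K
    u=np.full(nz, 10.0), v=np.full(nz, -5.0), w=np.zeros(nz),
    gz_interface=np.linspace(2.0e5, 0.0, nz + 1),
)
print(f"column energy proxy: {te:.3e}")
```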
+ ) + else: + self._pt_to_potential_density_pt( + state.pkz, + self._dp_initial, + state.q_con, + state.pt, + ) + + self.dry_mass_control.reset(pe=state.pe) def __call__(self, *args, **kwargs): return self.step_dynamics(*args, **kwargs) @@ -526,24 +651,34 @@ def _compute(self, state: DycoreState, timer: Timer): log_on_rank_0("DynCore") with timer.clock("DynCore"): self.acoustic_dynamics( - state, + state=state, + mfxd=self._mfx_f64 if self._f32_correction else self._mfx_local, + mfyd=self._mfy_f64 if self._f32_correction else self._mfy_local, + cxd=self._cx_f64 if self._f32_correction else self._cx_local, + cyd=self._cy_f64 if self._f32_correction else self._cy_local, + dpx=self.dry_mass_control.dpx, timestep=self._timestep / self._k_split, n_map=n_map, ) + if self._f32_correction: + self._copy_cast(self._mfx_f64, self._mfx_local) + self._copy_cast(self._mfy_f64, self._mfy_local) + self._copy_cast(self._cx_f64, self._cx_local) + self._copy_cast(self._cy_f64, self._cy_local) + if last_step and self.config.hydrostatic: + self.dry_mass_control.apply(state.pe) if self.config.z_tracer: if __debug__: log_on_rank_0("TracerAdvection") with timer.clock("TracerAdvection"): - self._checkpoint_tracer_advection_in(state) self.tracer_advection( - self.tracers, + state.tracers, self._dp_initial, - state.mfxd, - state.mfyd, - state.cxd, - state.cyd, + x_mass_flux=self._mfx_local, + y_mass_flux=self._mfy_local, + x_courant=self._cx_local, + y_courant=self._cy_local, ) - self._checkpoint_tracer_advection_out(state) else: raise NotImplementedError("z_tracer=False is not implemented") @@ -565,46 +700,82 @@ def _compute(self, state: DycoreState, timer: Timer): if __debug__: log_on_rank_0("Remapping") with timer.clock("Remapping"): - self._checkpoint_remapping_in(state) - - # TODO: When NQ=9, we shouldn't need to pass qcld explicitly - # since it's in self.tracers. It should not be an issue since - # we don't have self.tracers & qcld computation at the same - # time - # When NQ=8, we do need qcld passed explicitely - self._lagrangian_to_eulerian_obj( - self.tracers, - state.pt, - state.delp, - state.delz, - state.peln, - state.u, - state.v, - state.w, - self._cappa, - state.q_con, - state.qcld, - state.pkz, - state.pk, - state.pe, - state.phis, - state.ps, - self._wsd, - self._ak, - self._bk, - self._dp_initial, - self._ptop, - KAPPA, - ZVIR, - last_step, - self._conserve_total_energy, - self._timestep / self._k_split, - ) - self._checkpoint_remapping_out(state) + if IS_GEOS: + self._lagrangian_to_eulerian_GEOS( + tracers=state.tracers, + pt=state.pt, + delp=state.delp, + delz=state.delz, + peln=state.peln, + u=state.u, + v=state.v, + w=state.w, + mfx=self._mfx_local, + mfy=self._mfy_local, + cx=self._cx_local, + cy=self._cy_local, + cappa=self._cappa, + q_con=state.q_con, + pkz=state.pkz, + pk=state.pk, + pe=state.pe, + hs=state.phis, + te0_2d=self._te0_2d, + ps=state.ps, + wsd=self._wsd, + ak=self._ak, + bk=self._bk, + dp1=self._dp_initial, + ptop=self._ptop, + akap=KAPPA, + zvir=ZVIR, + last_step=last_step, + consv_te=self._conserve_total_energy, + mdt=self._timestep / self._k_split, + ) + else: + # TODO: When NQ=9, we shouldn't need to pass qcld explicitly + # since it's in self.tracers. 
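At 32-bit precision the acoustic loop writes its mass fluxes and Courant numbers into float64 buffers, which `_copy_cast_defn` then copies back into the float32 working fields before they feed tracer advection and remapping, and `_increment_stencil` folds them into the state. A hypothetical NumPy sketch of that pattern (names are illustrative, not the pyFV3 API):

```python
import numpy as np

precision_is_32bit = True                      # stand-in for get_precision() == 32
shape = (8, 8, 4)

mfx_f64 = np.zeros(shape, dtype=np.float64)    # buffer the acoustic loop accumulates into
mfx_local = np.zeros(shape, dtype=np.float32)  # working copy used by advection / remapping
mfxd = np.zeros(shape, dtype=np.float32)       # flux carried in the model state

mfx_f64 += 1.0e-9                              # pretend the acoustic step produced fluxes
if precision_is_32bit:
    mfx_local[...] = mfx_f64                   # the _copy_cast_defn step: float64 -> float32
mfxd += mfx_local                              # the _increment step: fold into the state flux

print(mfxd.dtype, float(mfxd.max()))
```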
It should not be an issue + # since we don't have self.tracers & qcld computation + # at the same time + # When NQ=8, we do need qcld passed explicitely + self._lagrangian_to_eulerian_obj( + state.tracers, + state.pt, + state.delp, + state.delz, + state.peln, + state.u, + state.v, + state.w, + self._cappa, + state.q_con, + state.pkz, + state.pk, + state.pe, + state.phis, + state.ps, + self._wsd, + self._ak, + self._bk, + self._dp_initial, + self._ptop, + KAPPA, + ZVIR, + last_step, + self._conserve_total_energy, + self._timestep / self._k_split, + ) # TODO: can we pull this block out of the loop intead of # using an if-statement? + + # Update state fluxes and courant number + self._increment(state.mfxd, self._mfx_local) + self._increment(state.mfyd, self._mfy_local) + self._increment(state.cxd, self._cx_local) + self._increment(state.cyd, self._cy_local) + if last_step: - da_min: Float = self._get_da_min() if not self.config.hydrostatic: if __debug__: log_on_rank_0("Omega") @@ -620,18 +791,18 @@ def _compute(self, state: DycoreState, timer: Timer): if __debug__: log_on_rank_0("Del2Cubed") self._omega_halo_updater.update() - self._hyperdiffusion(state.omga, 0.18 * da_min) + self._hyperdiffusion(state.omga, Float(0.18) * self._da_min) if __debug__: log_on_rank_0("Neg Adj 3") self._adjust_tracer_mixing_ratio( - state.qvapor, - state.qliquid, - state.qrain, - state.qsnow, - state.qice, - state.qgraupel, - state.qcld, + state.tracers.vapor, + state.tracers.liquid, + state.tracers.rain, + state.tracers.snow, + state.tracers.ice, + state.tracers.graupel, + state.tracers.cloud, state.pt, state.delp, ) diff --git a/pyFV3/stencils/fvtp2d.py b/pyFV3/stencils/fvtp2d.py index 77aad226..d3e5b356 100644 --- a/pyFV3/stencils/fvtp2d.py +++ b/pyFV3/stencils/fvtp2d.py @@ -11,6 +11,7 @@ from pyFV3.stencils.delnflux import DelnFlux from pyFV3.stencils.xppm import XPiecewiseParabolic from pyFV3.stencils.yppm import YPiecewiseParabolic +from pyFV3.stencils.corners import CopyCornersX, CopyCornersY @gtscript.function @@ -179,9 +180,7 @@ def make_quantity(): # self.delnflux = None self._do_delnflux = False - self._copy_corners_y: corners.CopyCorners = corners.CopyCorners( - "y", stencil_factory - ) + self._copy_corners_y = CopyCornersY(stencil_factory) self.y_piecewise_parabolic_inner = YPiecewiseParabolic( stencil_factory=stencil_factory, dya=grid_data.dya, @@ -204,9 +203,7 @@ def make_quantity(): domain=idx.domain_compute(add=(1, 1, 1)), ) - self._copy_corners_x: corners.CopyCorners = corners.CopyCorners( - "x", stencil_factory - ) + self._copy_corners_x = CopyCornersX(stencil_factory) self.x_piecewise_parabolic_inner = XPiecewiseParabolic( stencil_factory=stencil_factory, dxa=grid_data.dxa, @@ -274,7 +271,7 @@ def __call__( by contrast are area weighted. Args: - q (in): scalar to be transported + q (inout): scalar to be transported (corners are copied in halo) crx (in): Courant number in x-direction cry (in): Courant number in y-direction x_area_flux (in): flux of area in x-direction, in units of m^2 diff --git a/pyFV3/stencils/fxadv.py b/pyFV3/stencils/fxadv.py index c1bf805d..626a6bfc 100644 --- a/pyFV3/stencils/fxadv.py +++ b/pyFV3/stencils/fxadv.py @@ -484,24 +484,44 @@ def fxadv_fluxes_stencil( y_area_flux (out): uc_contra (in): vc_contra (in): + + Porting Note + * The tmp introduced in the computation allows fxadv_fluxes_stencil to closely match the Fortran order + of computation, which allows the x_area_flux and y_area_flux match the + respective Fortran values. 
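The porting note above is about bit-for-bit agreement: pulling `dt * uc_contra` into a temporary changes how the flux products are grouped, and floating-point multiplication is not associative. A standalone NumPy check of how often the two groupings differ at float32:

```python
import numpy as np

rng = np.random.default_rng(1)
dy, dt, uc, s = (rng.uniform(0.5, 2.0, 100_000).astype(np.float32) for _ in range(4))

flux_direct = dy * dt * uc * s        # evaluates as ((dy * dt) * uc) * s
tmp = dt * uc                         # Fortran-style temporary
flux_tmp = dy * tmp * s               # evaluates as (dy * (dt * uc)) * s

print(f"bitwise mismatches from re-association: {np.mean(flux_direct != flux_tmp):.1%}")
```

Neither grouping is more accurate than the other; the temporary simply reproduces the order of operations in the reference Fortran so the results can be compared bit for bit.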
+ + Example of previous stencil looked as follows: + ========================================================== + if uc_contra > 0: + crx = dt * uc_contra * rdxa[-1, 0] + x_area_flux = dy * dt * uc_contra * sin_sg3[-1, 0] + else: + crx = dt * uc_contra * rdxa + x_area_flux = dy * dt * uc_contra * sin_sg1 + ========================================================== """ from __externals__ import local_ie, local_is, local_je, local_js with computation(PARALLEL), interval(...): with horizontal(region[local_is : local_ie + 2, :]): + # Including the temporary (tmp) calculation enables x_area_flux and y_area_flux + # to more closely precision match the respective Fortran calculation + # since Fortran also performs this temporary calcuation + tmp = dt * uc_contra if uc_contra > 0: - crx = dt * uc_contra * rdxa[-1, 0] - x_area_flux = dy * dt * uc_contra * sin_sg3[-1, 0] + crx = tmp * rdxa[-1, 0] + x_area_flux = dy * tmp * sin_sg3[-1, 0] else: - crx = dt * uc_contra * rdxa - x_area_flux = dy * dt * uc_contra * sin_sg1 + crx = tmp * rdxa + x_area_flux = dy * tmp * sin_sg1 with horizontal(region[:, local_js : local_je + 2]): + tmp = dt * vc_contra if vc_contra > 0: - cry = dt * vc_contra * rdya[0, -1] - y_area_flux = dx * dt * vc_contra * sin_sg4[0, -1] + cry = tmp * rdya[0, -1] + y_area_flux = dx * tmp * sin_sg4[0, -1] else: - cry = dt * vc_contra * rdya - y_area_flux = dx * dt * vc_contra * sin_sg2 + cry = tmp * rdya + y_area_flux = dx * tmp * sin_sg2 class FiniteVolumeFluxPrep: diff --git a/pyFV3/stencils/map_single.py b/pyFV3/stencils/map_single.py index a768f86a..be7e3808 100644 --- a/pyFV3/stencils/map_single.py +++ b/pyFV3/stencils/map_single.py @@ -4,7 +4,15 @@ from ndsl import QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, Y_DIM, Z_DIM -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, IntFieldIJ # noqa: F401 +from ndsl.dsl.typing import ( # noqa: F401 + Int, + BoolFieldIJ, + Float, + FloatField, + FloatFieldIJ, + IntField, + IntFieldIJ, +) from ndsl.stencils.basic_operations import copy_defn from pyFV3.stencils.remap_profile import RemapProfile @@ -79,6 +87,233 @@ def lagrangian_contributions( lev = lev - 1 +class LagrangianContribution: + """Lagrangian contribution as it appears in FV3GFS/SHiELD""" + + def __init__(self, stencil_factory: StencilFactory, dims: Sequence[str]) -> None: + self._lagrangian_contributions = stencil_factory.from_dims_halo( + lagrangian_contributions, + compute_dims=dims, + ) + + def __call__( + self, + q: FloatField, # type: ignore + pe1: FloatField, # type: ignore + pe2: FloatField, # type: ignore + q4_1: FloatField, # type: ignore + q4_2: FloatField, # type: ignore + q4_3: FloatField, # type: ignore + q4_4: FloatField, # type: ignore + dp1: FloatField, # type: ignore + lev: IntFieldIJ, # type: ignore + ): + self._lagrangian_contributions( + q, + pe1, + pe2, + q4_1, + q4_2, + q4_3, + q4_4, + dp1, + lev, + ) + + +def lagrangian_contributions_interp( + km: int, + not_exit_loop: BoolFieldIJ, + INDEX_LM1: IntField, + INDEX_LP0: IntField, + q: FloatField, + pe1: FloatField, + pe2: FloatField, + q4_1: FloatField, + q4_2: FloatField, + q4_3: FloatField, + q4_4: FloatField, + dp1: FloatField, + lev: IntFieldIJ, +): + """ + Args: + km (in): + not_exit_loop (in/temp): + LM1 (in/temp): + LP0 (in/temp): + q (in/out): + pe1 (in): + pe2 (in): + q4_1 (in): + q4_2 (in): + q4_3 (in): + q4_4 (in): + dp1 (in): + lev (inout): + """ + + # This computation creates a IntField that allows for "absolute" references + # in the k-dimension for q 
and pe1. + + # INDEX_LM1 and INDEX_LP0 is initialized such that if it's plugged into "q" + # (ex: q[0,0,INDEX_LM1]), the k level in q is "k = 0". + + # For example, during the stencil computation at k = 2, INDEX_LM1[i,j,2] = -2 + with computation(FORWARD): + with interval(0, 1): + INDEX_LM1 = 0 + INDEX_LP0 = 0 + with interval(1, None): + INDEX_LM1 = INDEX_LM1[0, 0, -1] - 1 + INDEX_LP0 = INDEX_LP0[0, 0, -1] - 1 + + # TODO: Can we make lev a 2D temporary? + with computation(FORWARD), interval(...): + LM1 = 1 + LP0 = 1 + not_exit_loop = True + while LP0 <= km and not_exit_loop: + if pe1[0, 0, INDEX_LP0] < pe2: + LP0 = LP0 + 1 + INDEX_LP0 = INDEX_LP0 + 1 + else: + not_exit_loop = False + + LM1 = max(LP0 - 1, 1) + INDEX_LM1 = INDEX_LM1 + (LM1 - 1) + LP0 = min(LP0, km) + + if LP0 == 1: + INDEX_LP0 = INDEX_LM1 + elif LP0 <= km: + INDEX_LP0 = INDEX_LM1 + 1 + else: + INDEX_LP0 = INDEX_LM1 + + if LM1 == 1 and LP0 == 1: + q_temp = q[0, 0, INDEX_LM1] + ( + q[0, 0, INDEX_LM1 + 1] - q[0, 0, INDEX_LM1] + ) * (pe2 - pe1[0, 0, INDEX_LM1]) / ( + pe1[0, 0, INDEX_LM1 + 1] - pe1[0, 0, INDEX_LM1] + ) + + elif LM1 == km and LP0 == km: + q_temp = q[0, 0, INDEX_LM1] + ( + q[0, 0, INDEX_LM1] - q[0, 0, INDEX_LM1 - 1] + ) * (pe2 - pe1[0, 0, INDEX_LM1]) / ( + pe1[0, 0, INDEX_LM1] - pe1[0, 0, INDEX_LM1 - 1] + ) + + elif LM1 == 1 or LP0 == km: + q_temp = q[0, 0, INDEX_LP0] + (q[0, 0, INDEX_LM1] - q[0, 0, INDEX_LP0]) * ( + pe2 - pe1[0, 0, INDEX_LP0] + ) / (pe1[0, 0, INDEX_LM1] - pe1[0, 0, INDEX_LP0]) + + else: + while pe2 < pe1[0, 0, lev] or pe2 > pe1[0, 0, lev + 1]: + lev = lev + 1 + pl = (pe2 - pe1[0, 0, lev]) / dp1[0, 0, lev] + if pe2[0, 0, 1] <= pe1[0, 0, lev + 1]: + pr = (pe2[0, 0, 1] - pe1[0, 0, lev]) / dp1[0, 0, lev] + q_temp = ( + q4_2[0, 0, lev] + + 0.5 + * (q4_4[0, 0, lev] + q4_3[0, 0, lev] - q4_2[0, 0, lev]) + * (pr + pl) + - q4_4[0, 0, lev] * 1.0 / 3.0 * (pr * (pr + pl) + pl * pl) + ) + else: + qsum = (pe1[0, 0, lev + 1] - pe2) * ( + q4_2[0, 0, lev] + + 0.5 + * (q4_4[0, 0, lev] + q4_3[0, 0, lev] - q4_2[0, 0, lev]) + * (1.0 + pl) + - q4_4[0, 0, lev] * 1.0 / 3.0 * (1.0 + pl * (1.0 + pl)) + ) + lev = lev + 1 + while pe1[0, 0, lev + 1] < pe2[0, 0, 1]: + qsum += dp1[0, 0, lev] * q4_1[0, 0, lev] + lev = lev + 1 + dp = pe2[0, 0, 1] - pe1[0, 0, lev] + esl = dp / dp1[0, 0, lev] + qsum += dp * ( + q4_2[0, 0, lev] + + 0.5 + * esl + * ( + q4_3[0, 0, lev] + - q4_2[0, 0, lev] + + q4_4[0, 0, lev] * (1.0 - (2.0 / 3.0) * esl) + ) + ) + q_temp = qsum / (pe2[0, 0, 1] - pe2) + + lev = lev - 1 + + q = q_temp + + +class LagrangianContributionInterpolated: + """Lagrangian contribution as it appears in GEOS, modified from original + FV3GFS version""" + + def __init__( + self, + stencil_factory: StencilFactory, + quantity_factory: QuantityFactory, + dims: Sequence[str], + ) -> None: + self._lagrangian_contributions_interp = stencil_factory.from_dims_halo( + lagrangian_contributions_interp, + compute_dims=dims, + ) + + self._INDEX_LM1 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="", + dtype=Int, + ) + + self._INDEX_LP0 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="", + dtype=Int, + ) + self._km = stencil_factory.grid_indexing.domain[2] + self._not_exit_loop = quantity_factory.zeros( + [X_DIM, Y_DIM], units="", dtype=bool + ) + + def __call__( + self, + q: FloatField, # type: ignore + pe1: FloatField, # type: ignore + pe2: FloatField, # type: ignore + q4_1: FloatField, # type: ignore + q4_2: FloatField, # type: ignore + q4_3: FloatField, # type: ignore + q4_4: FloatField, # type: ignore + dp1: FloatField, # 
type: ignore + lev: IntFieldIJ, # type: ignore + ): + self._lagrangian_contributions_interp( + km=self._km, + not_exit_loop=self._not_exit_loop, + INDEX_LM1=self._INDEX_LM1, + INDEX_LP0=self._INDEX_LP0, + q=q, + pe1=pe1, + pe2=pe2, + q4_1=q4_1, + q4_2=q4_2, + q4_3=q4_3, + q4_4=q4_4, + dp1=dp1, + lev=lev, + ) + + class MapSingle: """ Fortran name is map_single, test classes are Map1_PPM_2d, Map_Scalar_2d @@ -91,6 +326,7 @@ def __init__( kord: int, mode: int, dims: Sequence[str], + interpolate_contribution: bool = False, ): orchestrate( obj=self, @@ -117,7 +353,7 @@ def make_quantity(): units="unknown", dtype=Float, ) - self._lev = quantity_factory.zeros([X_DIM, Y_DIM], units="", dtype=int) + self._lev = quantity_factory.zeros([X_DIM, Y_DIM], units="", dtype=Int) self._copy_stencil = stencil_factory.from_dims_halo( copy_defn, @@ -137,10 +373,14 @@ def make_quantity(): dims=dims, ) - self._lagrangian_contributions = stencil_factory.from_dims_halo( - lagrangian_contributions, - compute_dims=dims, - ) + if interpolate_contribution: + self._lagrangian_contributions = LagrangianContributionInterpolated( + stencil_factory, quantity_factory, dims + ) + else: + self._lagrangian_contributions = LagrangianContribution( + stencil_factory, dims + ) @property def i_extent(self): @@ -180,7 +420,7 @@ def __call__( self._q4_3, self._q4_4, self._dp1, - qmin, + Float(qmin), ) else: self._remap_profile( @@ -190,17 +430,17 @@ def __call__( self._q4_3, self._q4_4, self._dp1, - qmin, + Float(qmin), ) + self._lagrangian_contributions( - q1, - pe1, - pe2, - self._q4_1, - self._q4_2, - self._q4_3, - self._q4_4, - self._dp1, - self._lev, + q=q1, + pe1=pe1, + pe2=pe2, + q4_1=self._q4_1, + q4_2=self._q4_2, + q4_3=self._q4_3, + q4_4=self._q4_4, + dp1=self._dp1, + lev=self._lev, ) - return q1 diff --git a/pyFV3/stencils/mapn_tracer.py b/pyFV3/stencils/mapn_tracer.py index 0696d145..36a40dd0 100644 --- a/pyFV3/stencils/mapn_tracer.py +++ b/pyFV3/stencils/mapn_tracer.py @@ -1,11 +1,10 @@ -from typing import Dict - -import ndsl.dsl.gt4py_utils as utils -from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate +from ndsl import QuantityFactory, StencilFactory, orchestrate from ndsl.constants import X_DIM, Y_DIM, Z_DIM from ndsl.dsl.typing import Float, FloatField from pyFV3.stencils.fillz import FillNegativeTracerValues from pyFV3.stencils.map_single import MapSingle +from pyFV3.tracers import TracersType +from dace import nounroll class MapNTracer: @@ -18,53 +17,52 @@ def __init__( stencil_factory: StencilFactory, quantity_factory: QuantityFactory, kord: int, - nq: int, fill: bool, - tracers: Dict[str, Quantity], + tracers: TracersType, ): orchestrate( obj=self, config=stencil_factory.config.dace_config, dace_compiletime_args=["tracers"], ) - self._nq = int(nq) self._qs = quantity_factory.zeros( [X_DIM, Y_DIM, Z_DIM], units="unknown", dtype=Float, ) - kord_tracer = [kord] * self._nq - kord_tracer[5] = 9 # qcld - - self._list_of_remap_objects = [ - MapSingle( - stencil_factory, - quantity_factory, - kord_tracer[i], - 0, - dims=[X_DIM, Y_DIM, Z_DIM], - ) - for i in range(len(kord_tracer)) - ] + self._map_single = MapSingle( + stencil_factory, + quantity_factory, + kord, + 0, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + self._map_single_kord9 = MapSingle( + stencil_factory, + quantity_factory, + 9, + 0, + dims=[X_DIM, Y_DIM, Z_DIM], + ) if fill: self._fill_negative_tracers = True self._fillz = FillNegativeTracerValues( stencil_factory, quantity_factory, - self._nq, - tracers, ) else: self._fill_negative_tracers = False + 
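The `INDEX_LM1` / `INDEX_LP0` bookkeeping above follows the comment in `lagrangian_contributions_interp`: by carrying an offset field equal to `-k`, a relative access like `q[0, 0, INDEX_LM1]` always lands on level 0 of the column, and integer arithmetic on the offset then addresses any fixed level. A plain-NumPy picture of the idea (illustration only, not gt4py code):

```python
import numpy as np

nz = 6
k = np.arange(nz)
index_lm1 = -k                       # at level k, an offset of index_lm1[k] points back to level 0

q = np.linspace(10.0, 60.0, nz)      # stand-in column
for kk in range(nz):
    assert q[kk + index_lm1[kk]] == q[0]          # "absolute" access to the column top
    assert q[kk + index_lm1[kk] + 2] == q[2]      # ... and to any other fixed level
print("offset field:", index_lm1)
```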
self._index_cloud = tracers.index("cloud") + def __call__( self, pe1: FloatField, pe2: FloatField, dp2: FloatField, - tracers: Dict[str, Quantity], + tracers: TracersType, ): """ Remaps the tracer species onto the Eulerian grid @@ -77,8 +75,12 @@ def __call__( dp2 (in): Difference in pressure between Eulerian levels tracers (inout): tracers to be remapped """ - for i, q in enumerate(utils.tracer_variables[0 : self._nq]): - self._list_of_remap_objects[i](tracers[q], pe1, pe2, self._qs) + for i_tracer in nounroll(range(tracers.shape[3])): + if i_tracer != self._index_cloud: + self._map_single( + tracers.quantity.data[:, :, :, i_tracer], pe1, pe2, self._qs + ) + self._map_single_kord9(tracers.cloud, pe1, pe2, self._qs) if self._fill_negative_tracers is True: self._fillz(dp2, tracers) diff --git a/pyFV3/stencils/moist_cv.py b/pyFV3/stencils/moist_cv.py index c571a24e..a8c53242 100644 --- a/pyFV3/stencils/moist_cv.py +++ b/pyFV3/stencils/moist_cv.py @@ -1,6 +1,8 @@ import gt4py.cartesian.gtscript as gtscript from gt4py.cartesian.gtscript import ( __INLINED, + BACKWARD, + FORWARD, PARALLEL, computation, exp, @@ -9,7 +11,7 @@ ) import ndsl.constants as constants -from ndsl.dsl.typing import Float, FloatField +from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ @gtscript.function @@ -20,9 +22,10 @@ def set_cappa(qvapor, cvm, r_vir): @gtscript.function def moist_cvm(qvapor, gz, ql, qs): + # CK : GEOS applies the "max" function to tracer values cvm = ( - (1.0 - (qvapor + gz)) * constants.CV_AIR - + qvapor * constants.CV_VAP + (1.0 - (max(qvapor, 0.0) + gz)) * constants.CV_AIR + + max(qvapor, 0.0) * constants.CV_VAP + ql * constants.C_LIQ + qs * constants.C_ICE ) @@ -38,8 +41,9 @@ def moist_cv_nwat6_fn( qice: FloatField, qgraupel: FloatField, ): - ql = qliquid + qrain - qs = qice + qsnow + qgraupel + # CK : GEOS applies the "max" function to tracer values + ql = max(qliquid, 0.0) + max(qrain, 0.0) + qs = max(qice, 0.0) + max(qsnow, 0.0) + max(qgraupel, 0.0) gz = ql + qs cvm = moist_cvm(qvapor, gz, ql, qs) return cvm, gz @@ -88,7 +92,6 @@ def moist_pt_last_step( qsnow: FloatField, qice: FloatField, qgraupel: FloatField, - gz: FloatField, pt: FloatField, pkz: FloatField, dtmp: Float, @@ -102,7 +105,6 @@ def moist_pt_last_step( qsnow (in): qice (in): qgraupel (in): - gz (out): pt (inout): pkz (in): dtmp (in): @@ -134,9 +136,6 @@ def moist_pkz( qsnow: FloatField, qice: FloatField, qgraupel: FloatField, - q_con: FloatField, - gz: FloatField, - cvm: FloatField, pkz: FloatField, pt: FloatField, cappa: FloatField, @@ -152,9 +151,6 @@ def moist_pkz( qsnow (in): qice (in): qgraupel (in): - q_con (out): - gz (out): - cvm (out): pkz (out): pt (in): cappa (out): @@ -167,11 +163,116 @@ def moist_pkz( cvm, gz = moist_cv_nwat6_fn( qvapor, qliquid, qrain, qsnow, qice, qgraupel ) # if (nwat == 6) else moist_cv_default_fn(constants.CV_AIR) - q_con[0, 0, 0] = gz + # q_con[0, 0, 0] = gz cappa = set_cappa(qvapor, cvm, r_vir) pkz = compute_pkz_func(delp, delz, pt, cappa) +def moist_te( + qvapor: FloatField, + qliquid: FloatField, + qrain: FloatField, + qsnow: FloatField, + qice: FloatField, + qgraupel: FloatField, + u: FloatField, + v: FloatField, + w: FloatField, + te: FloatFieldIJ, + pt: FloatField, + phis: FloatField, + delp: FloatField, + rsin2: FloatFieldIJ, + cosa_s: FloatFieldIJ, + hs: FloatFieldIJ, + delz: FloatField, + grav: Float, +): + """ + Args: + qvapor (in): + qliquid (in): + qrain (in): + qsnow (in): + qice (in): + qgraupel (in): + u (in): + v (in): + w (in): + te (out): + pt (in): + phis (in): + 
delp (in): + rsin2 (in): + cosa_s (in): + hs (in): + """ + with computation(FORWARD), interval(-1, None): + te = 0.0 + phis = hs + with computation(BACKWARD), interval(0, -1): + phis = phis[0, 0, 1] - grav * delz + with computation(FORWARD), interval(0, -1): + cvm, _gz = moist_cv_nwat6_fn(qvapor, qliquid, qrain, qsnow, qice, qgraupel) + + te = te + delp * ( + cvm * pt + + 0.5 + * ( + phis + + phis[0, 0, 1] + + w ** 2.0 + + 0.5 + * rsin2 + * ( + u ** 2.0 + + u[0, 1, 0] ** 2.0 + + v ** 2.0 + + v[1, 0, 0] ** 2.0 + - (u + u[0, 1, 0]) * (v + v[1, 0, 0]) * cosa_s + ) + ) + ) + + +def te_zsum( + te_2d: FloatFieldIJ, + te0_2d: FloatFieldIJ, + delp: FloatField, + pkz: FloatField, + zsum1: FloatFieldIJ, +): + with computation(FORWARD): + with interval(0, 1): + te_2d = te0_2d - te_2d + zsum1 = pkz * delp + + with interval(1, None): + zsum1 = zsum1 + pkz * delp + + +def cond_output( + q_con: FloatField, + qliquid: FloatField, + qrain: FloatField, + qsnow: FloatField, + qice: FloatField, + qgraupel: FloatField, +): + with computation(PARALLEL), interval(...): + q_con = 0.0 + if qliquid > 0.0: + q_con = q_con + qliquid + if qice > 0.0: + q_con = q_con + qice + if qrain > 0.0: + q_con = q_con + qrain + if qsnow > 0.0: + q_con = q_con + qsnow + if qgraupel > 0.0: + q_con = q_con + qgraupel + + def fv_setup( qvapor: FloatField, qliquid: FloatField, diff --git a/pyFV3/stencils/nh_p_grad.py b/pyFV3/stencils/nh_p_grad.py index 03988d0d..bf540255 100644 --- a/pyFV3/stencils/nh_p_grad.py +++ b/pyFV3/stencils/nh_p_grad.py @@ -172,7 +172,7 @@ def __init__( z_dim=Z_INTERFACE_DIM, replace=True, ) - self.a2b_kbuffer = AGrid2BGridFourthOrder( + self.a2b_kinterface = AGrid2BGridFourthOrder( stencil_factory, quantity_factory=quantity_factory, grid_data=grid_data, @@ -245,7 +245,7 @@ def __call__( self.a2b_k1(pp, self._tmp_wk1) self.a2b_k1(pk3, self._tmp_wk1) - self.a2b_kbuffer(gz, self._tmp_wk1) + self.a2b_kinterface(gz, self._tmp_wk1) self.a2b_kstandard(delp, self._tmp_wk1) self._set_k0_and_calc_wk_stencil(pp, pk3, self._tmp_wk, top_value) diff --git a/pyFV3/stencils/pk3_halo.py b/pyFV3/stencils/pk3_halo.py index 6daf945a..20436ac9 100644 --- a/pyFV3/stencils/pk3_halo.py +++ b/pyFV3/stencils/pk3_halo.py @@ -1,4 +1,12 @@ -from gt4py.cartesian.gtscript import FORWARD, computation, horizontal, interval, region +from gt4py.cartesian.gtscript import ( + FORWARD, + computation, + horizontal, + interval, + region, + log, + exp, +) from ndsl import QuantityFactory, StencilFactory from ndsl.constants import X_DIM, Y_DIM @@ -29,7 +37,7 @@ def edge_pe_update( region[local_is - 2 : local_ie + 3, local_je + 1 : local_je + 3], ): pe = pe + delp[0, 0, -1] - pk3 = pe ** akap + pk3 = exp(akap * log(pe)) class PK3Halo: diff --git a/pyFV3/stencils/ray_fast.py b/pyFV3/stencils/ray_fast.py index dbc082d4..96523da9 100644 --- a/pyFV3/stencils/ray_fast.py +++ b/pyFV3/stencils/ray_fast.py @@ -5,6 +5,7 @@ FORWARD, PARALLEL, computation, + float64, horizontal, interval, log, @@ -14,11 +15,17 @@ import ndsl.constants as constants from ndsl import StencilFactory, orchestrate -from ndsl.constants import X_INTERFACE_DIM, Y_INTERFACE_DIM, Z_DIM +from ndsl.boilerplate import get_factories_single_tile +from ndsl.constants import ( + SECONDS_PER_DAY, + X_INTERFACE_DIM, + Y_INTERFACE_DIM, + Z_DIM, + X_DIM, + Y_DIM, +) from ndsl.dsl.typing import Float, FloatField, FloatFieldK - - -SDAY = 86400.0 +import numpy as np # NOTE: The fortran version of this computes rf in the first timestep only. 
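Rewriting `pe ** akap` as `exp(akap * log(pe))` in `pk3_halo` does not change the mathematics (the two forms are identical for positive `pe`); it changes which low-level routine evaluates the power, which can matter when precision-matching the Fortran reference. Quick standalone check:

```python
import numpy as np

pe = np.float32(52_500.0)        # an interface pressure, Pa
akap = np.float32(2.0 / 7.0)     # kappa

pow_form = pe ** akap                        # previous form of the stencil
explog_form = np.exp(akap * np.log(pe))      # new form: exp(akap * log(pe))

print(float(pow_form), float(explog_form), bool(pow_form == explog_form))
```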
Then @@ -36,7 +43,7 @@ def compute_rf_vals(pfull, bdt, rf_cutoff, tau0, ptop): @gtscript.function def compute_rff_vals(pfull, dt, rf_cutoff, tau0, ptop): rffvals = compute_rf_vals(pfull, dt, rf_cutoff, tau0, ptop) - rffvals = 1.0 / (1.0 + rffvals) + rffvals = float64(1.0) / (float64(1.0) + rffvals) return rffvals @@ -45,14 +52,31 @@ def dm_layer(rf, dp, wind): return (1.0 - rf) * dp * wind +def ray_fast_damping_increment( + pfull: FloatFieldK, # type:ignore + dt: Float, # type:ignore + ptop: Float, # type:ignore + rf: FloatField, # type:ignore +): + """rf is rayleigh damping increment, fraction of vertical velocity + left after doing rayleigh damping (w -> w * rf) + """ + from __externals__ import rf_cutoff, tau + + with computation(PARALLEL), interval(...): + if pfull < rf_cutoff: + # rf is rayleigh damping increment, fraction of vertical velocity + # left after doing rayleigh damping (w -> w * rf) + rf = compute_rff_vals(pfull, dt, rf_cutoff, tau * SECONDS_PER_DAY, ptop) + + def ray_fast_wind_compute( u: FloatField, v: FloatField, w: FloatField, delta_p_ref: FloatFieldK, # reference delta pressure pfull: FloatFieldK, # input layer pressure reference? - dt: Float, - ptop: Float, + rf: FloatFieldK, rf_cutoff_nudge: Float, ): """ @@ -70,13 +94,6 @@ def ray_fast_wind_compute( from __externals__ import hydrostatic, local_ie, local_je, rf_cutoff, tau # dm_stencil - with computation(PARALLEL), interval(...): - # TODO -- in the fortran model rf is only computed once, repeating - # the computation every time ray_fast is run is inefficient - if pfull < rf_cutoff: - # rf is rayleigh damping increment, fraction of vertical velocity - # left after doing rayleigh damping (w -> w * rf) - rf = compute_rff_vals(pfull, dt, rf_cutoff, tau * SDAY, ptop) with computation(FORWARD): with interval(0, 1): if pfull < rf_cutoff_nudge: @@ -155,14 +172,26 @@ class RayleighDamping: Fortran name: ray_fast. 
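Per the NOTE above, the damping profile depends only on static inputs, so the Fortran computes it once; the new `ray_fast_damping_increment` stencil plus the one-shot initialization flag reproduce that behaviour. A minimal caching sketch in plain Python (the profile function here is a placeholder, not the real `compute_rf_vals`):

```python
import numpy as np

def _rf_vals(pfull, dt, rf_cutoff, tau, ptop):
    # Placeholder profile: damp only above the cutoff. The real formula lives
    # in compute_rf_vals / compute_rff_vals in ray_fast.py.
    return np.where(pfull < rf_cutoff, dt / tau, 0.0)

class RayleighIncrement:
    """Compute rf = 1 / (1 + rf_vals) on the first call only, then reuse it."""
    def __init__(self):
        self._rf = None

    def __call__(self, pfull, dt, rf_cutoff, tau, ptop):
        if self._rf is None:                       # lazy, one-time initialization
            self._rf = 1.0 / (1.0 + _rf_vals(pfull, dt, rf_cutoff, tau, ptop))
        return self._rf

damp = RayleighIncrement()
pfull = np.geomspace(10.0, 100_000.0, 32)          # Pa, top of atmosphere downward
rf = damp(pfull, dt=225.0, rf_cutoff=750.0, tau=5.0 * 86_400.0, ptop=10.0)
print(rf[:3], rf[-1])       # damped near the top, untouched (rf = 1) below the cutoff
```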
""" - def __init__(self, stencil_factory: StencilFactory, rf_cutoff, tau, hydrostatic): + def __init__( + self, + stencil_factory: StencilFactory, + rf_cutoff: Float, + tau: Float, + hydrostatic: bool, + ): orchestrate(obj=self, config=stencil_factory.config.dace_config) grid_indexing = stencil_factory.grid_indexing - self._rf_cutoff = rf_cutoff + self._rf_cutoff = Float(rf_cutoff) origin, domain = grid_indexing.get_origin_domain( [X_INTERFACE_DIM, Y_INTERFACE_DIM, Z_DIM] ) + if tau == 0: + raise NotImplementedError( + "Dynamical Core (fv_dynamics): RayleighDamping," + " with tau <= 0, is not implemented" + ) + ax_offsets = grid_indexing.axis_offsets(origin, domain) local_axis_offsets = {} for axis_offset_name, axis_offset_value in ax_offsets.items(): @@ -175,12 +204,35 @@ def __init__(self, stencil_factory: StencilFactory, rf_cutoff, tau, hydrostatic) domain=domain, externals={ "hydrostatic": hydrostatic, - "rf_cutoff": rf_cutoff, + "rf_cutoff": self._rf_cutoff, "tau": tau, **local_axis_offsets, }, ) + # We compute the damping increment once using a trick to write a + # FloatFieldK as a (1, 1, K) 3D writable Field + _K_stencil_factory, K_quantity_factory = get_factories_single_tile( + 1, + 1, + domain[2], + 0, + stencil_factory.backend, + ) + self._ray_fast_damping_increment = stencil_factory.from_origin_domain( + ray_fast_damping_increment, + origin=(0, 0, origin[2]), + domain=(1, 1, domain[2]), + externals={ + "rf_cutoff": self._rf_cutoff, + "tau": tau, + }, + ) + self._damping_increment = K_quantity_factory.ones( + [X_DIM, Y_DIM, Z_DIM], units="n/a" + ) + self._initialize_damping_increment = np.ones((1,), dtype=int) + def __call__( self, u: FloatField, @@ -191,15 +243,31 @@ def __call__( dt: Float, ptop: Float, ): - rf_cutoff_nudge = self._rf_cutoff + min(100.0, 10.0 * ptop) + """ + Args: + u (inout) + v (inout) + w (inout) + dp (in) + pfull (in) + dt (in) + ptop (in) + """ + rf_cutoff_nudge = self._rf_cutoff + min(Float(100.0), Float(10.0) * ptop) + # TODO: this is a bad fix to go around an orchestration issue + # on compile-time values. Do better. 
+ if self._initialize_damping_increment[0] == 1: + self._ray_fast_damping_increment( + pfull=pfull, dt=dt, ptop=ptop, rf=self._damping_increment + ) + self._initialize_damping_increment[0] = 0 self._ray_fast_wind_compute( - u, - v, - w, - dp, - pfull, - dt, - ptop, - rf_cutoff_nudge, + u=u, + v=v, + w=w, + delta_p_ref=dp, + pfull=pfull, + rf=self._damping_increment.field[0, 0, :], + rf_cutoff_nudge=rf_cutoff_nudge, ) diff --git a/pyFV3/stencils/remap_profile.py b/pyFV3/stencils/remap_profile.py index f37d32d3..5e5cc41e 100644 --- a/pyFV3/stencils/remap_profile.py +++ b/pyFV3/stencils/remap_profile.py @@ -691,5 +691,5 @@ def __call__( self._ext5, self._ext6, self._extm, - qmin, + Float(qmin), ) diff --git a/pyFV3/stencils/remapping.py b/pyFV3/stencils/remapping.py index a14c38a1..82406501 100644 --- a/pyFV3/stencils/remapping.py +++ b/pyFV3/stencils/remapping.py @@ -1,7 +1,6 @@ -from typing import Dict, Optional +from typing import List, Optional, no_type_check from gt4py.cartesian.gtscript import ( - __INLINED, BACKWARD, FORWARD, PARALLEL, @@ -13,7 +12,7 @@ region, ) -from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate +from ndsl import QuantityFactory, StencilFactory, orchestrate from ndsl.constants import ( X_DIM, X_INTERFACE_DIM, @@ -31,6 +30,10 @@ from pyFV3.stencils.mapn_tracer import MapNTracer from pyFV3.stencils.moist_cv import moist_pt_func, moist_pt_last_step from pyFV3.stencils.saturation_adjustment import SatAdjust3d +from pyFV3.tracers import TracersType + + +# from pyFV3.tracers import Tracers # TODO: Should this be set here or in global_constants? @@ -98,8 +101,10 @@ def moist_cv_pt_pressure( bk: FloatFieldK, dp2: FloatField, ps: FloatFieldIJ, + pn1: FloatField, pn2: FloatField, peln: FloatField, + remap_t: bool, r_vir: Float, ): """ @@ -125,12 +130,14 @@ def moist_cv_pt_pressure( ps (out): pn2 (out): peln (in): + remap_t (in): + r_vir (in): """ - from __externals__ import hydrostatic, kord_tm # moist_cv.moist_pt with computation(PARALLEL), interval(0, -1): - if __INLINED(kord_tm < 0): + # if __INLINED(kord_tm < 0): + if remap_t: cvm, gz, q_con, cappa, pt = moist_pt_func( qvapor, qliquid, @@ -145,9 +152,11 @@ def moist_cv_pt_pressure( delz, r_vir, ) - # delz_adjust - if __INLINED(not hydrostatic): - delz = -delz / delp + # NOTE : GEOS does not perform the delz computation at this location + # # delz_adjust + # if __INLINED(not hydrostatic): + # delz = -delz / delp + # pressure_updates with computation(FORWARD): with interval(-1, None): @@ -155,23 +164,28 @@ def moist_cv_pt_pressure( with computation(PARALLEL): with interval(0, 1): pn2 = peln + pn1 = peln # TODO: refactor the pe2 = ptop assignment from # previous stencil into this one, and remove # pe2 from the other stencil with interval(1, -1): pe2 = ak + bk * ps + pn1 = peln with interval(-1, None): pn2 = peln + pn1 = peln with computation(BACKWARD), interval(0, -1): dp2 = pe2[0, 0, 1] - pe2 - # copy_stencil - with computation(PARALLEL), interval(0, -1): - delp = dp2 + + # # NOTE : GEOS doesn't perform the delp calcuation at this location + # # copy_stencil + # # with computation(PARALLEL), interval(0, -1): + # # delp = dp2 def pn2_pk_delp( - dp2: FloatField, - delp: FloatField, + # dp2: FloatField, + # delp: FloatField, pe2: FloatField, pn2: FloatField, pk: FloatField, @@ -186,18 +200,26 @@ def pn2_pk_delp( pk (out): """ with computation(PARALLEL), interval(...): - delp = dp2 + # NOTE : GEOS doesn't perform the delp calcuation at this location + # Also, in moist_cv_pt_pressure, the below calculation is 
also done + # delp = dp2 pn2 = log(pe2) pk = exp(akap * pn2) +def pe0_ptop_xmax(pe0: FloatField, ptop: Float): + with computation(PARALLEL), interval(0, 1): + pe0 = ptop + + def pressures_mapu( pe: FloatField, - pe1: FloatField, + # pe1: FloatField, ak: FloatFieldK, bk: FloatFieldK, pe0: FloatField, pe3: FloatField, + ptop: Float, ): """ Args: @@ -211,18 +233,20 @@ def pressures_mapu( with computation(BACKWARD): with interval(-1, None): pe_bottom = pe - pe1_bottom = pe + # pe1_bottom = pe with interval(0, -1): pe_bottom = pe_bottom[0, 0, 1] - pe1_bottom = pe1_bottom[0, 0, 1] + # pe1_bottom = pe1_bottom[0, 0, 1] with computation(FORWARD): with interval(0, 1): - pe0 = pe + # pe0 = pe + pe0 = ptop with interval(1, None): - pe0 = 0.5 * (pe[0, -1, 0] + pe1) + # pe0 = 0.5 * (pe[0, -1, 0] + pe1) + pe0 = 0.5 * (pe[0, -1, 0] + pe) with computation(FORWARD), interval(...): bkh = 0.5 * bk - pe3 = ak + bkh * (pe_bottom[0, -1, 0] + pe1_bottom) + pe3 = ak + bkh * (pe_bottom[0, -1, 0] + pe_bottom) def pressures_mapv( @@ -244,8 +268,9 @@ def pressures_mapv( pe_bottom = pe_bottom[0, 0, 1] with computation(FORWARD): with interval(0, 1): - pe3 = ak - pe0 = pe + bkh = 0.5 * bk + pe3 = ak + bkh * (pe_bottom[-1, 0, 0] + pe_bottom) + # pe0 = pe with interval(1, None): bkh = 0.5 * bk pe0 = 0.5 * (pe[-1, 0, 0] + pe) @@ -281,6 +306,51 @@ def copy_from_below(a: FloatField, b: FloatField): b = a[0, 0, -1] +def pe_pk_delp_peln( + pe: FloatField, + pk: FloatField, + delp: FloatField, + peln: FloatField, + pe2: FloatField, + pk2: FloatField, + pn2: FloatField, + ak: FloatFieldK, + bk: FloatFieldK, + akap: Float, + ptop: Float, +): + with computation(BACKWARD): + with interval(-1, None): + pe_bottom = pe + with interval(0, -1): + pe_bottom = pe_bottom[0, 0, 1] + + with computation(PARALLEL): + with interval(0, 1): + pe2 = ptop + pn2 = peln + pk2 = pk + with interval(1, -1): + pe2 = ak + bk * pe_bottom + pn2 = log(pe2) + pk2 = exp(akap * pn2) + with interval(-1, None): + pe2 = pe + pn2 = peln + pk2 = pk + + with computation(PARALLEL): + with interval(0, -1): + pe = pe2 + pk = pk2 + delp = pe2[0, 0, 1] - pe2[0, 0, 0] + peln = pn2 + with interval(-1, None): + pe = pe2 + pk = pk2 + peln = pn2 + + class LagrangianToEulerian: """ Fortran name is Lagrangian_to_Eulerian @@ -292,9 +362,9 @@ def __init__( quantity_factory: QuantityFactory, config: RemappingConfig, area_64, - nq, pfull, - tracers: Dict[str, Quantity], + tracers: TracersType, + exclude_tracers: List[str], checkpointer: Optional[Checkpointer] = None, ): orchestrate( @@ -314,7 +384,6 @@ def __init__( raise NotImplementedError("Hydrostatic is not implemented") self._t_min = 184.0 - self._nq = nq # do_omega = hydrostatic and last_step # TODO pull into inputs self._domain_jextra = ( grid_indexing.domain[0], @@ -375,6 +444,13 @@ def __init__( self._do_sat_adjust = config.do_sat_adj + self._remap_t = False + + # NOTE: In GEOS, remap_t is set to True in general + # Add in the "remap_option" check later + if True: + self._remap_t = True + self.kmp = grid_indexing.domain[2] - 1 for k in range(pfull.shape[0]): if pfull.view[k] > 10.0e2: @@ -387,7 +463,8 @@ def __init__( self._moist_cv_pt_pressure = stencil_factory.from_origin_domain( moist_cv_pt_pressure, - externals={"kord_tm": config.kord_tm, "hydrostatic": hydrostatic}, + # externals={"kord_tm": config.kord_tm, "hydrostatic": hydrostatic}, + externals={"hydrostatic": hydrostatic}, origin=grid_indexing.origin_compute(), domain=grid_indexing.domain_compute(add=(0, 0, 1)), ) @@ -410,9 +487,9 @@ def __init__( stencil_factory, 
quantity_factory, abs(config.kord_tr), - nq, fill=config.fill, tracers=tracers, + exclude_tracers=exclude_tracers, ) self._map_single_w = MapSingle( @@ -516,9 +593,10 @@ def __init__( domain=grid_indexing.domain_compute(), ) + @no_type_check def __call__( self, - tracers: Dict[str, Quantity], + tracers: TracersType, pt: FloatField, delp: FloatField, delz: FloatField, @@ -528,7 +606,6 @@ def __call__( w: FloatField, cappa: FloatField, q_con: FloatField, - q_cld: FloatField, pkz: FloatField, pk: FloatField, pe: FloatField, @@ -562,7 +639,6 @@ def __call__( va (inout): A-grid y-velocity cappa (inout): Power to raise pressure to q_con (out): Total condensate mixing ratio - q_cld (out): Cloud fraction pkz (in): Layer mean pressure raised to the power of Kappa pk (out): Interface pressure raised to power of kappa, final acoustic value pe (in): Pressure at layer edges @@ -593,12 +669,12 @@ def __call__( # pe2 is final Eulerian edge pressures self._moist_cv_pt_pressure( - tracers["qvapor"], - tracers["qliquid"], - tracers["qrain"], - tracers["qsnow"], - tracers["qice"], - tracers["qgraupel"], + tracers["vapor"], + tracers["liquid"], + tracers["rain"], + tracers["snow"], + tracers["ice"], + tracers["graupel"], q_con, pt, cappa, @@ -612,6 +688,7 @@ def __call__( ps, self._pn2, peln, + self._remap_t, zvir, ) @@ -625,6 +702,8 @@ def __call__( self._map_single_w(w, self._pe1, self._pe2, qs=wsd) self._map_single_delz(delz, self._pe1, self._pe2) + # W_limiter routine will go here + self._undo_delz_adjust_and_copy_peln(delp, delz, peln, self._pe0, self._pn2) # if do_omega: # NOTE untested # pe3 = copy(omga, origin=(grid_indexing.isc, grid_indexing.jsc, 1)) @@ -633,12 +712,12 @@ def __call__( # it clear the outputs are not needed until then? # or, are its outputs actually used? can we delete this stencil call? 
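After the winds and tracers are remapped, `pe_pk_delp_peln` (defined earlier in this diff) rebuilds the derived pressure variables from the new interface pressures. Stripped of the stencil machinery, the relationships are simply the following (standalone NumPy, illustrative values):

```python
import numpy as np

KAPPA = 2.0 / 7.0   # Rd / cp for dry air (approximate)

pe = np.array([100.0, 5_000.0, 30_000.0, 70_000.0, 101_325.0])   # interface pressure, Pa
peln = np.log(pe)                  # log of interface pressure
pk = np.exp(KAPPA * peln)          # pe**KAPPA, written as exp(kappa * log(pe))
delp = np.diff(pe)                 # layer thickness: pe[k+1] - pe[k]

print(delp)
print(pk.round(3))
```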
self._moist_cv_pkz( - tracers["qvapor"], - tracers["qliquid"], - tracers["qrain"], - tracers["qsnow"], - tracers["qice"], - tracers["qgraupel"], + tracers["vapor"], + tracers["liquid"], + tracers["rain"], + tracers["snow"], + tracers["ice"], + tracers["graupel"], q_con, self._gz, self._cvm, @@ -683,13 +762,13 @@ def __call__( fast_mp_consv = consv_te > CONSV_MIN self._saturation_adjustment( dp1, - tracers["qvapor"], - tracers["qliquid"], - tracers["qice"], - tracers["qrain"], - tracers["qsnow"], - tracers["qgraupel"], - q_cld, + tracers["vapor"], + tracers["liquid"], + tracers["ice"], + tracers["rain"], + tracers["snow"], + tracers["graupel"], + tracers["cloud"], hs, peln, delp, @@ -711,12 +790,12 @@ def __call__( # to the physics, but if we're staying in dynamics we need # to keep it as the virtual potential temperature self._moist_cv_last_step_stencil( - tracers["qvapor"], - tracers["qliquid"], - tracers["qrain"], - tracers["qsnow"], - tracers["qice"], - tracers["qgraupel"], + tracers["vapor"], + tracers["liquid"], + tracers["rain"], + tracers["snow"], + tracers["ice"], + tracers["graupel"], self._gz, pt, pkz, diff --git a/pyFV3/stencils/remapping_GEOS.py b/pyFV3/stencils/remapping_GEOS.py new file mode 100644 index 00000000..2376f4d9 --- /dev/null +++ b/pyFV3/stencils/remapping_GEOS.py @@ -0,0 +1,637 @@ +from gt4py.cartesian.gtscript import FORWARD, computation, interval + +from ndsl import QuantityFactory, StencilFactory, orchestrate +from ndsl.comm.communicator import Communicator +from ndsl.constants import ( + CV_AIR, + GRAV, + X_DIM, + X_INTERFACE_DIM, + Y_DIM, + Y_INTERFACE_DIM, + Z_DIM, + Z_INTERFACE_DIM, +) +from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, FloatFieldIJ64, FloatFieldK +from ndsl.grid import GridData +from ndsl.stencils.basic_operations import adjust_divide_stencil +from pyFV3._config import RemappingConfig +from pyFV3.mpi.sum import GlobalSum +from pyFV3.stencils import moist_cv +from pyFV3.stencils.map_single import MapSingle +from pyFV3.stencils.mapn_tracer import MapNTracer +from pyFV3.stencils.moist_cv import moist_pt_last_step +from pyFV3.stencils.remapping import ( + CONSV_MIN, + init_pe, + moist_cv_pt_pressure, + pe0_ptop_xmax, + pe_pk_delp_peln, + pn2_pk_delp, + pressures_mapu, + pressures_mapv, +) +from pyFV3.stencils.saturation_adjustment import SatAdjust3d +from pyFV3.stencils.scale_delz import rescale_delz_1, rescale_delz_2 +from pyFV3.stencils.w_fix_consrv_moment import W_fix_consrv_moment +from pyFV3.tracers import TracersType + + +def _normalize_to_grid_stencil( + te_2d: FloatFieldIJ, zsum_2d: FloatFieldIJ, area: FloatFieldIJ64 +): + with computation(FORWARD), interval(0, 1): + te_2d = te_2d * area + zsum_2d = zsum_2d * area + + +class LagrangianToEulerian_GEOS: + """ + GEOS v11.4.2 remapping - derived from original fvcore. 
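`LagrangianToEulerian_GEOS`, like the original routine, remaps layer-mean quantities from the deformed Lagrangian interfaces onto the Eulerian reference interfaces while conserving the mass-weighted integral. A deliberately simplified single-column sketch using piecewise-constant reconstruction (the real code uses piecewise-parabolic profiles with kord-dependent limiters):

```python
import numpy as np

def conservative_remap_1d(pe_src, q_src, pe_tgt):
    """Piecewise-constant conservative remap of a layer-mean quantity q from
    source pressure interfaces pe_src to target interfaces pe_tgt."""
    q_tgt = np.zeros(len(pe_tgt) - 1)
    for k in range(len(pe_tgt) - 1):
        lo, hi = pe_tgt[k], pe_tgt[k + 1]
        acc = 0.0
        for j in range(len(pe_src) - 1):
            overlap = max(0.0, min(hi, pe_src[j + 1]) - max(lo, pe_src[j]))
            acc += q_src[j] * overlap
        q_tgt[k] = acc / (hi - lo)
    return q_tgt

pe_src = np.array([100.0, 300.0, 600.0, 1000.0])   # deformed Lagrangian interfaces, Pa
pe_tgt = np.array([100.0, 400.0, 700.0, 1000.0])   # Eulerian reference interfaces, Pa
q_src = np.array([1.0, 2.0, 3.0])
q_tgt = conservative_remap_1d(pe_src, q_src, pe_tgt)

# the mass-weighted integral of q is conserved by construction
assert np.isclose((q_src * np.diff(pe_src)).sum(), (q_tgt * np.diff(pe_tgt)).sum())
print(q_tgt)
```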
+ + Fortran name is Lagrangian_to_Eulerian + """ + + def __init__( + self, + stencil_factory: StencilFactory, + quantity_factory: QuantityFactory, + config: RemappingConfig, + comm: Communicator, + grid_data: GridData, + nq, + pfull, + tracers: TracersType, + adiabatic: bool, + ): + orchestrate( + obj=self, + config=stencil_factory.config.dace_config, + dace_compiletime_args=["tracers"], + ) + self._comm = comm + self._stencil_factory = stencil_factory + grid_indexing = stencil_factory.grid_indexing + + # Configuration + hydrostatic = config.hydrostatic + if hydrostatic: + raise NotImplementedError("Hydrostatic is not implemented") + + if adiabatic: + raise NotImplementedError("Adiabatic is not implemented") + + self._t_min = Float(184.0) + self._nq = nq + self._w_max = Float(90.0) + self._w_min = Float(-60.0) + self._area_64 = grid_data.area_64 + self._cosa_s = grid_data.cosa_s + self._rsin2 = grid_data.rsin2 + self._kord_tm = abs(config.kord_tm) + self._kord_wz = config.kord_wz + self._kord_mt = config.kord_mt + self._do_sat_adjust = config.do_sat_adj + self._adiabatic = adiabatic + self.kmp = grid_indexing.domain[2] - 1 + for k in range(pfull.shape[0]): + if pfull.view[k] > 10.0e2: + self.kmp = k + break + # do_omega = hydrostatic and last_step # TODO pull into inputs + + # Quantities + self._pe1 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="Pa", + dtype=Float, + ) + self._pe2 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="Pa", + dtype=Float, + ) + self._pe3 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="Pa", + dtype=Float, + ) + self._dp2 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="Pa", + dtype=Float, + ) + self._pn1 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="Pa", + dtype=Float, + ) + self._pn2 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="Pa", + dtype=Float, + ) + self._pe0 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="Pa", + dtype=Float, + ) + self._pe3 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="Pa", + dtype=Float, + ) + + self._gz = quantity_factory.zeros( + [X_DIM, Y_DIM], + units="m^2 s^-2", + dtype=Float, + ) + self._cvm = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._compute_performed = quantity_factory.zeros( + [X_DIM, Y_DIM], + units="mask", + dtype=bool, + ) + self._w2 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="temp W", + dtype=Float, + ) + self._pk2 = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="Pa", + dtype=Float, + ) + + self._te_2d = quantity_factory.zeros( + [X_DIM, Y_DIM], + units="Pa", + dtype=Float, + ) + + self._zsum1 = quantity_factory.zeros( + [X_DIM, Y_DIM], + units="Pa", + dtype=Float, + ) + self._phis = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_INTERFACE_DIM], + units="n/a", + dtype=Float, + ) + + # Stencils + + self._global_sum = GlobalSum( + communicator=comm, + quantity_factory=quantity_factory, + grid_indexing=stencil_factory.grid_indexing, + ) + + self._init_pe = stencil_factory.from_origin_domain( + init_pe, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 1, 1)), + ) + + self._moist_cv_pt_pressure = stencil_factory.from_origin_domain( + moist_cv_pt_pressure, + externals={"hydrostatic": hydrostatic}, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 0, 1)), + ) + + self._pn2_pk_delp = stencil_factory.from_origin_domain( + pn2_pk_delp, + 
origin=grid_indexing.origin_compute(add=(0, 0, 1)), + domain=grid_indexing.domain_compute(add=(0, 0, -1)), + ) + + self._map_single_pt = MapSingle( + stencil_factory, + quantity_factory, + self._kord_tm, + mode=1, + dims=[X_DIM, Y_DIM, Z_DIM], + interpolate_contribution=True, + ) + + self._mapn_tracer = MapNTracer( + stencil_factory, + quantity_factory, + kord=abs(config.kord_tr), + fill=config.fill, + tracers=tracers, + ) + + self._map_single_w = MapSingle( + stencil_factory, + quantity_factory, + self._kord_wz, + mode=-2, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + self._map_single_delz = MapSingle( + stencil_factory, + quantity_factory, + self._kord_wz, + mode=1, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + self._moist_cv_pkz = stencil_factory.from_origin_domain( + moist_cv.moist_pkz, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._pressures_mapu = stencil_factory.from_origin_domain( + pressures_mapu, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 1, 1)), + ) + + self._map_single_u = MapSingle( + stencil_factory, + quantity_factory, + self._kord_mt, + mode=-1, + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + ) + + self._pressures_mapv = stencil_factory.from_origin_domain( + pressures_mapv, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(1, 0, 1)), + ) + + self._map_single_v = MapSingle( + stencil_factory, + quantity_factory, + self._kord_mt, + mode=-1, + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + ) + + self._saturation_adjustment = SatAdjust3d( + stencil_factory, config.sat_adjust, self._area_64, self.kmp + ) + + self._moist_cv_last_step_stencil = stencil_factory.from_origin_domain( + moist_pt_last_step, + origin=(grid_indexing.isc, grid_indexing.jsc, 0), + domain=( + grid_indexing.domain[0], + grid_indexing.domain[1], + grid_indexing.domain[2] + 1, + ), + ) + + self._fill_cond = stencil_factory.from_origin_domain( + moist_cv.cond_output, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._adjust_divide = stencil_factory.from_origin_domain( + adjust_divide_stencil, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._rescale_delz_1 = stencil_factory.from_origin_domain( + rescale_delz_1, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._rescale_delz_2 = stencil_factory.from_origin_domain( + rescale_delz_2, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._w_fix_consrv_moment = stencil_factory.from_origin_domain( + func=W_fix_consrv_moment, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._pe0_ptop_xmax = stencil_factory.from_origin_domain( + pe0_ptop_xmax, + origin=( + grid_indexing.n_halo + grid_indexing.domain[0], + grid_indexing.n_halo, + 0, + ), + domain=(1, grid_indexing.domain[1] + 1, 1), + ) + self._pe_pk_delp_peln = stencil_factory.from_origin_domain( + pe_pk_delp_peln, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 0, 1)), + ) + self._moist_cv_te = stencil_factory.from_origin_domain( + moist_cv.moist_te, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(add=(0, 0, 1)), + ) + + self._te_zsum = stencil_factory.from_origin_domain( + moist_cv.te_zsum, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + self._normalize_to_grid = 
stencil_factory.from_origin_domain( + _normalize_to_grid_stencil, + origin=grid_indexing.origin_compute(), + domain=grid_indexing.domain_compute(), + ) + + def __call__( + self, + tracers: TracersType, + pt: FloatField, # type: ignore + delp: FloatField, # type: ignore + delz: FloatField, # type: ignore + peln: FloatField, # type: ignore + u: FloatField, # type: ignore + v: FloatField, # type: ignore + w: FloatField, # type: ignore + mfx: FloatField, # type: ignore + mfy: FloatField, # type: ignore + cx: FloatField, # type: ignore + cy: FloatField, # type: ignore + cappa: FloatField, # type: ignore + q_con: FloatField, # type: ignore + pkz: FloatField, # type: ignore + pk: FloatField, # type: ignore + pe: FloatField, # type: ignore + hs: FloatFieldIJ, # type: ignore + te0_2d: FloatFieldIJ, # type: ignore + ps: FloatFieldIJ, # type: ignore + wsd: FloatFieldIJ, # type: ignore + ak: FloatFieldK, # type: ignore + bk: FloatFieldK, # type: ignore + dp1: FloatField, # type: ignore + ptop: Float, # type: ignore + akap: Float, # type: ignore + zvir: Float, # type: ignore + last_step: bool, + consv_te: Float, # type: ignore + mdt: Float, # type: ignore + ): + """ + Remap the deformed Lagrangian surfaces onto the reference, or "Eulerian", + coordinate levels. + + Args: + tracers (inout): Tracer species tracked across + pt (inout): D-grid potential temperature + delp (inout): Pressure Thickness + delz (in): Vertical thickness of atmosphere layers + peln (inout): Logarithm of interface pressure + u (inout): D-grid x-velocity + v (inout): D-grid y-velocity + w (inout): Vertical velocity + ua (inout): A-grid x-velocity + va (inout): A-grid y-velocity + cappa (inout): Power to raise pressure to + q_con (out): Total condensate mixing ratio + pkz (in): Layer mean pressure raised to the power of Kappa + pk (out): Interface pressure raised to power of kappa, final acoustic value + pe (in): Pressure at layer edges + hs (in): Surface geopotential + te0_2d (inout): Atmosphere total energy in columns + ps (out): Surface pressure + wsd (in): Vertical velocity of the lowest level + omga (unused): Vertical pressure velocity + ak (in): Atmosphere hybrid a coordinate (Pa) + bk (in): Atmosphere hybrid b coordinate (dimensionless) + pfull (in): Pressure full levels + dp1 (out): Pressure thickness before dyn_core (only written + if do_sat_adjust=True) + ptop (in): The pressure level at the top of atmosphere + akap (in): Poisson constant (KAPPA) + zvir (in): Constant (Rv/Rd-1) + last_step (in): Flag for the last step of k-split remapping + consv_te (in): If True, conserve total energy + mdt (in) : Remap time step + bdt (in): Timestep + """ + # Global structure: + # pe1 is initial lagrangian edge pressures + # pe2 is final Eulerian edge pressures + + # Build remapping profiles + self._init_pe(pe, self._pe1, self._pe2, ptop) + self._moist_cv_pt_pressure( + qvapor=tracers.vapor, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + q_con=q_con, + pt=pt, + cappa=cappa, + delp=delp, + delz=delz, + pe=pe, + pe2=self._pe2, + ak=ak, + bk=bk, + dp2=self._dp2, + ps=ps, + pn1=self._pn1, + pn2=self._pn2, + peln=peln, + remap_t=True, + r_vir=zvir, + ) + self._pn2_pk_delp( + pe2=self._pe2, + pn2=self._pn2, + pk=self._pk2, + akap=akap, + ) + + # Now that we have the pressure profiles, we can start remapping + + # Map pressure + self._map_single_pt( + pt, + self._pn1, + self._pn2, + qmin=self._t_min, + ) + + # Map all tracers + self._mapn_tracer(self._pe1, self._pe2, 
self._dp2, tracers) + + # Map vertical wind + self._map_single_w(w, self._pe1, self._pe2, qs=wsd) + self._rescale_delz_1(delz, delp) + self._map_single_delz(delz, self._pe1, self._pe2) + self._rescale_delz_2(delz, self._dp2) + self._w_fix_consrv_moment( + w=w, + w2=self._w2, + dp2=self._dp2, + gz=self._gz, + w_max=self._w_max, + w_min=self._w_min, + compute_performed=self._compute_performed, + ) + + # Map horizontal winds, fluxes and courant number + self._pressures_mapu(pe, ak, bk, self._pe0, self._pe3, ptop) + self._pe0_ptop_xmax(self._pe0, ptop) + self._map_single_u(u, self._pe0, self._pe3) + self._map_single_u(mfy, self._pe0, self._pe3) + self._map_single_u(cy, self._pe0, self._pe3) + + self._pressures_mapv(pe, ak, bk, self._pe0, self._pe3) + self._map_single_v(v, self._pe0, self._pe3) + self._map_single_v(mfx, self._pe0, self._pe3) + self._map_single_v(cx, self._pe0, self._pe3) + + self._pe_pk_delp_peln( + pe=pe, + pk=pk, + delp=delp, + peln=peln, + pe2=self._pe2, + pk2=self._pk2, + pn2=self._pn2, + ak=ak, + bk=bk, + akap=akap, + ptop=ptop, + ) + + self._moist_cv_pkz( + qvapor=tracers.vapor, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + pkz=pkz, + pt=pt, + cappa=cappa, + delp=delp, + delz=delz, + r_vir=zvir, + ) + + dtmp = 0.0 + if last_step: + if consv_te > CONSV_MIN: + self._moist_cv_te( + qvapor=tracers.vapor, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + u=u, + v=v, + w=w, + te=self._te_2d, + pt=pt, + phis=self._phis, + delp=delp, + rsin2=self._rsin2, + cosa_s=self._cosa_s, + hs=hs, + delz=delz, + grav=GRAV, + ) + + self._te_zsum( + te_2d=self._te_2d, + te0_2d=te0_2d, + delp=delp, + pkz=pkz, + zsum1=self._zsum1, + ) + + # We can normalize to the same array because + # they are properly reset in the above stencils + self._normalize_to_grid(self._te_2d, self._zsum1, self._area_64) + + tesum: Float = self._global_sum(self._te_2d) + zsum: Float = self._global_sum(self._zsum1) + dtmp = tesum / (CV_AIR * zsum) + + elif consv_te < -CONSV_MIN: + raise NotImplementedError( + "Unimplemented/untested case consv(" + + str(consv_te) + + ") < -CONSV_MIN(" + + str(-CONSV_MIN) + + ")" + ) + + if self._do_sat_adjust: + fast_mp_consv = consv_te > CONSV_MIN + self._saturation_adjustment( + dp1, + tracers.vapor, + tracers.liquid, + tracers.ice, + tracers.rain, + tracers.snow, + tracers.graupel, + tracers.cloud, + hs, + peln, + delp, + delz, + q_con, + pt, + pkz, + cappa, + zvir, + mdt, + fast_mp_consv, + last_step, + akap, + self.kmp, + ) + + if last_step and not self._adiabatic: + # on the last step, we need the regular temperature to send + # to the physics, but if we're staying in dynamics we need + # to keep it as the virtual potential temperature + self._moist_cv_last_step_stencil( + qvapor=tracers.vapor, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + pt=pt, + pkz=pkz, + dtmp=dtmp, + r_vir=zvir, + ) + self._fill_cond( + q_con=q_con, + qliquid=tracers.liquid, + qrain=tracers.rain, + qsnow=tracers.snow, + qice=tracers.ice, + qgraupel=tracers.graupel, + ) + else: + # converts virtual temperature back to virtual potential temperature + self._adjust_divide(pkz, pt) diff --git a/pyFV3/stencils/riem_solver3.py b/pyFV3/stencils/riem_solver3.py index 1f915ae6..1e32eebb 100644 --- a/pyFV3/stencils/riem_solver3.py +++ b/pyFV3/stencils/riem_solver3.py @@ -1,6 +1,6 @@ -import math import 
typing +import numpy as np from gt4py.cartesian.gtscript import ( __INLINED, BACKWARD, @@ -162,7 +162,7 @@ def __init__( grid_indexing = stencil_factory.grid_indexing self._sim1_solve = Sim1Solver( stencil_factory, - config.p_fac, + Float(config.p_fac), n_halo=0, ) orchestrate( @@ -215,7 +215,7 @@ def __init__( ) self._finalize_stencil = stencil_factory.from_origin_domain( finalize, - externals={"use_logp": config.use_logp, "beta": config.beta}, + externals={"use_logp": config.use_logp, "beta": Float(config.beta)}, origin=riemorigin, domain=domain, ) @@ -284,9 +284,9 @@ def __call__( # gm2 is gamma (cp/cv) # dz2 is delz - peln1 = math.log(ptop) + peln1 = np.log(ptop, dtype=Float) # ptk = ptop ** kappa - ptk = math.exp(constants.KAPPA * peln1) + ptk = np.exp(constants.KAPPA * peln1, dtype=Float) self._precompute_stencil( delp, diff --git a/pyFV3/stencils/riem_solver_c.py b/pyFV3/stencils/riem_solver_c.py index df909165..7390405a 100644 --- a/pyFV3/stencils/riem_solver_c.py +++ b/pyFV3/stencils/riem_solver_c.py @@ -65,7 +65,7 @@ def precompute( dz = gz[0, 0, 1] - gz with computation(PARALLEL), interval(...): gm = 1.0 / (1.0 - cappa) - dm /= constants.GRAV + dm *= constants.RGRAV with computation(PARALLEL), interval(0, -1): # (1) From \partial p*/\partial z = -\rho g, we can separate and integrate # over a layer to get @@ -187,7 +187,7 @@ def __init__( ) self._sim1_solve = Sim1Solver( stencil_factory, - p_fac, + Float(p_fac), n_halo=1, ) self._finalize_stencil = stencil_factory.from_origin_domain( diff --git a/pyFV3/stencils/scale_delz.py b/pyFV3/stencils/scale_delz.py new file mode 100644 index 00000000..24aa6b0e --- /dev/null +++ b/pyFV3/stencils/scale_delz.py @@ -0,0 +1,19 @@ +from gt4py.cartesian.gtscript import PARALLEL, computation, interval + +from ndsl.dsl.typing import FloatField + + +def rescale_delz_1( + delz: FloatField, + delp: FloatField, +): + with computation(PARALLEL), interval(...): + delz = -delz / delp + + +def rescale_delz_2( + delz: FloatField, + dp: FloatField, +): + with computation(PARALLEL), interval(...): + delz = -delz * dp diff --git a/pyFV3/stencils/sim1_solver.py b/pyFV3/stencils/sim1_solver.py index 0fcd8f28..0bc1566f 100644 --- a/pyFV3/stencils/sim1_solver.py +++ b/pyFV3/stencils/sim1_solver.py @@ -199,8 +199,8 @@ def __call__( # TODO: email Lucas about any remaining variable naming here - t1g = 2.0 * dt * dt - rdt = 1.0 / dt + t1g = Float(2.0) * dt * dt + rdt = Float(1.0) / dt self._compute_sim1_solve( w, delta_mass, diff --git a/pyFV3/stencils/tracer_2d_1l.py b/pyFV3/stencils/tracer_2d_1l.py index 27514a37..6a4c5552 100644 --- a/pyFV3/stencils/tracer_2d_1l.py +++ b/pyFV3/stencils/tracer_2d_1l.py @@ -1,16 +1,25 @@ -import math -from typing import Dict +import dace +from typing import no_type_check +import numpy as np import gt4py.cartesian.gtscript as gtscript -from gt4py.cartesian.gtscript import PARALLEL, computation, horizontal, interval, region +from gt4py.cartesian.gtscript import ( + PARALLEL, + computation, + horizontal, + interval, + region, + int32, +) from ndsl import ( - Quantity, QuantityFactory, StencilFactory, WrappedHaloUpdater, orchestrate, + Quantity, ) +from ndsl.grid import GridData from ndsl.constants import ( N_HALO_DEFAULT, X_DIM, @@ -19,9 +28,12 @@ Y_INTERFACE_DIM, Z_DIM, ) -from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ -from ndsl.typing import Communicator +from ndsl.dsl.dace.orchestration import dace_inhibitor +from ndsl.dsl.typing import FloatField, FloatFieldIJ, FloatFieldK +from ndsl.comm.communicator import 
Communicator, ReductionOperator from pyFV3.stencils.fvtp2d import FiniteVolumeTransport +from pyFV3.tracers import TracersType +from ndsl.utils import safe_assign_array @gtscript.function @@ -46,6 +58,7 @@ def flux_y(cy, dya, dx, sin_sg4, sin_sg2, yfx): return yfx +@no_type_check def flux_compute( cx: FloatField, cy: FloatField, @@ -80,6 +93,7 @@ def flux_compute( yfx = flux_y(cy, dya, dx, sin_sg4, sin_sg2, yfx) +@no_type_check def divide_fluxes_by_n_substeps( cxd: FloatField, xfx: FloatField, @@ -87,10 +101,11 @@ def divide_fluxes_by_n_substeps( cyd: FloatField, yfx: FloatField, mfyd: FloatField, - n_split: int, + cmax: FloatFieldK, ): """ - Divide all inputs in-place by the number of substeps n_split. + Divide all inputs in-place by the number of substeps n_split computed + from the max courant number on the grid Args: cxd (inout): @@ -101,27 +116,18 @@ def divide_fluxes_by_n_substeps( mfyd (inout): """ with computation(PARALLEL), interval(...): - frac = 1.0 / n_split - cxd = cxd * frac - xfx = xfx * frac - mfxd = mfxd * frac - cyd = cyd * frac - yfx = yfx * frac - mfyd = mfyd * frac - - -def cmax_stencil1(cx: FloatField, cy: FloatField, cmax: FloatField): - with computation(PARALLEL), interval(...): - cmax = max(abs(cx), abs(cy)) - - -def cmax_stencil2( - cx: FloatField, cy: FloatField, sin_sg5: FloatField, cmax: FloatField -): - with computation(PARALLEL), interval(...): - cmax = max(abs(cx), abs(cy)) + 1.0 - sin_sg5 - - + n_split = int32(1.0 + cmax) + if n_split > 1: + frac = 1.0 / n_split + cxd = cxd * frac + xfx = xfx * frac + mfxd = mfxd * frac + cyd = cyd * frac + yfx = yfx * frac + mfyd = mfyd * frac + + +@no_type_check def apply_mass_flux( dp1: FloatField, x_mass_flux: FloatField, @@ -140,11 +146,15 @@ def apply_mass_flux( with computation(PARALLEL), interval(...): dp2 = ( dp1 - + (x_mass_flux - x_mass_flux[1, 0, 0] + y_mass_flux - y_mass_flux[0, 1, 0]) + + ( + (x_mass_flux - x_mass_flux[1, 0, 0]) + + (y_mass_flux - y_mass_flux[0, 1, 0]) + ) * rarea ) +@no_type_check def apply_tracer_flux( q: FloatField, dp1: FloatField, @@ -152,6 +162,8 @@ def apply_tracer_flux( fy: FloatField, rarea: FloatFieldIJ, dp2: FloatField, + cmax: FloatFieldK, + current_nsplit: int, ): """ Args: @@ -163,7 +175,8 @@ def apply_tracer_flux( dp2 (in): """ with computation(PARALLEL), interval(...): - q = (q * dp1 + (fx - fx[1, 0, 0] + fy - fy[0, 1, 0]) * rarea) / dp2 + if current_nsplit < int32(1.0 + cmax): + q = (q * dp1 + ((fx - fx[1, 0, 0]) + (fy - fy[0, 1, 0])) * rarea) / dp2 # Simple stencil replacing: @@ -171,6 +184,7 @@ def apply_tracer_flux( # dp1[:] = dp2 # dp2[:] = self._tmp_dp2 # Because dpX can be a quantity or an array +@no_type_check def swap_dp(dp1: FloatField, dp2: FloatField): with computation(PARALLEL), interval(...): tmp = dp1 @@ -183,6 +197,16 @@ class TracerAdvection: Performs horizontal advection on tracers. Corresponds to tracer_2D_1L in the Fortran code. 
+ + Args: + stencil_factory: Stencil maker built on the required grid + quantity_factory: Quantity maker built on the required grid + transport: The Finite Volume to be applied to each tracers + grid_data: Metric Terms for the grid + comm: Communicator on the grid + tracers: Bundle of data of tracers to be advected + exclude_tracers: Tracers to not be advected + update_mass_courant: update the mass and courant numbers """ def __init__( @@ -190,9 +214,10 @@ def __init__( stencil_factory: StencilFactory, quantity_factory: QuantityFactory, transport: FiniteVolumeTransport, - grid_data, + grid_data: GridData, comm: Communicator, - tracers: Dict[str, Quantity], + tracers: TracersType, + update_mass_courant: bool = True, ): orchestrate( obj=self, @@ -201,38 +226,54 @@ def __init__( ) grid_indexing = stencil_factory.grid_indexing self.grid_indexing = grid_indexing # needed for selective validation - self._tracer_count = len(tracers) self.grid_data = grid_data + self._update_mass_courant = update_mass_courant + + if not self._update_mass_courant: + self._tmp_mfx = quantity_factory.zeros( + [X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + ) + self._tmp_mfy = quantity_factory.zeros( + [X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + ) + self._tmp_cx = quantity_factory.zeros( + [X_INTERFACE_DIM, Y_DIM, Z_DIM], + units="unknown", + ) + self._tmp_cy = quantity_factory.zeros( + [X_DIM, Y_INTERFACE_DIM, Z_DIM], + units="unknown", + ) self._x_area_flux = quantity_factory.zeros( [X_INTERFACE_DIM, Y_DIM, Z_DIM], units="unknown", - dtype=Float, ) self._y_area_flux = quantity_factory.zeros( [X_DIM, Y_INTERFACE_DIM, Z_DIM], units="unknown", - dtype=Float, ) self._x_flux = quantity_factory.zeros( [X_INTERFACE_DIM, Y_INTERFACE_DIM, Z_DIM], units="unknown", - dtype=Float, ) self._y_flux = quantity_factory.zeros( [X_INTERFACE_DIM, Y_INTERFACE_DIM, Z_DIM], units="unknown", - dtype=Float, ) self._tmp_dp = quantity_factory.zeros( [X_DIM, Y_DIM, Z_DIM], units="Pa", - dtype=Float, ) self._tmp_dp2 = quantity_factory.zeros( [X_DIM, Y_DIM, Z_DIM], units="Pa", - dtype=Float, + ) + self._cmax = quantity_factory.zeros( + [Z_DIM], + units="unitless", ) ax_offsets = grid_indexing.axis_offsets( @@ -276,21 +317,24 @@ def __init__( ) self.finite_volume_transport: FiniteVolumeTransport = transport - # Setup halo updater for tracers - tracer_halo_spec = quantity_factory.get_quantity_halo_spec( - dims=[X_DIM, Y_DIM, Z_DIM], - n_halo=N_HALO_DEFAULT, - dtype=Float, - ) + # Halo exchange of all tracers self._tracers_halo_updater = WrappedHaloUpdater( - comm.get_scalar_halo_updater([tracer_halo_spec] * self._tracer_count), - tracers, - [t for t in tracers.keys()], + comm.get_scalar_halo_updater([tracers.quantity.halo_spec(N_HALO_DEFAULT)]), + {"tracers": tracers.quantity}, + ["tracers"], + ) + + # Setup tracer courant max reduction calculation + self._compute_cmax = TracerCMax( + stencil_factory=stencil_factory, + quantity_factory=quantity_factory, + grid_data=grid_data, + comm=comm, ) def __call__( self, - tracers: Dict[str, Quantity], + tracers: TracersType, dp1, x_mass_flux, y_mass_flux, @@ -312,17 +356,25 @@ def __call__( x_courant (inout): accumulated courant number in x-direction y_courant (inout): accumulated courant number in y-direction """ - # DaCe parsing issue - # if len(tracers) != self._tracer_count: - # raise ValueError( - # f"incorrect number of tracers, {self._tracer_count} was " - # f"specified on init but {len(tracers)} were passed" - # ) - # start HALO update on q (in dyn_core in fortran -- just has started 
when - # this function is called...) + + if self._update_mass_courant: + working_x_mass_flux = x_mass_flux + working_y_mass_flux = y_mass_flux + working_x_courant = x_courant + working_y_courant = y_courant + else: + self._tmp_mfx.data = x_mass_flux + self._tmp_mfy.data = y_mass_flux + self._tmp_cx.data = x_courant + self._tmp_cy.data = y_courant + working_x_mass_flux = self._tmp_mfx + working_y_mass_flux = self._tmp_mfy + working_x_courant = self._tmp_cx + working_y_courant = self._tmp_cy + self._flux_compute( - x_courant, - y_courant, + working_x_courant, + working_y_courant, self.grid_data.dxa, self.grid_data.dya, self.grid_data.dx, @@ -331,72 +383,57 @@ def __call__( self.grid_data.sin_sg2, self.grid_data.sin_sg3, self.grid_data.sin_sg4, - # TODO: rename xfx/yfx to "area flux" self._x_area_flux, self._y_area_flux, ) - # # TODO for if we end up using the Allreduce and compute cmax globally - # (or locally). For now, hardcoded. - # split = int(grid_indexing.domain[2] / 6) - # self._cmax_1( - # cxd, cyd, self._tmp_cmax, origin=grid_indexing.origin_compute(), - # domain=(grid_indexing.domain[0], self.grid_indexing.domain[1], split) - # ) - # self._cmax_2( - # cxd, - # cyd, - # self.grid.sin_sg5, - # self._tmp_cmax, - # origin=(grid_indexing.isc, self.grid_indexing.jsc, split), - # domain=( - # grid_indexing.domain[0], - # self.grid_indexing.domain[1], - # grid_indexing.domain[2] - split + 1 - # ), - # ) - # cmax_flat = np.amax(self._tmp_cmax, axis=(0, 1)) - # # cmax_flat is a gt4py storage still, but of dimension [npz+1]... - - # cmax_max_all_ranks = cmax_flat.data - # # TODO mpi allreduce... - # # comm.Allreduce(cmax_flat, cmax_max_all_ranks, op=MPI.MAX) - - cmax_max_all_ranks = 2.0 - n_split = math.floor(1.0 + cmax_max_all_ranks) - # NOTE: cmax is not usually a single value, it varies with k, if return to - # that, make n_split a column as well - - if n_split > 1.0: - self._divide_fluxes_by_n_substeps( - x_courant, - self._x_area_flux, - x_mass_flux, - y_courant, - self._y_area_flux, - y_mass_flux, - n_split, - ) + self._compute_cmax( + cx=working_x_courant, + cy=working_y_courant, + cmax=self._cmax, + ) + + self._divide_fluxes_by_n_substeps( + cxd=working_x_courant, + xfx=self._x_area_flux, + mfxd=working_x_mass_flux, + cyd=working_y_courant, + yfx=self._y_area_flux, + mfyd=working_y_mass_flux, + cmax=self._cmax, + ) self._tracers_halo_updater.update() dp2 = self._tmp_dp - for it in range(n_split): - last_call = it == n_split - 1 + # The original algorithm works on K level independantly + # (from with a K loop) and therefore compute `nsplit` + # per K + # The stencil nature of the framework doesn't allow for it + # because after advection, an halo exchange need to be carried + # (or else we could just move the test within the stencil). 
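As a minimal illustration of the per-K-level substepping discussed in the surrounding comments: the snippet below is a hypothetical NumPy stand-in, not the GT4Py/DaCe implementation; the names `cmax_per_level` and `advect_one_substep` are invented for the sketch, and the guard on `current_nsplit` mirrors the one in `apply_tracer_flux`.

```python
import numpy as np

def advect_with_substeps(q, cmax_per_level, advect_one_substep):
    """Illustrative only: Courant-limited substepping per K level.

    q:                  tracer field, shape (nx, ny, nz)
    cmax_per_level:     max Courant number per K level, shape (nz,)
    advect_one_substep: hypothetical callable advancing one level by one substep
    """
    # One substep per unit of Courant number, never fewer than one.
    n_split = np.floor(1.0 + cmax_per_level).astype(int)
    for current_nsplit in range(int(n_split.max())):
        for k in range(q.shape[2]):
            # Only levels that still need another substep are updated.
            if current_nsplit < n_split[k]:
                q[:, :, k] = advect_one_substep(q[:, :, k], k)
    return q
```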
+ # We overcompute to retain true parallelization, by running + # a loop on the highest number of nsplit, but restraining + # actual update in `apply_tracer_flux` to only the valid + # K level for each tracers + max_n_split = int(1.0 + self._compute_cmax.max_over_column) + for current_nsplit in range(max_n_split): + last_call = current_nsplit == max_n_split - 1 # tracer substep self._apply_mass_flux( dp1, - x_mass_flux, - y_mass_flux, + working_x_mass_flux, + working_y_mass_flux, self.grid_data.rarea, dp2, ) - for q in tracers.values(): + for i_tracer in dace.nounroll(range(tracers.shape[3])): + q = tracers.quantity.data[:, :, :, i_tracer] self.finite_volume_transport( q, - x_courant, - y_courant, + working_x_courant, + working_y_courant, self._x_area_flux, self._y_area_flux, self._x_flux, @@ -411,9 +448,107 @@ def __call__( self._y_flux, self.grid_data.rarea, dp2, + cmax=self._cmax, + current_nsplit=current_nsplit, ) if not last_call: self._tracers_halo_updater.update() # we can't use variable assignment to avoid a data copy # because of current dace limitations self._swap_dp(dp1, dp2) + + +@no_type_check +def cmax_stencil_low_k( + cx: FloatField, + cy: FloatField, + cmax: FloatField, +): + with computation(PARALLEL), interval(...): + cmax = max(abs(cx), abs(cy)) + + +@no_type_check +def cmax_stencil_high_k( + cx: FloatField, + cy: FloatField, + sin_sg5: FloatFieldIJ, + cmax: FloatField, +): + with computation(PARALLEL), interval(...): + cmax = max(abs(cx), abs(cy)) + 1.0 - sin_sg5 + + +class TracerCMax: + def __init__( + self, + stencil_factory: StencilFactory, + quantity_factory: QuantityFactory, + grid_data: GridData, + comm: Communicator, + ): + """Perform global courant number max. + + The maximum courant number for every atmospheric level on the entire grid. 
+ """ + orchestrate(obj=self, config=stencil_factory.config.dace_config) + self._grid_data = grid_data + self._comm = comm + grid_indexing = stencil_factory.grid_indexing + cmax_atmospheric_level_split = int(grid_indexing.domain[2] / 6) - 1 + self._cmax_low_k = stencil_factory.from_origin_domain( + func=cmax_stencil_low_k, + origin=grid_indexing.origin_compute(), + domain=( + grid_indexing.domain[0], + grid_indexing.domain[1], + cmax_atmospheric_level_split, + ), + ) + self._cmax_high_k = stencil_factory.from_origin_domain( + func=cmax_stencil_high_k, + origin=( + grid_indexing.origin_compute()[0], + grid_indexing.origin_compute()[1], + cmax_atmospheric_level_split, + ), + domain=( + grid_indexing.domain[0], + grid_indexing.domain[1], + grid_indexing.domain[2] - cmax_atmospheric_level_split, + ), + ) + self._tmp_cmax = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + ) + self._tmp_cmax_in_K = quantity_factory.zeros( + [Z_DIM], + units="unknown", + ) + self.max_over_column = 0 + + @dace_inhibitor + def _reduce(self, cmax: Quantity): + if __debug__: + if not isinstance(cmax, Quantity): + raise TypeError( + f"[pyFV3][Tracer]: cmax must be a quantity, got {type(cmax)}" + ) + cmax.data[:] = self._tmp_cmax.data.max(axis=0).max(axis=0)[:] + self._comm.all_reduce_per_element_in_place(cmax, ReductionOperator.MAX) + self.max_over_column = cmax.field.max() + + def __call__(self, cx, cy, cmax): + self._cmax_low_k( + cx=cx, + cy=cy, + cmax=self._tmp_cmax, + ) + self._cmax_high_k( + cx=cx, + cy=cy, + sin_sg5=self._grid_data.sin_sg5, + cmax=self._tmp_cmax, + ) + self._reduce(cmax) diff --git a/pyFV3/stencils/updatedzc.py b/pyFV3/stencils/updatedzc.py index 3b3a76b8..2c546475 100644 --- a/pyFV3/stencils/updatedzc.py +++ b/pyFV3/stencils/updatedzc.py @@ -1,118 +1,135 @@ import gt4py.cartesian.gtscript as gtscript -from gt4py.cartesian.gtscript import BACKWARD, FORWARD, PARALLEL, computation, interval +from gt4py.cartesian.gtscript import ( + BACKWARD, + FORWARD, + PARALLEL, + computation, + horizontal, + interval, + region, +) -import ndsl.constants as constants from ndsl import Quantity, QuantityFactory, StencilFactory from ndsl.constants import X_DIM, Y_DIM, Z_DIM from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ, FloatFieldK from ndsl.stencils import corners -DZ_MIN = constants.DZ_MIN - +def double_copy(q_in: FloatField, copy_1: FloatField, copy_2: FloatField): + with computation(PARALLEL), interval(...): + copy_1 = q_in + copy_2 = q_in -@gtscript.function -def p_weighted_average_top(vel, dp0): - # TODO: ratio is a constant, where should this be placed? 
- ratio = dp0 / (dp0 + dp0[1]) - return vel + (vel - vel[0, 0, 1]) * ratio +def copy(q_in: FloatField, q_copy: FloatField): + with computation(PARALLEL), interval(...): + q_copy = q_in -@gtscript.function -def p_weighted_average_bottom(vel, dp0): - ratio = dp0[-1] / (dp0[-2] + dp0[-1]) - return vel[0, 0, -1] + (vel[0, 0, -1] - vel[0, 0, -2]) * ratio +def compute_weighted_average( + dp_ref: FloatFieldK, + vel: FloatField, + avg: FloatField, +): + """ + Perform a cubic spline interpolation of wind velocity from grid center to grid edge -@gtscript.function -def p_weighted_average_domain(vel, dp0): - int_ratio = 1.0 / (dp0[-1] + dp0) - return (dp0 * vel[0, 0, -1] + dp0[-1] * vel) * int_ratio + Args: + dp_ref(in): layer thickness in Pa + vel(in): grid center wind speed + avg(out: interpolated (grid edge) wind speed + """ + # there's some complexity due to gz being defined on interfaces + # have to interpolate winds to layer interfaces first, using higher-order + with computation(PARALLEL): + with interval(0, 1): + top_ratio = dp_ref / (dp_ref + dp_ref[1]) + avg = vel + (vel - vel[0, 0, 1]) * top_ratio + with interval(1, -1): + int_ratio = 1.0 / (dp_ref[-1] + dp_ref) + avg = (dp_ref * vel[0, 0, -1] + dp_ref[-1] * vel) * int_ratio + with interval(-1, None): + bot_ratio = dp_ref[-1] / (dp_ref[-2] + dp_ref[-1]) + avg = vel[0, 0, -1] + (vel[0, 0, -1] - vel[0, 0, -2]) * bot_ratio -@gtscript.function -def xy_flux(gz_x, gz_y, xfx, yfx): +def compute_fx_fy( + gz_x: FloatField, + gz_y: FloatField, + xfx: FloatField, + yfx: FloatField, + fx: FloatField, + fy: FloatField, +): """ Compute first-order upwind fluxes of gz in x and y directions. Args: - gz_x: gz with corners copied to perform derivatives in x-direction - gz_y: gz with corners copied to perform derivatives in y-direction - xfx (out): contravariant c-grid u-wind interpolated to layer interfaces, + gz_x(in): gz with corners copied to perform derivatives in x-direction + gz_y(in): gz with corners copied to perform derivatives in y-direction + xfx(in): contravariant c-grid u-wind interpolated to layer interfaces, including metric terms to make it a "volume flux" - yfx (out): contravariant c-grid v-wind interpolated to layer interfaces - - Returns: - fx: first-order upwind x-flux of gz - fy: first-order upwind y-flux of gz + yfx(in): contravariant c-grid v-wind interpolated to layer interfaces + fx(out): first-order upwind x-flux of gz + fy(out): first-order upwind y-flux of gz """ - fx = xfx * (gz_x[-1, 0, 0] if xfx > 0.0 else gz_x) - fy = yfx * (gz_y[0, -1, 0] if yfx > 0.0 else gz_y) - return fx, fy - -def double_copy(q_in: FloatField, copy_1: FloatField, copy_2: FloatField): with computation(PARALLEL), interval(...): - copy_1 = q_in - copy_2 = q_in + if xfx > 0.0: + fx = gz_x[-1, 0, 0] + else: + fx = gz_x + fx = xfx * fx + if yfx > 0.0: + fy = gz_y[0, -1, 0] + else: + fy = gz_y + fy = yfx * fy -def update_dz_c( - dp_ref: FloatFieldK, - zs: FloatFieldIJ, - area: FloatFieldIJ, - ut: FloatField, - vt: FloatField, - gz: FloatField, - gz_x: FloatField, + +def compute_gz_ws( gz_y: FloatField, - ws: FloatFieldIJ, - *, + area: FloatFieldIJ, + fx: FloatField, + fy: FloatField, + xfx: FloatField, + yfx: FloatField, + dz_min: Float, dt: Float, + zs: FloatFieldIJ, + ws: FloatFieldIJ, + gz: FloatField, ): """ - Step dz forward on c-grid - Eusures gz is monotonically increasing in z at the end - Args: - dp_ref: - zs: - area: - ut: - vt: - gz: - gz_x: gz with corners copied to perform derivatives in x-direction - gz_y: gz with corners copied to perform 
derivatives in y-direction - ws: lagrangian (parcel-following) surface vertical wind implied by + Compute gz and wd, eusures gz is monotonically increasing in z at the end + + Args + gz_y(in): gz with corners copied to perform derivatives in y-direction + area(in): + fx(in): first-order upwind x-flux of gz + fy(in): first-order upwind y-flux of gz + xfx(in): contravariant c-grid u-wind interpolated to layer interfaces, + including metric terms to make it a "volume flux" + yfx(in): contravariant c-grid v-wind interpolated to layer interfaces + dz_min(in): Controls minimum thickness in NH solver + dt(in): timestep over which to evolve the geopotential height, in seconds + zs(in): surface height in m + ws(out): lagrangian (parcel-following) surface vertical wind implied by lowest-level gz change note that a parcel moving horizontally across terrain will be moving in the vertical (eqn 5.5 in documentation) - dt: + gz(out): geopotential height on model interfaces """ - # there's some complexity due to gz being defined on interfaces - # have to interpolate winds to layer interfaces first, using higher-order - # cubic spline interpolation - with computation(PARALLEL): - with interval(0, 1): - # TODO: inline some or all of these functions - xfx = p_weighted_average_top(ut, dp_ref) - yfx = p_weighted_average_top(vt, dp_ref) - with interval(1, -1): - xfx = p_weighted_average_domain(ut, dp_ref) - yfx = p_weighted_average_domain(vt, dp_ref) - with interval(-1, None): - xfx = p_weighted_average_bottom(ut, dp_ref) - yfx = p_weighted_average_bottom(vt, dp_ref) - # xfx/yfx are now ut/vt interpolated to layer interfaces with computation(PARALLEL), interval(...): - fx, fy = xy_flux(gz_x, gz_y, xfx, yfx) - gz = (gz * area + (fx - fx[1, 0, 0]) + (fy - fy[0, 1, 0])) / ( + gz = (gz_y * area + (fx - fx[1, 0, 0]) + (fy - fy[0, 1, 0])) / ( area + (xfx - xfx[1, 0, 0]) + (yfx - yfx[0, 1, 0]) ) - with computation(FORWARD), interval(-1, None): + with computation(FORWARD), interval(...): rdt = 1.0 / dt ws = (zs - gz) * rdt with computation(BACKWARD), interval(0, -1): - gz_kp1 = gz[0, 0, 1] + DZ_MIN + gz_kp1 = gz[0, 0, 1] + dz_min gz = gz if gz > gz_kp1 else gz_kp1 @@ -123,11 +140,18 @@ def __init__( quantity_factory: QuantityFactory, area: Quantity, dp_ref: Quantity, - grid_type, + grid_type: int, + dz_min: Float, ): + """ + Args: + dz_min: controls minimum thickness in NH solver + """ + grid_indexing = stencil_factory.grid_indexing self._area = area self._grid_type = grid_type + self._dz_min = dz_min # TODO: this is needed because GridData.dp_ref does not have access # to a QuantityFactory, we should add a way to perform operations on # Quantity and persist the QuantityFactory choices @@ -150,6 +174,31 @@ def __init__( units="m**2/s**2", dtype=Float, ) + self._gz_filled = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="m**2/s**2", + dtype=Float, + ) + self._xfx = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._yfx = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._fx = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + self._fy = quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) full_origin = grid_indexing.origin_full() full_domain = grid_indexing.domain_full(add=(0, 0, 1)) self._double_copy_stencil = stencil_factory.from_origin_domain( @@ -157,6 +206,11 @@ def __init__( origin=full_origin, domain=full_domain, ) + self._copy_stencil = 
stencil_factory.from_origin_domain( + copy, + origin=full_origin, + domain=full_domain, + ) ax_offsets = grid_indexing.axis_offsets(full_origin, full_domain) @@ -174,12 +228,26 @@ def __init__( domain=full_domain, ) - self._update_dz_c = stencil_factory.from_origin_domain( - update_dz_c, + self._compute_weighted_average = stencil_factory.from_origin_domain( + compute_weighted_average, + origin=grid_indexing.origin_compute(add=(-1, -1, 0)), + domain=grid_indexing.domain_compute(add=(3, 3, 1)), + ) + + self._compute_flux = stencil_factory.from_origin_domain( + compute_fx_fy, + origin=grid_indexing.origin_compute(add=(-1, -1, 0)), + domain=grid_indexing.domain_compute(add=(3, 3, 1)), + ) + + self._compute_gz_ws = stencil_factory.from_origin_domain( + compute_gz_ws, origin=grid_indexing.origin_compute(add=(-1, -1, 0)), domain=grid_indexing.domain_compute(add=(2, 2, 1)), ) + self.DEBUG_VAR_1 = quantity_factory.zeros([X_DIM, Y_DIM, Z_DIM], "n/a") + def __call__( self, zs: FloatFieldIJ, @@ -190,6 +258,8 @@ def __call__( dt: Float, ): """ + Step dz forward on c-grid + Args: dp_ref: layer thickness in Pa zs: surface height in m @@ -205,20 +275,32 @@ def __call__( self._double_copy_stencil(gz, self._gz_x, self._gz_y) - # TODO(eddied): We pass the same fields 2x to avoid GTC validation errors if self._grid_type < 3: self._fill_corners_x_stencil(self._gz_x, self._gz_x) self._fill_corners_y_stencil(self._gz_y, self._gz_y) - self._update_dz_c( - self._dp_ref, - zs, - self._area, - ut, - vt, - gz, - self._gz_x, - self._gz_y, - ws, + self._compute_weighted_average(dp_ref=self._dp_ref, vel=ut, avg=self._xfx) + self._compute_weighted_average(dp_ref=self._dp_ref, vel=vt, avg=self._yfx) + + self._compute_flux( + gz_x=self._gz_x, + gz_y=self._gz_y, + xfx=self._xfx, + yfx=self._yfx, + fx=self._fx, + fy=self._fy, + ) + + self._compute_gz_ws( + gz_y=self._gz_y, + area=self._area, + fx=self._fx, + fy=self._fy, + xfx=self._xfx, + yfx=self._yfx, + dz_min=self._dz_min, dt=dt, + zs=zs, + ws=ws, + gz=gz, ) diff --git a/pyFV3/stencils/updatedzd.py b/pyFV3/stencils/updatedzd.py index b9c1ccb6..95bca3db 100644 --- a/pyFV3/stencils/updatedzd.py +++ b/pyFV3/stencils/updatedzd.py @@ -3,7 +3,6 @@ import gt4py.cartesian.gtscript as gtscript from gt4py.cartesian.gtscript import BACKWARD, FORWARD, PARALLEL, computation, interval -import ndsl.constants as constants from ndsl import Quantity, QuantityFactory, StencilFactory, orchestrate from ndsl.constants import ( X_DIM, @@ -19,9 +18,6 @@ from pyFV3.stencils.fvtp2d import FiniteVolumeTransport -DZ_MIN = constants.DZ_MIN - - @gtscript.function def _apply_height_advective_flux( height: FloatField, @@ -75,6 +71,7 @@ def apply_height_fluxes( surface_height: FloatFieldIJ, ws: FloatFieldIJ, dt: Float, + dz_min: Float, ): """ Apply all computed fluxes to height profile. 
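A minimal NumPy sketch of the backward-sweep thickness clamp that `apply_height_fluxes` (and its C-grid counterpart in `updatedzc.py`) applies with the new runtime `dz_min` argument. It assumes the usual FV3 convention of interface index 0 at the model top; the function name is invented for illustration.

```python
import numpy as np

def enforce_min_layer_thickness(gz: np.ndarray, dz_min: float) -> np.ndarray:
    """gz: geopotential height on interfaces, shape (..., nz + 1).

    Sweeping upward from the surface guarantees every layer keeps at
    least dz_min of depth, i.e. gz[k] >= gz[k + 1] + dz_min.
    """
    for k in range(gz.shape[-1] - 2, -1, -1):
        gz[..., k] = np.maximum(gz[..., k], gz[..., k + 1] + dz_min)
    return gz
```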
@@ -98,6 +95,7 @@ def apply_height_fluxes( surface_height (in): surface height ws (out): vertical velocity of the lowest level (to keep it at the surface) dt (in): acoustic timestep (seconds) + dz_min(in): controls minimum thickness in NH solver Grid variable inputs: area """ @@ -113,10 +111,11 @@ def apply_height_fluxes( with computation(BACKWARD): with interval(-1, None): - ws = (surface_height - height) / dt + rdt = 1 / dt + ws = (surface_height - height) * rdt with interval(0, -1): # ensure layer thickness exceeds minimum - other = height[0, 0, 1] + DZ_MIN + other = height[0, 0, 1] + dz_min height = height if height > other else other @@ -215,8 +214,20 @@ def __init__( grid_data: GridData, grid_type: int, hord_tm: int, + dz_min: Float, column_namelist, ): + """ + Args: + stencil_factory + quantity_factory + damping_coefficients + grid_data + grid_type + hord_tm + dz_min (in): controls minimum thickness in NH solver + column_namelist + """ orchestrate( obj=self, config=stencil_factory.config.dace_config, @@ -229,6 +240,7 @@ def __init__( raise NotImplementedError( "damp <= 1e-5 in column_namelist is not implemented" ) + self._dz_min = dz_min self._dp_ref = grid_data.dp_ref self._allocate_temporary_storages(quantity_factory) self._gk, self._beta, self._gamma = cubic_spline_interpolation_constants( @@ -381,4 +393,5 @@ def __call__( surface_height, ws, dt, + self._dz_min, ) diff --git a/pyFV3/stencils/w_fix_consrv_moment.py b/pyFV3/stencils/w_fix_consrv_moment.py new file mode 100644 index 00000000..26d5c3e1 --- /dev/null +++ b/pyFV3/stencils/w_fix_consrv_moment.py @@ -0,0 +1,83 @@ +from gt4py.cartesian.gtscript import BACKWARD, FORWARD, PARALLEL, computation, interval + +from ndsl.dsl.typing import BoolFieldIJ, Float, FloatField, FloatFieldIJ + + +def W_fix_consrv_moment( + w: FloatField, + w2: FloatField, + dp2: FloatField, + gz: FloatFieldIJ, + w_max: Float, + w_min: Float, + compute_performed: BoolFieldIJ, +): + """ + Args: + w (in/out): + w2 (in?): + dp2(in): + w_max(in): + w_min(in): + compute_performed: (Internal Temporary), + """ + + with computation(PARALLEL), interval(...): + w2 = w + + with computation(FORWARD): + with interval(0, 1): + compute_performed = False + if w2 > w_max: + gz = (w2 - w_max) * dp2 + w2 = w_max + compute_performed = True + elif w2 < w_min: + gz = (w2 - w_min) * dp2 + w2 = w_min + compute_performed = True + with interval(1, -1): + if compute_performed: + w2 = w2 + gz / dp2 + compute_performed = False + if w2 > w_max: + gz = (w2 - w_max) * dp2 + w2 = w_max + compute_performed = True + elif w2 < w_min: + gz = (w2 - w_min) * dp2 + w2 = w_min + compute_performed = True + + with computation(BACKWARD): + with interval(-1, None): + compute_performed = False + if w2 > w_max: + gz = (w2 - w_max) * dp2 + w2 = w_max + compute_performed = True + elif w2 < w_min: + gz = (w2 - w_min) * dp2 + w2 = w_min + compute_performed = True + with interval(1, -1): + if compute_performed: + w2 = w2 + gz / dp2 + compute_performed = False + if w2 > w_max: + gz = (w2 - w_max) * dp2 + w2 = w_max + compute_performed = True + elif w2 < w_min: + gz = (w2 - w_min) * dp2 + w2 = w_min + compute_performed = True + + with computation(FORWARD), interval(0, 1): + if w2 > (w_max * 2.0): + w2 = w_max * 2.0 + elif w2 < (w_min * 2.0): + w2 = w_min * 2.0 + + with computation(PARALLEL), interval(...): + w = w2 diff --git a/pyFV3/stencils/xppm.py b/pyFV3/stencils/xppm.py index 77d7780e..38e17670 100644 --- a/pyFV3/stencils/xppm.py +++ b/pyFV3/stencils/xppm.py @@ -46,12 +46,17 @@ def fx1_fn(courant, br, 
b0, bl): @gtscript.function def get_advection_mask(bl, b0, br): - from __externals__ import mord + from __externals__ import i_end, i_start, mord if __INLINED(mord == 5): smt5 = bl * br < 0 else: smt5 = (3.0 * abs(b0)) < abs(bl - br) + # Fix edge issues + with horizontal(region[i_start - 1, :], region[i_start, :]): + smt5 = bl * br < 0.0 + with horizontal(region[i_end, :], region[i_end + 1, :]): + smt5 = bl * br < 0.0 if smt5[-1, 0, 0] or smt5[0, 0, 0]: advection_mask = 1.0 @@ -162,10 +167,6 @@ def compute_al(q: FloatField, dxa: FloatFieldIJ): al = ppm.p1 * (q[-1, 0, 0] + q) + ppm.p2 * (q[-2, 0, 0] + q[1, 0, 0]) - if __INLINED(iord < 0): - compile_assert(False) - al = max(al, 0.0) - if __INLINED(grid_type < 3): with horizontal(region[i_start - 1, :], region[i_end, :]): al = ppm.c1 * q[-2, 0, 0] + ppm.c2 * q[-1, 0, 0] + ppm.c3 * q @@ -182,6 +183,9 @@ def compute_al(q: FloatField, dxa: FloatFieldIJ): with horizontal(region[i_start + 1, :], region[i_end + 2, :]): al = ppm.c3 * q[-1, 0, 0] + ppm.c2 * q[0, 0, 0] + ppm.c1 * q[1, 0, 0] + if __INLINED(iord < 0): + al = max(al, 0.0) + return al @@ -273,7 +277,10 @@ def compute_blbr_ord8plus(q: FloatField, dxa: FloatFieldIJ): def compute_x_flux( - q: FloatField, courant: FloatField, dxa: FloatFieldIJ, xflux: FloatField + q: FloatField, + courant: FloatField, + dxa: FloatFieldIJ, + xflux: FloatField, ): """ Args: @@ -296,6 +303,30 @@ def compute_x_flux( class XPiecewiseParabolic: """ Fortran name is xppm + + + `iord` is `hord_dp` which is hord for `ฮดp`, `ฮดz`, where: + + `ฮดp`: Total air mass (including vapor and condensates) + Equal to hydrostatic pressure depth of layer + `ฮดz`: Geometric layer depth (nonhydrostatic) + + Value explainers: + 5: Unlimited โ€œfifth-orderโ€ scheme with weak 2โˆ†x filter; fastest + and least diffusive (โ€œinviscidโ€) + 6: Intermediate-strength 2โˆ†x filter. Gives best ACC and storm + structure but weaker TCs (โ€œminimally-diffusiveโ€) + 8: Lin 2004 monotone PPM constraint (โ€œmonotonicโ€) + 9: Hunyh constraint: more expensive but less diffusive than #8 + -5: #5 with a positive-definite constraint + + Undocumented values implemented in Fortran: 7, 10, 11, 12, 13. + + The code below is capable of: + - Cube-sphere grid (no doubly periodic) + - `iord` == 8 for monotonic behaviors OR + - `iord` 5, 6 + - `iord` must be positive """ def __init__( @@ -310,21 +341,20 @@ def __init__( # Arguments come from: # namelist.grid_type # grid.dxa - if grid_type == 3 or grid_type > 4: - raise NotImplementedError( - "X Piecewise Parabolic (xppm): " - f" grid type {grid_type} not implemented. <3 or 4 available." - ) - - if abs(iord) >= 8 and iord != 8: + available_grid_options = [0, 4] + if grid_type not in available_grid_options: raise NotImplementedError( - "X Piecewise Parabolic (xppm): " - f"iord {iord} != 8 not implemented when >= 8." + "Y Piecewise Parabolic (yppm) configuration: " + f"grid type {grid_type} not implemented. " + f"Options are {available_grid_options}." ) - if iord < 0: + available_iords = [-6, 5, 6, 8] + if iord not in available_iords: raise NotImplementedError( - f"X Piecewise Parabolic (xppm): iord {iord} < 0 not implemented." + "Y Piecewise Parabolic (yppm) configuration: " + f"iord {iord} not implemented. " + f"Options are {available_iords}." ) self._dxa = dxa @@ -372,7 +402,10 @@ def __call__( # were called "get_flux", while the routine which got the flux was called # fx1_fn. The final value was called xflux instead of q_out. 
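For orientation only: the sign of the Courant number decides which cell supplies the flux at an interface. The snippet below is a generic first-order upwind analogue in NumPy (with a periodic wrap purely to keep it short), not the actual PPM flux that `fx1_fn` builds from the `bl`/`br` edge perturbations.

```python
import numpy as np

def first_order_upwind_flux(q: np.ndarray, courant: np.ndarray) -> np.ndarray:
    """q: cell means, shape (n,); courant: Courant number at each
    i-1/2 interface, shape (n,). Positive Courant numbers advect
    q[i - 1] through the interface, negative ones advect q[i]."""
    upwind_value = np.where(courant > 0.0, np.roll(q, 1), q)
    return courant * upwind_value
```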
self._compute_flux_stencil( - q_in, c, self._dxa, q_mean_advected_through_x_interface + q_in, + c, + self._dxa, + q_mean_advected_through_x_interface, ) # bl and br are "edge perturbation values" as in equation 4.1 # of the FV3 documentation diff --git a/pyFV3/stencils/xtp_u.py b/pyFV3/stencils/xtp_u.py index 9b9bd0ac..94e7e34f 100644 --- a/pyFV3/stencils/xtp_u.py +++ b/pyFV3/stencils/xtp_u.py @@ -85,7 +85,7 @@ def advect_u_along_x( bl, br = get_bl_br(u, dx, dxa) b0 = bl + br - cfl = ub_contra * dt * rdx[-1, 0] if ub_contra > 0 else ub_contra * dt * rdx + cfl = ub_contra * rdx[-1, 0] if ub_contra > 0 else ub_contra * rdx fx0 = xppm.fx1_fn(cfl, br, b0, bl) if __INLINED(iord < 8): diff --git a/pyFV3/stencils/yppm.py b/pyFV3/stencils/yppm.py index 129b134b..0b92f661 100644 --- a/pyFV3/stencils/yppm.py +++ b/pyFV3/stencils/yppm.py @@ -46,12 +46,19 @@ def fx1_fn(courant, br, b0, bl): @gtscript.function def get_advection_mask(bl, b0, br): - from __externals__ import mord + from __externals__ import j_end, j_start, mord if __INLINED(mord == 5): smt5 = bl * br < 0 + elif __INLINED(mord == -5): + compile_assert(False) else: smt5 = (3.0 * abs(b0)) < abs(bl - br) + # Fix edge issues + with horizontal(region[:, j_start - 1], region[:, j_start]): + smt5 = bl * br < 0.0 + with horizontal(region[:, j_end], region[:, j_end + 1]): + smt5 = bl * br < 0.0 if smt5[0, -1, 0] or smt5[0, 0, 0]: advection_mask = 1.0 @@ -162,10 +169,6 @@ def compute_al(q: FloatField, dya: FloatFieldIJ): al = ppm.p1 * (q[0, -1, 0] + q) + ppm.p2 * (q[0, -2, 0] + q[0, 1, 0]) - if __INLINED(jord < 0): - compile_assert(False) - al = max(al, 0.0) - if __INLINED(grid_type < 3): with horizontal(region[:, j_start - 1], region[:, j_end]): al = ppm.c1 * q[0, -2, 0] + ppm.c2 * q[0, -1, 0] + ppm.c3 * q @@ -182,6 +185,9 @@ def compute_al(q: FloatField, dya: FloatFieldIJ): with horizontal(region[:, j_start + 1], region[:, j_end + 2]): al = ppm.c3 * q[0, -1, 0] + ppm.c2 * q[0, 0, 0] + ppm.c1 * q[0, 1, 0] + if __INLINED(jord < 0): + al = max(al, 0.0) + return al @@ -273,7 +279,10 @@ def compute_blbr_ord8plus(q: FloatField, dya: FloatFieldIJ): def compute_y_flux( - q: FloatField, courant: FloatField, dya: FloatFieldIJ, yflux: FloatField + q: FloatField, + courant: FloatField, + dya: FloatFieldIJ, + yflux: FloatField, ): """ Args: @@ -296,6 +305,29 @@ def compute_y_flux( class YPiecewiseParabolic: """ Fortran name is yppm + + `jord` is `hord_dp` which is hord for `ฮดp`, `ฮดz`, where: + + `ฮดp`: Total air mass (including vapor and condensates) + Equal to hydrostatic pressure depth of layer + `ฮดz`: Geometric layer depth (nonhydrostatic) + + Value explainers: + 5: Unlimited โ€œfifth-orderโ€ scheme with weak 2โˆ†x filter; fastest + and least diffusive (โ€œinviscidโ€) + 6: Intermediate-strength 2โˆ†x filter. Gives best ACC and storm + structure but weaker TCs (โ€œminimally-diffusiveโ€) + 8: Lin 2004 monotone PPM constraint (โ€œmonotonicโ€) + 9: Hunyh constraint: more expensive but less diffusive than #8 + -5: #5 with a positive-definite constraint + + Undocumented values implemented in Fortran: 7, 10, 11, 12, 13. 
+ + The code below is capable of: + - Cube-sphere grid (no doubly periodic) + - `jord` == 8 for monotonic behaviors OR + - `jord` 5, 6 + - `jord` must be positive """ def __init__( @@ -307,25 +339,29 @@ def __init__( origin: Index3D, domain: Index3D, ): + # Dev note: this could be rewrote to split monotonic and not, or per-type of + # scheme as described above with compiler-time `jord` conditional to + # direct the code + orchestrate(obj=self, config=stencil_factory.config.dace_config) # Arguments come from: # namelist.grid_type # grid.dya - if grid_type == 3 or grid_type > 4: - raise NotImplementedError( - "Y Piecewise Parabolic (xppm): " - f" grid type {grid_type} not implemented. <3 or 4 available." - ) - if abs(jord) >= 8 and jord != 8: + available_grid_options = [0, 4] + if grid_type not in available_grid_options: raise NotImplementedError( - "Y Piecewise Parabolic (yppm): " - f"jord {jord} != 8 not implemented when >= 8." + "Y Piecewise Parabolic (yppm) configuration: " + f"grid type {grid_type} not implemented. " + f"Options are {available_grid_options}." ) - if jord < 0: + available_jords = [-6, 5, 6, 8] + if jord not in available_jords: raise NotImplementedError( - f"Y Piecewise Parabolic (yppm): jord {jord} < 0 not implemented." + "Y Piecewise Parabolic (yppm) configuration: " + f"jord {jord} not implemented. " + f"Options are {available_jords}." ) self._dya = dya @@ -373,7 +409,10 @@ def __call__( # were called "get_flux", while the routine which got the flux was called # fx1_fn. The final value was called yflux instead of q_out. self._compute_flux_stencil( - q_in, c, self._dya, q_mean_advected_through_y_interface + q_in, + c, + self._dya, + q_mean_advected_through_y_interface, ) # bl and br are "edge perturbation values" as in equation 4.1 # of the FV3 documentation diff --git a/pyFV3/stencils/ytp_v.py b/pyFV3/stencils/ytp_v.py index 4c1e4af9..a2ebc6ee 100644 --- a/pyFV3/stencils/ytp_v.py +++ b/pyFV3/stencils/ytp_v.py @@ -84,7 +84,7 @@ def advect_v_along_y( bl, br = get_bl_br(v, dy, dya) b0 = bl + br - cfl = vb_contra * dt * rdy[0, -1] if vb_contra > 0 else vb_contra * dt * rdy + cfl = vb_contra * rdy[0, -1] if vb_contra > 0 else vb_contra * rdy fx0 = yppm.fx1_fn(cfl, br, b0, bl) if __INLINED(jord < 8): diff --git a/pyFV3/testing/translate_dyncore.py b/pyFV3/testing/translate_dyncore.py index 8524d090..1ae8d2cd 100644 --- a/pyFV3/testing/translate_dyncore.py +++ b/pyFV3/testing/translate_dyncore.py @@ -1,7 +1,9 @@ import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, Quantity, StencilFactory +from ndsl import Quantity, StencilFactory +from f90nml.namelist import Namelist from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM from ndsl.stencils.testing import ParallelTranslate2PyState +from numpy import dtype from pyFV3._config import DynamicalCoreConfig from pyFV3.dycore_state import DycoreState from pyFV3.stencils import dyn_core @@ -101,6 +103,7 @@ def __init__( "ak": {}, "bk": {}, "diss_estd": {}, + "dpx": grid.compute_dict(), } self._base.in_vars["data_vars"]["wsd"]["kstart"] = grid.npz self._base.in_vars["data_vars"]["wsd"]["kend"] = None @@ -140,8 +143,17 @@ def compute_parallel(self, inputs, communicator): grid_data.bk = inputs["bk"] grid_data.ptop = inputs["ptop"] self._base.make_storage_data_input_vars(inputs) - state = DycoreState.init_zeros(quantity_factory=self.grid.quantity_factory) - wsd: Quantity = self.grid.quantity_factory.zeros( + inputs_dtypes = {} + for k, v in inputs.items(): + if hasattr(v, "dtype"): + inputs_dtypes[k] 
= v.dtype + state = DycoreState.init_zeros( + quantity_factory=self.grid.quantity_factory, + dtype_dict=inputs_dtypes, + tracer_count=1, # No tracers used in acoustics + allow_mismatch_float_precision=True, + ) + wsd = self.grid.quantity_factory.zeros( dims=[X_DIM, Y_DIM], units="unknown", ) @@ -153,11 +165,18 @@ def compute_parallel(self, inputs, communicator): state[name].data[selection] = value else: setattr(state, name, value) - phis: Quantity = self.grid.quantity_factory.zeros( + phis = self.grid.quantity_factory.zeros( dims=[X_DIM, Y_DIM], units="m", ) phis.data[:] = phis.np.asarray(inputs["phis"]) + dpx = self.grid.quantity_factory.zeros( + dims=[X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=inputs_dtypes["dpx"], + allow_mismatch_float_precision=True, + ) + dpx.data[:] = dpx.np.asarray(inputs["dpx"]) acoustic_dynamics = dyn_core.AcousticDynamics( comm=communicator, stencil_factory=self.stencil_factory, @@ -174,7 +193,16 @@ def compute_parallel(self, inputs, communicator): ) acoustic_dynamics.cappa.data[:] = inputs["cappa"][:] - acoustic_dynamics(state, timestep=inputs["mdt"], n_map=state.n_map) + acoustic_dynamics( + state, + mfxd=state.mfxd, + mfyd=state.mfyd, + cxd=state.cxd, + cyd=state.cyd, + dpx=dpx, + timestep=inputs["mdt"], + n_map=state.n_map, + ) # the "inputs" dict is not used to return, we construct a new dict based # on variables attached to `state` storages_only = {} @@ -185,4 +213,5 @@ def compute_parallel(self, inputs, communicator): storages_only[name] = value storages_only["wsd"] = wsd.data storages_only["cappa"] = acoustic_dynamics.cappa.data + storages_only["dpx"] = dpx.data return self._base.slice_output(storages_only) diff --git a/pyFV3/testing/translate_fvdynamics.py b/pyFV3/testing/translate_fvdynamics.py index 5daea9d0..ed78141d 100644 --- a/pyFV3/testing/translate_fvdynamics.py +++ b/pyFV3/testing/translate_fvdynamics.py @@ -5,7 +5,8 @@ import pytest import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, Quantity, StencilFactory +from ndsl import Quantity, QuantityFactory, StencilFactory, FieldBundle +from f90nml import Namelist from ndsl.constants import ( X_DIM, X_INTERFACE_DIM, @@ -90,25 +91,25 @@ class TranslateFVDynamics(ParallelTranslateBaseSlicing): "dims": [X_DIM, Z_INTERFACE_DIM, Y_DIM], "n_halo": 0, }, - "mfxd": { + "mfxd_FV": { "name": "accumulated_x_mass_flux", "dims": [X_INTERFACE_DIM, Y_DIM, Z_DIM], "units": "unknown", "n_halo": 0, }, - "mfyd": { + "mfyd_FV": { "name": "accumulated_y_mass_flux", "dims": [X_DIM, Y_INTERFACE_DIM, Z_DIM], "units": "unknown", "n_halo": 0, }, - "cxd": { + "cxd_FV": { "name": "accumulated_x_courant_number", "dims": [X_INTERFACE_DIM, Y_DIM, Z_DIM], "units": "", "n_halo": (0, 3), }, - "cyd": { + "cyd_FV": { "name": "accumulated_y_courant_number", "dims": [X_DIM, Y_INTERFACE_DIM, Z_DIM], "units": "", @@ -164,51 +165,6 @@ class TranslateFVDynamics(ParallelTranslateBaseSlicing): "units": "m^2 s^-2", "dims": [X_DIM, Y_DIM], }, - "qvapor": { - "name": "specific_humidity", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qliquid": { - "name": "cloud_water_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qice": { - "name": "cloud_ice_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qrain": { - "name": "rain_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qsnow": { - "name": "snow_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qgraupel": { - "name": "graupel_mixing_ratio", - "dims": [X_DIM, 
Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qo3mr": { - "name": "ozone_mixing_ratio", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "kg/kg", - }, - "qsgs_tke": { - "name": "turbulent_kinetic_energy", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "m**2/s**2", - }, - "qcld": { - "name": "cloud_fraction", - "dims": [X_DIM, Y_DIM, Z_DIM], - "units": "", - }, "omga": { "name": "vertical_pressure_velocity", "dims": [X_DIM, Y_DIM, Z_DIM], @@ -219,6 +175,7 @@ class TranslateFVDynamics(ParallelTranslateBaseSlicing): } outputs = inputs.copy() + outputs["tracers"] = {} for name in ("bdt", "ak", "bk", "ptop", "ua"): outputs.pop(name) @@ -237,15 +194,6 @@ def __init__( "v": grid.x3d_domain_dict(), "w": {}, "delz": {}, - "qvapor": grid.compute_dict(), - "qliquid": grid.compute_dict(), - "qice": grid.compute_dict(), - "qrain": grid.compute_dict(), - "qsnow": grid.compute_dict(), - "qgraupel": grid.compute_dict(), - "qo3mr": grid.compute_dict(), - "qsgs_tke": grid.compute_dict(), - "qcld": {}, "ps": {}, "pe": { "istart": grid.is_ - 1, @@ -274,10 +222,10 @@ def __init__( "va": {}, "uc": grid.x3d_domain_dict(), "vc": grid.y3d_domain_dict(), - "mfxd": grid.x3d_compute_dict(), - "mfyd": grid.y3d_compute_dict(), - "cxd": grid.x3d_compute_domain_y_dict(), - "cyd": grid.y3d_compute_domain_x_dict(), + "mfxd_FV": grid.x3d_compute_dict(), + "mfyd_FV": grid.y3d_compute_dict(), + "cxd_FV": grid.x3d_compute_domain_y_dict(), + "cyd_FV": grid.y3d_compute_domain_x_dict(), "diss_estd": {}, } self._base.in_vars["data_vars"].update(fv_dynamics_vars) @@ -285,20 +233,37 @@ def __init__( self._base.out_vars.update(fv_dynamics_vars) self._base.out_vars["ps"] = {"kstart": grid.npz - 1, "kend": grid.npz - 1} self._base.out_vars["phis"] = {"kstart": grid.npz - 1, "kend": grid.npz - 1} + self._base.out_vars["tracers"] = {} self._base.out_vars.pop("ua") self.max_error = 1e-5 self.ignore_near_zero_errors = {} - for qvar in utils.tracer_variables: - self.ignore_near_zero_errors[qvar] = True - self.ignore_near_zero_errors["q_con"] = True self.dycore: Optional[fv_dynamics.DynamicalCore] = None self.stencil_factory = stencil_factory + self._quantity_factory = QuantityFactory.from_backend( + sizer=stencil_factory.grid_indexing._sizer, + backend=stencil_factory.backend, + ) self.namelist: DynamicalCoreConfig = DynamicalCoreConfig.from_namelist(namelist) def state_from_inputs(self, inputs): + tracers = self._quantity_factory._numpy.empty( + ( + inputs["tracers"].shape[0] + 1, + inputs["tracers"].shape[1] + 1, + inputs["tracers"].shape[2] + 1, + inputs["tracers"].shape[3], + ) + ) + tracers[:-1, :-1, :-1, :] = inputs.pop("tracers") input_storages = super().state_from_inputs(inputs) + input_storages["tracers"] = tracers + # Move fluxes and courant numbers + input_storages["mfxd"] = input_storages.pop("mfxd_FV") + input_storages["mfyd"] = input_storages.pop("mfyd_FV") + input_storages["cxd"] = input_storages.pop("cxd_FV") + input_storages["cyd"] = input_storages.pop("cyd_FV") # making sure we init DycoreState with the exact set of variables accepted_keys = [_field.name for _field in fields(DycoreState)] todelete = [] @@ -307,8 +272,10 @@ def state_from_inputs(self, inputs): todelete.append(name) for name in todelete: del input_storages[name] - - state = DycoreState.init_from_storages(input_storages, sizer=self.grid.sizer) + state = DycoreState.init_from_storages( + storages=input_storages, + quantity_factory=self._quantity_factory, + ) return state def prepare_data(self, inputs) -> Tuple[DycoreState, GridData]: @@ -340,19 +307,24 @@ def 
compute_parallel(self, inputs, communicator): config=DynamicalCoreConfig.from_namelist(self.namelist), phis=state.phis, state=state, - timestep=timedelta(seconds=inputs["bdt"]), + exclude_tracers=["cloud"], + timestep=timedelta(seconds=float(inputs["bdt"])), ) self.dycore.step_dynamics(state, NullTimer()) outputs = self.outputs_from_state(state) return outputs - def outputs_from_state(self, state: dict): + def outputs_from_state(self, state: DycoreState): if len(self.outputs) == 0: return {} outputs = {} storages = {} - for name, properties in self.outputs.items(): - if isinstance(state[name], Quantity): + for name, _properties in self.outputs.items(): + if name in ["mfxd_FV", "mfyd_FV", "cxd_FV", "cyd_FV"]: + storages[name] = state[name[:-3]].data + elif isinstance(state[name], FieldBundle): + storages[name] = state[name].quantity.data + elif isinstance(state[name], Quantity): storages[name] = state[name].data elif len(self.outputs[name]["dims"]) > 0: storages[name] = state[name] # assume it's a storage diff --git a/pyFV3/tracers.py b/pyFV3/tracers.py new file mode 100644 index 00000000..0407c5e6 --- /dev/null +++ b/pyFV3/tracers.py @@ -0,0 +1,61 @@ +from typing import TypeAlias +from ndsl import QuantityFactory +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.quantity.field_bundle import FieldBundle, FieldBundleType +from pyFV3.version import IS_GEOS + +# Defauult maopping for common models +_default_mapping_GEOS = { + "vapor": 0, + "liquid": 1, + "ice": 2, + "rain": 3, + "snow": 4, + "graupel": 5, + "cloud": 6, +} +_default_mapping_PACE = { + "vapor": 0, + "liquid": 1, + "rain": 2, + "ice": 3, + "snow": 4, + "graupel": 5, + "om3r": 6, + "cloud": 7, +} + + +TracersType: TypeAlias = FieldBundleType.T("Tracers") # type: ignore + + +def setup_tracers( + number_of_tracers: int, + quantity_factory: QuantityFactory, + mappings: dict[str, int] | None = None, +) -> FieldBundle: + """Setup a FieldBundle for tracers. Should be called only once.""" + + FieldBundleType.register("Tracers", (number_of_tracers,)) + + _unit = "g/kg" + _dims = [X_DIM, Y_DIM, Z_DIM, "tracers"] + + tracers_qty_factory = FieldBundle.extend_3D_quantity_factory( + quantity_factory, {"tracers": number_of_tracers} + ) + data = tracers_qty_factory.zeros(_dims, units=_unit) + + # Some default mappings for ease of use with commonly + # run models + if mappings is None: + if IS_GEOS: + mappings = _default_mapping_GEOS + else: + mappings = _default_mapping_PACE + + return FieldBundle( + "Tracers", + quantity=data, + mapping=mappings, + ) diff --git a/pyFV3/wrappers/geos_wrapper.py b/pyFV3/wrappers/geos_wrapper.py index 98feb79c..53ad91c5 100644 --- a/pyFV3/wrappers/geos_wrapper.py +++ b/pyFV3/wrappers/geos_wrapper.py @@ -10,6 +10,7 @@ from mpi4py import MPI import pyFV3 +import pyFV3.tracers from ndsl import ( CompilationConfig, CubedSphereCommunicator, @@ -36,6 +37,17 @@ from ndsl.utils import safe_assign_array +GEOS_TRACER_MAPPING = [ + "vapor", + "liquid", + "ice", + "rain", + "snow", + "graupel", + "cloud", +] + + class StencilBackendCompilerOverride: """Override the Pace global stencil JIT to allow for 9-rank build on any setup. @@ -104,8 +116,23 @@ def __init__( bdt: int, comm: Comm, backend: str, + water_tracers_count: int, + all_tracers_count: int, fortran_mem_space: MemorySpace = MemorySpace.HOST, ): + # Check for water species configuration not handled by the interface + if water_tracers_count != 6: + raise NotImplementedError( + "[pyFV3 Bridge] Bridge expect 6 water species," + f" got {water_tracers_count}." 
+ ) + + # Build the full tracer mapping by appending None to the expected tracer list + # based on parameter + self._tracers_mapping = GEOS_TRACER_MAPPING + for i in range(all_tracers_count, len(GEOS_TRACER_MAPPING)): + self._tracers_mapping.append(f"tracer_#{i}") + # Look for an override to run on a single node gtfv3_single_rank_override = int(os.getenv("GTFV3_SINGLE_RANK_OVERRIDE", -1)) if gtfv3_single_rank_override >= 0: @@ -137,7 +164,7 @@ def __init__( metric_terms = MetricTerms( quantity_factory=quantity_factory, communicator=self.communicator, - eta_file=namelist["grid_config"]["config"]["eta_file"], + eta_file=namelist["grid_config"]["config"]["eta_file"], # type: ignore ) grid_data = GridData.new_from_metric_terms(metric_terms) @@ -173,7 +200,8 @@ def __init__( ) self.dycore_state = pyFV3.DycoreState.init_zeros( - quantity_factory=quantity_factory + quantity_factory=quantity_factory, + tracer_list=self._tracers_mapping, ) self.dycore_state.bdt = self.dycore_config.dt_atmos @@ -190,6 +218,7 @@ def __init__( timestep=timedelta(seconds=self.dycore_state.bdt), phis=self.dycore_state.phis, state=self.dycore_state, + exclude_tracers=[], ) self._fortran_mem_space = fortran_mem_space @@ -198,7 +227,6 @@ def __init__( ) self.output_dict: Dict[str, np.ndarray] = {} - self._allocate_output_dir() # Feedback information device_ordinal_info = ( @@ -368,15 +396,11 @@ def _put_fortran_data_in_dycore( safe_assign_array(state.omga.view[:], omga[isc:iec, jsc:jec, :]) safe_assign_array(state.diss_estd.view[:], diss_estd[isc:iec, jsc:jec, :]) - # tracer quantities should be a 4d array in order: - # vapor, liquid, ice, rain, snow, graupel, cloud - safe_assign_array(state.qvapor.view[:], q[isc:iec, jsc:jec, :, 0]) - safe_assign_array(state.qliquid.view[:], q[isc:iec, jsc:jec, :, 1]) - safe_assign_array(state.qice.view[:], q[isc:iec, jsc:jec, :, 2]) - safe_assign_array(state.qrain.view[:], q[isc:iec, jsc:jec, :, 3]) - safe_assign_array(state.qsnow.view[:], q[isc:iec, jsc:jec, :, 4]) - safe_assign_array(state.qgraupel.view[:], q[isc:iec, jsc:jec, :, 5]) - safe_assign_array(state.qcld.view[:], q[isc:iec, jsc:jec, :, 6]) + # Copy tracer data + for index, name in enumerate(self._tracers_mapping): + safe_assign_array( + state.tracers[name].view[:], q[isc:iec, jsc:jec, :, index] + ) return state @@ -388,6 +412,7 @@ def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: jec = self._grid_indexing.jec + 1 if self._fortran_mem_space != self._pace_mem_space: + self._allocate_output_dir() safe_assign_array(output_dict["u"], self.dycore_state.u.data[:-1, :, :-1]) safe_assign_array(output_dict["v"], self.dycore_state.v.data[:, :-1, :-1]) safe_assign_array(output_dict["w"], self.dycore_state.w.data[:-1, :-1, :-1]) @@ -453,27 +478,8 @@ def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: self.dycore_state.diss_estd.data[:-1, :-1, :-1], ) - safe_assign_array( - output_dict["qvapor"], self.dycore_state.qvapor.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qliquid"], self.dycore_state.qliquid.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qice"], self.dycore_state.qice.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qrain"], self.dycore_state.qrain.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qsnow"], self.dycore_state.qsnow.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qgraupel"], self.dycore_state.qgraupel.data[:-1, :-1, :-1] - ) - safe_assign_array( - output_dict["qcld"], self.dycore_state.qcld.data[:-1, :-1, :-1] - ) + # Copy tracer data + 
safe_assign_array(output_dict["q"], self.dycore_state.tracers.as_4D_array()) else: output_dict["u"] = self.dycore_state.u.data[:-1, :, :-1] output_dict["v"] = self.dycore_state.v.data[:, :-1, :-1] @@ -504,23 +510,18 @@ def _prep_outputs_for_geos(self) -> Dict[str, np.ndarray]: output_dict["q_con"] = self.dycore_state.q_con.data[:-1, :-1, :-1] output_dict["omga"] = self.dycore_state.omga.data[:-1, :-1, :-1] output_dict["diss_estd"] = self.dycore_state.diss_estd.data[:-1, :-1, :-1] - output_dict["qvapor"] = self.dycore_state.qvapor.data[:-1, :-1, :-1] - output_dict["qliquid"] = self.dycore_state.qliquid.data[:-1, :-1, :-1] - output_dict["qice"] = self.dycore_state.qice.data[:-1, :-1, :-1] - output_dict["qrain"] = self.dycore_state.qrain.data[:-1, :-1, :-1] - output_dict["qsnow"] = self.dycore_state.qsnow.data[:-1, :-1, :-1] - output_dict["qgraupel"] = self.dycore_state.qgraupel.data[:-1, :-1, :-1] - output_dict["qcld"] = self.dycore_state.qcld.data[:-1, :-1, :-1] + output_dict["q"] = self.dycore_state.tracers.as_4D_array() return output_dict def _allocate_output_dir(self): + if len(self.output_dict) != 0: + return if self._fortran_mem_space != self._pace_mem_space: nhalo = self._grid_indexing.n_halo shape_centered = self._grid_indexing.domain_full(add=(0, 0, 0)) shape_x_interface = self._grid_indexing.domain_full(add=(1, 0, 0)) shape_y_interface = self._grid_indexing.domain_full(add=(0, 1, 0)) - shape_z_interface = self._grid_indexing.domain_full(add=(0, 0, 1)) shape_2d = shape_centered[:-1] self.output_dict["u"] = np.empty((shape_y_interface)) @@ -573,34 +574,3 @@ def _allocate_output_dir(self): self.output_dict["qsnow"] = np.empty((shape_centered)) self.output_dict["qgraupel"] = np.empty((shape_centered)) self.output_dict["qcld"] = np.empty((shape_centered)) - else: - self.output_dict["u"] = None - self.output_dict["v"] = None - self.output_dict["w"] = None - self.output_dict["ua"] = None - self.output_dict["va"] = None - self.output_dict["uc"] = None - self.output_dict["vc"] = None - self.output_dict["delz"] = None - self.output_dict["pt"] = None - self.output_dict["delp"] = None - self.output_dict["mfxd"] = None - self.output_dict["mfyd"] = None - self.output_dict["cxd"] = None - self.output_dict["cyd"] = None - self.output_dict["ps"] = None - self.output_dict["pe"] = None - self.output_dict["pk"] = None - self.output_dict["peln"] = None - self.output_dict["pkz"] = None - self.output_dict["phis"] = None - self.output_dict["q_con"] = None - self.output_dict["omga"] = None - self.output_dict["diss_estd"] = None - self.output_dict["qvapor"] = None - self.output_dict["qliquid"] = None - self.output_dict["qice"] = None - self.output_dict["qrain"] = None - self.output_dict["qsnow"] = None - self.output_dict["qgraupel"] = None - self.output_dict["qcld"] = None diff --git a/tests/mpi/test_doubly_periodic.py b/tests/mpi/test_doubly_periodic.py index a264ea40..5b357bbf 100644 --- a/tests/mpi/test_doubly_periodic.py +++ b/tests/mpi/test_doubly_periodic.py @@ -128,6 +128,7 @@ def setup_dycore() -> Tuple[DynamicalCore, List[Any]]: config=config, phis=state.phis, state=state, + exclude_tracers=[], timestep=timedelta(seconds=255), ) # TODO compute from namelist diff --git a/tests/savepoint/translate/__init__.py b/tests/savepoint/translate/__init__.py index 651b2551..256a3e3e 100644 --- a/tests/savepoint/translate/__init__.py +++ b/tests/savepoint/translate/__init__.py @@ -8,12 +8,14 @@ TranslateDivergenceCorner, TranslateVorticityTransport_Cgrid, ) +from .translate_cond_output import 
TranslateCond_output from .translate_corners import ( TranslateCopyCorners, TranslateFill4Corners, TranslateFillCorners, TranslateFillCornersVector, ) +from .translate_cs_profile import TranslateCS_Profile from .translate_cubedtolatlon import TranslateCubedToLatLon from .translate_d2a2c_vect import TranslateD2A2C_Vect from .translate_d_sw import ( @@ -32,6 +34,7 @@ from .translate_fvsubgridz import TranslateFVSubgridZ from .translate_fvtp2d import TranslateFvTp2d, TranslateFvTp2d_2 from .translate_fxadv import TranslateFxAdv +from .translate_getMPIprop import TranslateGetMPIProp from .translate_grid import ( TranslateAGrid, TranslateDerivedTrig, @@ -62,25 +65,43 @@ TranslateJablonowskiBaroclinic, TranslatePVarAuxiliaryPressureVars, ) +from .translate_lagrangian_contribution_interp import ( + TranslateLagrangian_Contribution_Interp, +) from .translate_last_step import TranslateLastStep +from .translate_map1_ppm_delz import TranslateMap1_PPM_delz +from .translate_map1_ppm_W import TranslateMap1_PPM_W +from .translate_map_scalar import TranslateMap_Scalar +from .translate_MapN_Tracer_2d import TranslateMapN_Tracer_2d from .translate_moistcvpluspkz_2d import TranslateMoistCVPlusPkz_2d from .translate_moistcvpluspt_2d import TranslateMoistCVPlusPt_2d +from .translate_moistcvpluspt_2d_last_step import TranslateMoistCVPlusPt_2d_last_step +from .translate_moistcvpluste_2d import TranslateMoistCVPlusTe_2d +from .translate_mpp_global_sum import TranslateMpp_global_sum from .translate_neg_adj3 import TranslateNeg_Adj3 from .translate_nh_p_grad import TranslateNH_P_Grad from .translate_pe_halo import TranslatePE_Halo +from .translate_pe_pk_delp_peln import TranslatePE_pk_delp_peln from .translate_pk3_halo import TranslatePK3_Halo from .translate_pressureadjustedtemperature_nonhydrostatic import ( TranslatePressureAdjustedTemperature_NonHydrostatic, ) +from .translate_Pressures_mapU import TranslatePressures_mapU +from .translate_Pressures_mapV import TranslatePressures_mapV from .translate_qsinit import TranslateQSInit from .translate_ray_fast import TranslateRay_Fast from .translate_remapping import TranslateRemapping +from .translate_remapping_GEOS import TranslateRemapping_GEOS from .translate_riem_solver3 import TranslateRiem_Solver3 from .translate_riem_solver_c import TranslateRiem_Solver_C from .translate_satadjust3d import TranslateSatAdjust3d +from .translate_scalar_profile import TranslateScalar_Profile +from .translate_te_zsum import TranslateTe_Zsum from .translate_tracer2d1l import TranslateTracer2D1L +from .translate_tracer2d1l_cmax import TranslateTracerCMax from .translate_updatedzc import TranslateUpdateDzC from .translate_updatedzd import TranslateUpdateDzD +from .translate_w_fix_consrv_moment import TranslateW_fix_consrv_moment from .translate_xppm import TranslateXPPM, TranslateXPPM_2 from .translate_xtp_u import TranslateXTP_U from .translate_yppm import TranslateYPPM, TranslateYPPM_2 diff --git a/tests/savepoint/translate/overrides/standard.yaml b/tests/savepoint/translate/overrides/standard.yaml index 4b9994e3..5f126a59 100644 --- a/tests/savepoint/translate/overrides/standard.yaml +++ b/tests/savepoint/translate/overrides/standard.yaml @@ -191,7 +191,9 @@ UtilVectors: - max_error: 2e-10 # 48_6ranks FVDynamics: - - max_error: 5e-5 # 48_6ranks using metric terms + - backend: numpy + multimodal: + ulp_threshold: 100 DivergenceDamping: - backend: dace:cpu diff --git a/tests/savepoint/translate/translate_MapN_Tracer_2d.py b/tests/savepoint/translate/translate_MapN_Tracer_2d.py new 
file mode 100644 index 00000000..a13c5d8e --- /dev/null +++ b/tests/savepoint/translate/translate_MapN_Tracer_2d.py @@ -0,0 +1,85 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.mapn_tracer import MapNTracer +from pyFV3.tracers import setup_tracers + + +class TranslateMapN_Tracer_2d(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "qtracers": {}, + "pe1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "pe2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "dp2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + + self.out_vars = { + "qtracers": {}, + } + + # Value from GEOS + self.kord = 9 + + # mode / iv set to 1 from GEOS + self.mode = 1 + + self.nq = 9 + + self.fill = True + + self._are_tracers_setup = False + + self._tracers = None + + def compute_from_storage(self, inputs): + if not self._are_tracers_setup: + self._are_tracers_setup = True + self._tracers = setup_tracers( + number_of_tracers=inputs["qtracers"].shape[3], + quantity_factory=self.quantity_factory, + # mappings={"cloud": 6}, + ) + # tracers.quantity.data[:-1, :-1, :-1, :] = inputs["qtracers"] + self._tracers.quantity.data = inputs["qtracers"] + + self._compute_func = MapNTracer( + self.stencil_factory, + self.quantity_factory, + abs(self.kord), + fill=self.fill, + tracers=self._tracers, + ) + + self._compute_func( + inputs["pe1"], + inputs["pe2"], + inputs["dp2"], + self._tracers, + ) + + return inputs diff --git a/tests/savepoint/translate/translate_Pressures_mapU.py b/tests/savepoint/translate/translate_Pressures_mapU.py new file mode 100644 index 00000000..e4f6c85b --- /dev/null +++ b/tests/savepoint/translate/translate_Pressures_mapU.py @@ -0,0 +1,161 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import MapSingle +from pyFV3.stencils.remapping import pe0_ptop_xmax, pressures_mapu + + +class TranslatePressures_mapU(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "pe_": { + "istart": grid.is_ - 1, + "iend": grid.ie + 1, + "jstart": grid.js - 1, + "jend": grid.je + 1, + "kend": grid.npz, + }, + "ak": {}, + "bk": {}, + "pe0_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz, + }, + "pe3_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz, + }, + "u_": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed + 1, + "kend": grid.npz - 1, + }, + "mfy_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je + 1, + 
"kend": grid.npz - 1, + }, + "cy_": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz - 1, + }, + } + self.in_vars["parameters"] = [ + "ptop", + "kord_mt", + ] + + self.out_vars = { + "pe0_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz, + }, + "pe3_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz, + }, + "u_": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed + 1, + "kend": grid.npz - 1, + }, + "mfy_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz - 1, + }, + "cy_": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz - 1, + }, + } + + grid_indexing = stencil_factory.grid_indexing + + self.dims = [X_DIM, Y_DIM, Z_DIM] + + self._pressures_mapu = stencil_factory.from_origin_domain( + pressures_mapu, + origin=grid_indexing.origin_compute(), + domain=(grid_indexing.domain[0], 1, grid_indexing.domain[2] + 1), + ) + + self._pe0_ptop_xmax = stencil_factory.from_origin_domain( + pe0_ptop_xmax, + origin=(grid_indexing.domain[0] + 3, 3, 0), + domain=(1, 1, grid_indexing.domain[2] + 1), + ) + + def compute_from_storage(self, inputs): + self._map1_ppm_u = MapSingle( + self.stencil_factory, + self.quantity_factory, + inputs["kord_mt"], + -1, + dims=[X_DIM, Y_INTERFACE_DIM, Z_DIM], + ) + + self._pressures_mapu( + inputs["pe_"], + inputs["ak"], + inputs["bk"], + inputs["pe0_"], + inputs["pe3_"], + inputs["ptop"], + ) + + self._pe0_ptop_xmax( + inputs["pe0_"], + inputs["ptop"], + ) + + self._map1_ppm_u( + inputs["u_"], + inputs["pe0_"], + inputs["pe3_"], + ) + self._map1_ppm_u( + inputs["mfy_"], + inputs["pe0_"], + inputs["pe3_"], + ) + + self._map1_ppm_u( + inputs["cy_"], + inputs["pe0_"], + inputs["pe3_"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_Pressures_mapV.py b/tests/savepoint/translate/translate_Pressures_mapV.py new file mode 100644 index 00000000..81620694 --- /dev/null +++ b/tests/savepoint/translate/translate_Pressures_mapV.py @@ -0,0 +1,153 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import MapSingle +from pyFV3.stencils.remapping import pressures_mapv + + +class TranslatePressures_mapV(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "pe_": { + "istart": grid.is_ - 1, + "iend": grid.ie + 1, + "jstart": grid.js - 1, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "pe0_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "pe3_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "ak": {}, + "bk": {}, + "v_": { + "istart": grid.isd, + "iend": grid.ied + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz - 1, + }, + "mfx_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je, + 
"kend": grid.npz - 1, + }, + "cx_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz - 1, + }, + } + self.in_vars["parameters"] = [ + "kord_mt", + ] + + self.out_vars = { + "pe0_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "pe3_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "v_": { + "istart": grid.isd, + "iend": grid.ied + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz - 1, + }, + "mfx_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "cx_": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz - 1, + }, + } + + grid_indexing = stencil_factory.grid_indexing + + self.dims = [X_DIM, Y_DIM, Z_DIM] + + self._pressures_mapv = stencil_factory.from_origin_domain( + pressures_mapv, + origin=grid_indexing.origin_compute(), + domain=( + grid_indexing.domain[0] + 1, + 1, + grid_indexing.domain[2] + 1, + ), + ) + + def compute_from_storage(self, inputs): + self._map1_ppm_v = MapSingle( + self.stencil_factory, + self.quantity_factory, + inputs["kord_mt"], + -1, + dims=[X_INTERFACE_DIM, Y_DIM, Z_DIM], + ) + + self._pressures_mapv( + inputs["pe_"], + inputs["ak"], + inputs["bk"], + inputs["pe0_"], + inputs["pe3_"], + ) + + self._map1_ppm_v( + inputs["v_"], + inputs["pe0_"], + inputs["pe3_"], + ) + + self._map1_ppm_v( + inputs["mfx_"], + inputs["pe0_"], + inputs["pe3_"], + ) + + self._map1_ppm_v( + inputs["cx_"], + inputs["pe0_"], + inputs["pe3_"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_a2b_ord4.py b/tests/savepoint/translate/translate_a2b_ord4.py index acc4808f..92ec2e0e 100644 --- a/tests/savepoint/translate/translate_a2b_ord4.py +++ b/tests/savepoint/translate/translate_a2b_ord4.py @@ -1,9 +1,12 @@ +import numpy as np from typing import Any, Dict -from ndsl import Namelist, StencilFactory, orchestrate -from ndsl.constants import Z_DIM +from ndsl import StencilFactory, orchestrate +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM from pyFV3.stencils import DivergenceDamping from pyFV3.testing import TranslateDycoreFortranData2Py +from pyFV3.utils.functional_validation import get_subset_func class A2B_Ord4Compute: @@ -58,6 +61,11 @@ def __init__( self.namelist = namelist # type: ignore self.stencil_factory = stencil_factory self.compute_obj = A2B_Ord4Compute(stencil_factory) + self._subset = get_subset_func( + self.grid.grid_indexing, + dims=[X_DIM, Y_DIM, Z_DIM], + n_halo=((3, 3), (3, 3)), + ) def compute_from_storage(self, inputs): nord_col = self.grid.quantity_factory.zeros(dims=[Z_DIM], units="unknown") @@ -80,3 +88,13 @@ def compute_from_storage(self, inputs): inputs["grid_type"] = 0 self.compute_obj(divdamp, **inputs) return inputs + + def subset_output(self, varname: str, output: np.ndarray) -> np.ndarray: + """ + Given an output array, return the slice of the array which we'd + like to validate against reference data + """ + if varname in ["wk"]: + return self._subset(output) + else: + return output diff --git a/tests/savepoint/translate/translate_c_sw.py b/tests/savepoint/translate/translate_c_sw.py index 13abd93c..b7255525 100644 --- a/tests/savepoint/translate/translate_c_sw.py +++ b/tests/savepoint/translate/translate_c_sw.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, 
QuantityFactory, StencilFactory +from ndsl import QuantityFactory, StencilFactory +from f90nml.namelist import Namelist from pyFV3.stencils import CGridShallowWaterDynamics from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_cond_output.py b/tests/savepoint/translate/translate_cond_output.py new file mode 100644 index 00000000..6bd94b2e --- /dev/null +++ b/tests/savepoint/translate/translate_cond_output.py @@ -0,0 +1,52 @@ +from ndsl.stencils.testing import TranslateFortranData2Py +from pyFV3.stencils import moist_cv + + +class TranslateCond_output(TranslateFortranData2Py): + def __init__(self, grid, namelist, stencil_factory): + super().__init__(grid, namelist, stencil_factory) + self.stencil_factory = stencil_factory + self.in_vars["data_vars"] = { + "qliquid": { + "kend": grid.npz - 1, + }, + "qice": { + "kend": grid.npz - 1, + }, + "qrain": { + "kend": grid.npz - 1, + }, + "qsnow": { + "kend": grid.npz - 1, + }, + "qgraupel": { + "kend": grid.npz - 1, + }, + "q_con": { + "kend": grid.npz - 1, + }, + } + + self.out_vars = { + "q_con": { + "kend": grid.npz - 1, + } + } + + self.compute_func = stencil_factory.from_origin_domain( + moist_cv.cond_output, + origin=grid.compute_origin(), + domain=(grid.nic, grid.njc, grid.npz), + ) + + def compute_from_storage(self, inputs): + + self.compute_func( + inputs["q_con"], + inputs["qliquid"], + inputs["qrain"], + inputs["qsnow"], + inputs["qice"], + inputs["qgraupel"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_corners.py b/tests/savepoint/translate/translate_corners.py index 4aa58497..7bcd8f39 100644 --- a/tests/savepoint/translate/translate_corners.py +++ b/tests/savepoint/translate/translate_corners.py @@ -1,7 +1,8 @@ from typing import Any, Dict import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.stencils import corners from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_cs_profile.py b/tests/savepoint/translate/translate_cs_profile.py new file mode 100644 index 00000000..d256c507 --- /dev/null +++ b/tests/savepoint/translate/translate_cs_profile.py @@ -0,0 +1,113 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.remap_profile import RemapProfile + + +class TranslateCS_Profile(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "qs_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_3": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "dp1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": 
grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + self.in_vars["parameters"] = [ + "iv_", + "kord_", + ] + + self.out_vars = { + "q4_1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_3": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + + def compute_from_storage(self, inputs): + self._compute_func = RemapProfile( + self.stencil_factory, + self.quantity_factory, + inputs["kord_"], + inputs["iv_"], + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + self._compute_func( + inputs["qs_"], + inputs["q4_1"], + inputs["q4_2"], + inputs["q4_3"], + inputs["q4_4"], + inputs["dp1_"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_cubedtolatlon.py b/tests/savepoint/translate/translate_cubedtolatlon.py index 6610a46e..32b15779 100644 --- a/tests/savepoint/translate/translate_cubedtolatlon.py +++ b/tests/savepoint/translate/translate_cubedtolatlon.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, Quantity, StencilFactory +from ndsl import Quantity, StencilFactory +from f90nml.namelist import Namelist from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM from ndsl.stencils.c2l_ord import CubedToLatLon from ndsl.stencils.testing import ParallelTranslate2Py diff --git a/tests/savepoint/translate/translate_d2a2c_vect.py b/tests/savepoint/translate/translate_d2a2c_vect.py index d3561e42..c081f110 100644 --- a/tests/savepoint/translate/translate_d2a2c_vect.py +++ b/tests/savepoint/translate/translate_d2a2c_vect.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import DGrid2AGrid2CGridVectors from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_d_sw.py b/tests/savepoint/translate/translate_d_sw.py index 836f3996..d9508cbd 100644 --- a/tests/savepoint/translate/translate_d_sw.py +++ b/tests/savepoint/translate/translate_d_sw.py @@ -2,8 +2,9 @@ import pyFV3 import pyFV3.stencils.d_sw as d_sw -from ndsl import Namelist, StencilFactory -from ndsl.dsl.typing import FloatField, FloatFieldIJ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.dsl.typing import Float, FloatField, FloatFieldIJ from pyFV3.testing import TranslateDycoreFortranData2Py @@ -44,10 +45,11 @@ def __init__( "crx": grid.x3d_compute_domain_y_dict(), "yfx": grid.y3d_compute_domain_x_dict(), "cry": grid.y3d_compute_domain_x_dict(), - "mfx": grid.x3d_compute_dict(), - "mfy": grid.y3d_compute_dict(), - "cx": grid.x3d_compute_domain_y_dict(), - "cy": grid.y3d_compute_domain_x_dict(), + "mfx": grid.x3d_compute_dict() | {"serialname": "mfxd_R8"}, + "mfy": grid.y3d_compute_dict() | {"serialname": "mfyd_R8"}, + "cx": grid.x3d_compute_domain_y_dict() | {"serialname": "cxd_R8"}, + "cy": grid.y3d_compute_domain_x_dict() | {"serialname": "cyd_R8"}, + "dpx": grid.compute_dict(), "heat_source": {}, "diss_est": {}, "q_con": {}, @@ -58,7 +60,8 @@ def __init__( "divgd": grid.default_dict_buffer_2d(), } for name, info in self.in_vars["data_vars"].items(): - info["serialname"] = name + "d" + if name not in ["mfx", "mfy", "cx", "cy", "dpx"]: + info["serialname"] = name + "d" 
self.in_vars["parameters"] = ["dt"] self.out_vars = self.in_vars["data_vars"].copy() del self.out_vars["zh"] @@ -71,13 +74,13 @@ def ubke( rsina: FloatFieldIJ, ut: FloatField, ub: FloatField, - dt4: float, - dt5: float, + dt4: Float, + dt5: Float, ): with computation(PARALLEL), interval(...): - dt = 2.0 * dt5 - ub, _ = d_sw.interpolate_uc_vc_to_cell_corners(uc, vc, cosa, rsina, ut, ut) - ub = ub * dt + ub, _ = d_sw.interpolate_uc_vc_to_cell_corners( + uc, vc, cosa, rsina, ut, ut, dt4, dt5 + ) class TranslateUbKE(TranslateDycoreFortranData2Py): @@ -118,13 +121,13 @@ def vbke( rsina: FloatFieldIJ, vt: FloatField, vb: FloatField, - dt4: float, - dt5: float, + dt4: Float, + dt5: Float, ): with computation(PARALLEL), interval(...): - dt = 2.0 * dt5 - _, vb = d_sw.interpolate_uc_vc_to_cell_corners(uc, vc, cosa, rsina, vt, vt) - vb = vb * dt + _, vb = d_sw.interpolate_uc_vc_to_cell_corners( + uc, vc, cosa, rsina, vt, vt, dt4, dt5 + ) class TranslateVbKE(TranslateDycoreFortranData2Py): @@ -218,7 +221,7 @@ def compute_from_storage(self, inputs): # TODO add these to the serialized data or remove the test inputs["damp_w"] = column_namelist["damp_w"] inputs["ke_bg"] = column_namelist["ke_bg"] - inputs["dt"] = ( + inputs["dt"] = Float( self.namelist.dt_atmos / self.namelist.k_split / self.namelist.n_split ) inputs["rarea"] = self.grid.rarea diff --git a/tests/savepoint/translate/translate_del2cubed.py b/tests/savepoint/translate/translate_del2cubed.py index 07bf24c2..cc33515c 100644 --- a/tests/savepoint/translate/translate_del2cubed.py +++ b/tests/savepoint/translate/translate_del2cubed.py @@ -1,6 +1,7 @@ from typing import Any, Dict -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import HyperdiffusionDamping from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_del6vtflux.py b/tests/savepoint/translate/translate_del6vtflux.py index f163c83b..ae500403 100644 --- a/tests/savepoint/translate/translate_del6vtflux.py +++ b/tests/savepoint/translate/translate_del6vtflux.py @@ -1,5 +1,6 @@ import pyFV3.stencils.delnflux as delnflux -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import Z_DIM from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_delnflux.py b/tests/savepoint/translate/translate_delnflux.py index 1db0b33c..4ca8bbc9 100644 --- a/tests/savepoint/translate/translate_delnflux.py +++ b/tests/savepoint/translate/translate_delnflux.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import Z_DIM from pyFV3.stencils import delnflux from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_divergencedamping.py b/tests/savepoint/translate/translate_divergencedamping.py index fbc7d6b5..8d5cbedb 100644 --- a/tests/savepoint/translate/translate_divergencedamping.py +++ b/tests/savepoint/translate/translate_divergencedamping.py @@ -1,6 +1,7 @@ from typing import Optional -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import Z_DIM from pyFV3.stencils import DivergenceDamping from pyFV3.testing import TranslateDycoreFortranData2Py @@ -60,13 +61,3 @@ def compute_from_storage(self, inputs): ) self.divdamp(**inputs) return inputs - - def 
subset_output(self, varname: str, output): - """ - Given an output array, return the slice of the array which we'd - like to validate against reference data - """ - if self.divdamp is None: - raise RuntimeError("must call compute_from_storage before subset_output") - else: - return self.divdamp.subset_output(varname, output) # type: ignore diff --git a/tests/savepoint/translate/translate_fillz.py b/tests/savepoint/translate/translate_fillz.py index c08b5323..eb2377be 100644 --- a/tests/savepoint/translate/translate_fillz.py +++ b/tests/savepoint/translate/translate_fillz.py @@ -1,11 +1,12 @@ import numpy as np -import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.stencils.testing import pad_field_in_j -from ndsl.utils import safe_assign_array -from pyFV3.stencils import fillz +from pyFV3.stencils.fillz import FillNegativeTracerValues from pyFV3.testing import TranslateDycoreFortranData2Py +from pyFV3.tracers import setup_tracers +from ndsl.quantity.field_bundle import FieldBundle class TranslateFillz(TranslateDycoreFortranData2Py): @@ -33,26 +34,27 @@ def __init__( self.max_error = 1e-13 self.ignore_near_zero_errors = {"q2tracers": True} self.stencil_factory = stencil_factory + self._quantity_factory = grid.quantity_factory - def make_storage_data_input_vars(self, inputs, storage_vars=None): - if storage_vars is None: - storage_vars = self.storage_vars() + def make_storage_data_input_vars(self, inputs, tracers: FieldBundle): + storage_vars = self.storage_vars() info = storage_vars["dp2"] inputs["dp2"] = self.make_storage_data( np.squeeze(inputs["dp2"]), istart=info["istart"], axis=info["axis"] ) - inputs["tracers"] = {} info = storage_vars["q2tracers"] - for i in range(int(inputs["nq"])): - inputs["tracers"][utils.tracer_variables[i]] = self.make_storage_data( - np.squeeze(inputs["q2tracers"][:, :, i]), - istart=info["istart"], - axis=info["axis"], - ) + tracers.quantity.field[:, :, :, :] = inputs["q2tracers"][:, np.newaxis, :, :] del inputs["q2tracers"] def compute(self, inputs): - self.make_storage_data_input_vars(inputs) + tracers = setup_tracers( + number_of_tracers=inputs["q2tracers"].shape[2], + quantity_factory=self._quantity_factory, + ) + + self.make_storage_data_input_vars(inputs, tracers) + inputs["tracers"] = tracers + for name, value in tuple(inputs.items()): if hasattr(value, "shape") and len(value.shape) > 1 and value.shape[1] == 1: inputs[name] = self.make_storage_data( @@ -60,28 +62,13 @@ def compute(self, inputs): value, self.grid.njd, backend=self.stencil_factory.backend ) ) - for name, value in tuple(inputs["tracers"].items()): - if hasattr(value, "shape") and len(value.shape) > 1 and value.shape[1] == 1: - inputs["tracers"][name] = self.make_storage_data( - pad_field_in_j( - value, self.grid.njd, backend=self.stencil_factory.backend - ) - ) - run_fillz = fillz.FillNegativeTracerValues( + inputs.pop("nq") + fillz = FillNegativeTracerValues( self.stencil_factory, self.grid.quantity_factory, - inputs.pop("nq"), - inputs["tracers"], ) - run_fillz(**inputs) + fillz(**inputs) ds = self.grid.default_domain_dict() ds.update(self.out_vars["q2tracers"]) - tracers = np.zeros((self.grid.nic, self.grid.npz, len(inputs["tracers"]))) - for varname, data in inputs["tracers"].items(): - index = utils.tracer_variables.index(varname) - data[self.grid.slice_dict(ds)] - safe_assign_array( - tracers[:, :, index], np.squeeze(data[self.grid.slice_dict(ds)]) - ) - out = {"q2tracers": tracers} + 
out = {"q2tracers": tracers.quantity.field[:, 0, :, :]} return out diff --git a/tests/savepoint/translate/translate_fvsubgridz.py b/tests/savepoint/translate/translate_fvsubgridz.py index 726bb7be..c427f984 100644 --- a/tests/savepoint/translate/translate_fvsubgridz.py +++ b/tests/savepoint/translate/translate_fvsubgridz.py @@ -1,7 +1,8 @@ from types import SimpleNamespace import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import X_DIM, Y_DIM, Z_DIM, Z_INTERFACE_DIM from ndsl.stencils.testing import ParallelTranslateBaseSlicing from pyFV3 import DryConvectiveAdjustment diff --git a/tests/savepoint/translate/translate_fvtp2d.py b/tests/savepoint/translate/translate_fvtp2d.py index 81a0cff4..55762dda 100644 --- a/tests/savepoint/translate/translate_fvtp2d.py +++ b/tests/savepoint/translate/translate_fvtp2d.py @@ -1,5 +1,6 @@ import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import Z_DIM from ndsl.dsl.typing import Float from pyFV3.stencils import FiniteVolumeTransport diff --git a/tests/savepoint/translate/translate_fxadv.py b/tests/savepoint/translate/translate_fxadv.py index 76afe3b2..324d9e1d 100644 --- a/tests/savepoint/translate/translate_fxadv.py +++ b/tests/savepoint/translate/translate_fxadv.py @@ -1,6 +1,7 @@ import numpy as np -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import X_DIM, Y_DIM, Z_DIM from pyFV3.stencils import FiniteVolumeFluxPrep from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_getMPIprop.py b/tests/savepoint/translate/translate_getMPIprop.py new file mode 100644 index 00000000..561c8b5c --- /dev/null +++ b/tests/savepoint/translate/translate_getMPIprop.py @@ -0,0 +1,64 @@ +import numpy as np + +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.quantity import Quantity +from ndsl.stencils.testing import ParallelTranslate +from ndsl.stencils.testing.grid import Grid +from ndsl.typing import Communicator + + +class TranslateGetMPIProp(ParallelTranslate): + def __init__( + self, + grid: Grid, + namelist: Namelist, + stencil_factory: StencilFactory, + ): + print("Base TranslateGetMPIProp is initialized") + super().__init__(grid, namelist, stencil_factory) + self._base.in_vars["data_vars"] = {"delz": {}} + self._base.out_vars = {"delz": {}} + + len_k = 10 + + a = [1, 2, 3, 4, 5] + + self._testQuantity_1D = Quantity( + data=np.array(a, dtype=np.float32), + dims=["K"], + units="dunno", + gt4py_backend=stencil_factory.backend, + ) + + self._testQuantity_2D = Quantity( + data=np.ones([5, 5], dtype=np.float32), + dims=["I", "J"], + units="dunno2", + gt4py_backend=stencil_factory.backend, + ) + + self._testQuantity_3D = Quantity( + data=np.ones([3, 3, 3], dtype=np.float32), + dims=["I", "J", "K"], + units="dunno3", + gt4py_backend=stencil_factory.backend, + ) + + def compute_parallel(self, inputs, communicator: Communicator): + print("Communicator rank = ", communicator.rank) + print("Communicator size = ", communicator.size) + print("self._testQuantity = ", self._testQuantity_1D.data) + global_sum_q = communicator.all_reduce_sum(self._testQuantity_1D) + print("global_sum_q.data = ", global_sum_q.data) + print("global_sum_q.metadata = ", global_sum_q.metadata) + + global_sum_q = 
communicator.all_reduce_sum(self._testQuantity_2D) + print("global_sum_q.data = ", global_sum_q.data) + print("global_sum_q.metadata = ", global_sum_q.metadata) + + global_sum_q = communicator.all_reduce_sum(self._testQuantity_3D) + print("global_sum_q.data = ", global_sum_q.data) + print("global_sum_q.metadata = ", global_sum_q.metadata) + + return inputs diff --git a/tests/savepoint/translate/translate_grid.py b/tests/savepoint/translate/translate_grid.py index 25d77b21..3e7f94ff 100644 --- a/tests/savepoint/translate/translate_grid.py +++ b/tests/savepoint/translate/translate_grid.py @@ -4,7 +4,8 @@ import pytest import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import ( X_DIM, X_INTERFACE_DIM, diff --git a/tests/savepoint/translate/translate_haloupdate.py b/tests/savepoint/translate/translate_haloupdate.py index 12b078d7..1591edef 100644 --- a/tests/savepoint/translate/translate_haloupdate.py +++ b/tests/savepoint/translate/translate_haloupdate.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import ( N_HALO_DEFAULT, X_DIM, diff --git a/tests/savepoint/translate/translate_init_case.py b/tests/savepoint/translate/translate_init_case.py index 9cd87f03..41391883 100644 --- a/tests/savepoint/translate/translate_init_case.py +++ b/tests/savepoint/translate/translate_init_case.py @@ -4,11 +4,11 @@ import pytest import ndsl.constants as constants -import ndsl.dsl.gt4py_utils as utils import pyFV3.initialization.analytic_init as analytic_init import pyFV3.initialization.init_utils as init_utils import pyFV3.initialization.test_cases.initialize_baroclinic as baroclinic_init -from ndsl import Namelist, Quantity, QuantityFactory, StencilFactory, SubtileGridSizer +from ndsl import Quantity, QuantityFactory, StencilFactory, SubtileGridSizer +from f90nml import Namelist from ndsl.constants import ( N_HALO_DEFAULT, X_DIM, @@ -20,7 +20,6 @@ ) from ndsl.grid import GridData, MetricTerms from ndsl.stencils.testing import ParallelTranslateBaseSlicing -from ndsl.stencils.testing.grid import TRACER_DIM # type: ignore from pyFV3.testing import TranslateDycoreFortranData2Py @@ -112,7 +111,7 @@ class TranslateInitCase(ParallelTranslateBaseSlicing): }, "q4d": { "name": "tracers", - "dims": [X_DIM, Y_DIM, Z_DIM, TRACER_DIM], + "dims": [X_DIM, Y_DIM, Z_DIM, "tracers"], "units": "kg/kg", }, } @@ -166,6 +165,10 @@ def __init__( self.ignore_near_zero_errors[var] = {"near_zero": 2e-13} self.namelist = namelist # type: ignore self.stencil_factory = stencil_factory + self._quantity_factory = QuantityFactory.from_backend( + sizer=stencil_factory.grid_indexing._sizer, + backend=stencil_factory.backend, + ) def compute_sequential(self, *args, **kwargs): pytest.skip( @@ -177,10 +180,8 @@ def outputs_from_state(self, state: dict): outputs = {} arrays = {} for name, properties in self.outputs.items(): - if isinstance(state[name], dict): - for tracer, quantity in state[name].items(): - state[name][tracer] = state[name][tracer].data - arrays[name] = state[name] + if name == "q4d": + arrays[name] = state["tracers"].as_4D_array() elif len(self.outputs[name]["dims"]) > 0: arrays[name] = state[name].data else: @@ -229,7 +230,6 @@ def compute_parallel(self, inputs, communicator): ) grid_data = GridData.new_from_metric_terms(metric_terms) - quantity_factory = QuantityFactory() state = analytic_init.init_analytic_state( 
analytic_init_case="baroclinic", @@ -241,9 +241,6 @@ def compute_parallel(self, inputs, communicator): comm=communicator, ) - state.q4d = {} - for tracer in utils.tracer_variables: - state.q4d[tracer] = getattr(state, tracer) return self.outputs_from_state(state.__dict__) diff --git a/tests/savepoint/translate/translate_lagrangian_contribution_interp.py b/tests/savepoint/translate/translate_lagrangian_contribution_interp.py new file mode 100644 index 00000000..e65753a1 --- /dev/null +++ b/tests/savepoint/translate/translate_lagrangian_contribution_interp.py @@ -0,0 +1,171 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.dsl.typing import Bool, BoolFieldIJ, FloatField, Int, IntField, IntFieldIJ +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import lagrangian_contributions_interp + + +class test_Lagragian_Contribution_Interp: + def __init__( + self, + stencil_factory: StencilFactory, + grid: Grid, + ): + print("In test_Lagragian_Contribution_interp") + + grid_indexing = stencil_factory.grid_indexing + + self._lagrangian_contributions_interp = stencil_factory.from_origin_domain( + func=lagrangian_contributions_interp, + origin=grid_indexing.origin_compute(), + domain=(grid.nic, 1, grid.npz), + ) + + def __call__( + self, + km: int, + not_exit_loop: BoolFieldIJ, + INDEX_LM1: IntField, + INDEX_LP0: IntField, + q: FloatField, + pe1: FloatField, + pe2: FloatField, + q4_1: FloatField, + q4_2: FloatField, + q4_3: FloatField, + q4_4: FloatField, + dp1: FloatField, + lev: IntFieldIJ, + ): + self._lagrangian_contributions_interp( + km, + not_exit_loop, + INDEX_LM1, + INDEX_LP0, + q, + pe1, + pe2, + q4_1, + q4_2, + q4_3, + q4_4, + dp1, + lev, + ) + + +class TranslateLagrangian_Contribution_Interp(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.compute_func = test_Lagragian_Contribution_Interp( + self.stencil_factory, self.grid + ) # type: ignore + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "q1": { + "kend": grid.npz - 1, + }, + "pe1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "q4_1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_3": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "dp1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + + self.out_vars = { + "q1": { + "kend": grid.npz - 1, + }, + } + + def compute_from_storage(self, inputs): + self._not_exit_loop = self.quantity_factory.zeros( + [X_DIM, Y_DIM], + units="", + dtype=Bool, + ) + + self._INDEX_LM1 = self.quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="", + dtype=Int, + ) + + self._INDEX_LP0 = self.quantity_factory.zeros( + [X_DIM, 
Y_DIM, Z_DIM], + units="", + dtype=Int, + ) + + self._lev = self.quantity_factory.zeros( + [X_DIM, Y_DIM], + units="", + dtype=Int, + ) + + self.compute_func( + self.grid.npz, + self._not_exit_loop, + self._INDEX_LM1, + self._INDEX_LP0, + inputs["q1"], + inputs["pe1_"], + inputs["pe2_"], + inputs["q4_1"], + inputs["q4_2"], + inputs["q4_3"], + inputs["q4_4"], + inputs["dp1_"], + self._lev, + ) + + return inputs diff --git a/tests/savepoint/translate/translate_last_step.py b/tests/savepoint/translate/translate_last_step.py index 1d2d7525..c6a9a4df 100644 --- a/tests/savepoint/translate/translate_last_step.py +++ b/tests/savepoint/translate/translate_last_step.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import moist_cv from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_map1_ppm_W.py b/tests/savepoint/translate/translate_map1_ppm_W.py new file mode 100644 index 00000000..07ca73da --- /dev/null +++ b/tests/savepoint/translate/translate_map1_ppm_W.py @@ -0,0 +1,71 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import MapSingle + + +class TranslateMap1_PPM_W(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "w_": { + "kend": grid.npz - 1, + }, + "pe1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "ws_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + } + self.in_vars["parameters"] = [ + "kord_wz", + ] + + self.out_vars = { + "w_": { + "kend": grid.npz - 1, + }, + } + + # mode / iv set to -2 from GEOS + self.mode = -2 + + self.dims = [X_DIM, Y_DIM, Z_DIM] + + def compute_from_storage(self, inputs): + self._compute_func = MapSingle( + self.stencil_factory, + self.quantity_factory, + inputs["kord_wz"], + self.mode, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + self._compute_func( + inputs["w_"], + inputs["pe1_"], + inputs["pe2_"], + qs=inputs["ws_"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_map1_ppm_delz.py b/tests/savepoint/translate/translate_map1_ppm_delz.py new file mode 100644 index 00000000..9384076e --- /dev/null +++ b/tests/savepoint/translate/translate_map1_ppm_delz.py @@ -0,0 +1,120 @@ +from gt4py.cartesian.gtscript import PARALLEL, computation, interval + +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.dsl.typing import FloatField +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import MapSingle + + +def rescale_delz_1( + delz: FloatField, + delp: FloatField, +): + with computation(PARALLEL), interval(...): + delz = -delz / delp + + +def rescale_delz_2( + delz: FloatField, + dp: FloatField, +): + with computation(PARALLEL), interval(...): + delz = -delz * dp + + +class 
TranslateMap1_PPM_delz(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "delz_": { + "kend": grid.npz - 1, + }, + "pe1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "dp2_3d": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "gz_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "delp": {}, + } + self.in_vars["parameters"] = [ + "kord_wz", + ] + + self.out_vars = { + "delz_": { + "kend": grid.npz - 1, + }, + } + + # mode / iv set to 1 from GEOS + self.mode = 1 + + self.dims = [X_DIM, Y_DIM, Z_DIM] + + self._rescale_delz_1 = stencil_factory.from_origin_domain( + rescale_delz_1, + origin=grid.compute_origin(), + domain=(grid.nic, 1, grid.npz), + ) + + self._rescale_delz_2 = stencil_factory.from_origin_domain( + rescale_delz_2, + origin=grid.compute_origin(), + domain=(grid.nic, 1, grid.npz), + ) + + def compute_from_storage(self, inputs): + self._compute_func = MapSingle( + self.stencil_factory, + self.quantity_factory, + inputs["kord_wz"], + self.mode, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + self._rescale_delz_1( + inputs["delz_"], + inputs["delp"], + ) + + self._compute_func( + inputs["delz_"], + inputs["pe1_"], + inputs["pe2_"], + qs=inputs["gz_"], + ) + + self._rescale_delz_2( + inputs["delz_"], + inputs["dp2_3d"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_map_scalar.py b/tests/savepoint/translate/translate_map_scalar.py new file mode 100644 index 00000000..35968971 --- /dev/null +++ b/tests/savepoint/translate/translate_map_scalar.py @@ -0,0 +1,69 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.map_single import MapSingle + + +class TranslateMap_Scalar(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "q1": { + "kend": grid.npz - 1, + }, + "pe1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + } + self.in_vars["parameters"] = [ + "q_min", + ] + + self.out_vars = { + "q1": { + "kend": grid.npz - 1, + }, + } + + # Value from GEOS + self._kord_tm = 9 + + # mode / iv set to 1 from GEOS + self.mode = 1 + + self.dims = [X_DIM, Y_DIM, Z_DIM] + + self._compute_func = MapSingle( + self.stencil_factory, + self.quantity_factory, + self._kord_tm, + self.mode, + dims=[X_DIM, Y_DIM, Z_DIM], + interpolate_contribution=True, + ) + + def compute_from_storage(self, inputs): + self._compute_func( + inputs["q1"], + inputs["pe1_"], + inputs["pe2_"], + qmin=inputs["q_min"], + ) + return inputs diff --git 
a/tests/savepoint/translate/translate_moistcvpluspkz_2d.py b/tests/savepoint/translate/translate_moistcvpluspkz_2d.py index 54577da2..e7331103 100644 --- a/tests/savepoint/translate/translate_moistcvpluspkz_2d.py +++ b/tests/savepoint/translate/translate_moistcvpluspkz_2d.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.dsl.typing import FloatField from ndsl.stencils.testing import pad_field_in_j from pyFV3.stencils import moist_cv @@ -29,9 +30,6 @@ def __call__( qsnow: FloatField, qice: FloatField, qgraupel: FloatField, - q_con: FloatField, - gz: FloatField, - cvm: FloatField, pkz: FloatField, pt: FloatField, cappa: FloatField, @@ -46,9 +44,6 @@ def __call__( qsnow, qice, qgraupel, - q_con, - gz, - cvm, pkz, pt, cappa, @@ -76,39 +71,18 @@ def __init__( "qrain": {"serialname": "qrain_js"}, "qsnow": {"serialname": "qsnow_js"}, "qgraupel": {"serialname": "qgraupel_js"}, - "gz": {"serialname": "gz1d", "kstart": grid.is_, "axis": 0}, - "cvm": {"kstart": grid.is_, "axis": 0}, "delp": {}, "delz": {}, - "q_con": {}, "pkz": {"istart": grid.is_, "jstart": grid.js}, "pt": {}, "cappa": {}, } - self.write_vars = ["gz", "cvm"] for k, v in self.in_vars["data_vars"].items(): if k not in self.write_vars: v["axis"] = 1 self.in_vars["parameters"] = ["r_vir"] self.out_vars = { - "gz": { - "serialname": "gz1d", - "istart": grid.is_, - "iend": grid.ie, - "jstart": grid.js, - "jend": grid.js, - "kstart": grid.npz - 1, - "kend": grid.npz - 1, - }, - "cvm": { - "istart": grid.is_, - "iend": grid.ie, - "jstart": grid.js, - "jend": grid.js, - "kstart": grid.npz - 1, - "kend": grid.npz - 1, - }, "pkz": { "istart": grid.is_, "iend": grid.ie, @@ -116,7 +90,6 @@ def __init__( "jend": grid.je, }, "cappa": {}, - "q_con": {}, } def compute_from_storage(self, inputs): diff --git a/tests/savepoint/translate/translate_moistcvpluspt_2d.py b/tests/savepoint/translate/translate_moistcvpluspt_2d.py index c50fda2e..271430c0 100644 --- a/tests/savepoint/translate/translate_moistcvpluspt_2d.py +++ b/tests/savepoint/translate/translate_moistcvpluspt_2d.py @@ -1,7 +1,8 @@ from gt4py.cartesian.gtscript import PARALLEL, computation, interval from ndsl import StencilFactory -from ndsl.dsl.typing import FloatField +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.dsl.typing import Float, FloatField from ndsl.stencils.testing import TranslateFortranData2Py, pad_field_in_j from pyFV3.stencils import moist_cv @@ -18,7 +19,7 @@ def moist_pt( cappa: FloatField, delp: FloatField, delz: FloatField, - r_vir: float, + r_vir: Float, ): with computation(PARALLEL), interval(...): cvm, gz, q_con, cappa, pt = moist_cv.moist_pt_func( @@ -53,6 +54,12 @@ def __init__( domain=(grid.nic, 1, grid.npz), ) + self._q_con = grid.quantity_factory.zeros( + [X_DIM, Y_DIM, Z_DIM], + units="unknown", + dtype=Float, + ) + def __call__( self, qvapor: FloatField, @@ -61,7 +68,7 @@ def __call__( qsnow: FloatField, qice: FloatField, qgraupel: FloatField, - q_con: FloatField, + # q_con: FloatField, pt: FloatField, cappa: FloatField, delp: FloatField, @@ -75,7 +82,8 @@ def __call__( qsnow, qice, qgraupel, - q_con, + # q_con, + self._q_con, pt, cappa, delp, @@ -98,7 +106,7 @@ def __init__(self, grid, namelist, stencil_factory): "qgraupel": {"serialname": "qgraupel_js"}, "delp": {}, "delz": {}, - "q_con": {}, + # "q_con": {}, "pt": {}, "cappa": {}, } @@ -111,7 +119,7 @@ def __init__(self, grid, namelist, stencil_factory): self.out_vars = { "pt": {}, "cappa": {}, - "q_con": {}, + # 
"q_con": {}, } def compute_from_storage(self, inputs): diff --git a/tests/savepoint/translate/translate_moistcvpluspt_2d_last_step.py b/tests/savepoint/translate/translate_moistcvpluspt_2d_last_step.py new file mode 100644 index 00000000..f947444a --- /dev/null +++ b/tests/savepoint/translate/translate_moistcvpluspt_2d_last_step.py @@ -0,0 +1,70 @@ +from ndsl.dsl.typing import Float +from ndsl.stencils.testing import TranslateFortranData2Py +from pyFV3.stencils import moist_cv + + +class TranslateMoistCVPlusPt_2d_last_step(TranslateFortranData2Py): + def __init__(self, grid, namelist, stencil_factory): + super().__init__(grid, namelist, stencil_factory) + self.stencil_factory = stencil_factory + self.in_vars["data_vars"] = { + "qvapor": { + "kend": grid.npz - 1, + }, + "qliquid": { + "kend": grid.npz - 1, + }, + "qice": { + "kend": grid.npz - 1, + }, + "qrain": { + "kend": grid.npz - 1, + }, + "qsnow": { + "kend": grid.npz - 1, + }, + "qgraupel": { + "kend": grid.npz - 1, + }, + "pt": {}, + "pkz": {"istart": grid.is_, "jstart": grid.js}, + } + + self.in_vars["parameters"] = ["r_vir", "dtmp"] + self.out_vars = { + "pt": {}, + } + + self.compute_func = stencil_factory.from_origin_domain( + moist_cv.moist_pt_last_step, + origin=grid.compute_origin(), + domain=(grid.nic, grid.njc, grid.npz), + ) + + self.quantity_factory = grid.quantity_factory + + self._gz = self.quantity_factory._numpy.zeros( + ( + grid.nid, + grid.njd, + grid.npz, + ), + dtype=Float, + ) + + def compute_from_storage(self, inputs): + + self.compute_func( + inputs["qvapor"], + inputs["qliquid"], + inputs["qrain"], + inputs["qsnow"], + inputs["qice"], + inputs["qgraupel"], + # self._gz, + inputs["pt"], + inputs["pkz"], + Float(inputs["dtmp"]), + inputs["r_vir"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_moistcvpluste_2d.py b/tests/savepoint/translate/translate_moistcvpluste_2d.py new file mode 100644 index 00000000..8723bf3a --- /dev/null +++ b/tests/savepoint/translate/translate_moistcvpluste_2d.py @@ -0,0 +1,119 @@ +from ndsl.stencils.testing import TranslateFortranData2Py, pad_field_in_j +from pyFV3.stencils import moist_cv + + +class TranslateMoistCVPlusTe_2d(TranslateFortranData2Py): + def __init__(self, grid, namelist, stencil_factory): + super().__init__(grid, namelist, stencil_factory) + self.stencil_factory = stencil_factory + self.in_vars["data_vars"] = { + "qvapor": {"serialname": "qvapor_js"}, + "qliquid": {"serialname": "qliquid_js"}, + "qice": {"serialname": "qice_js"}, + "qrain": {"serialname": "qrain_js"}, + "qsnow": {"serialname": "qsnow_js"}, + "qgraupel": {"serialname": "qgraupel_js"}, + "delp": {}, + "pt": {}, + "phis_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "te_2d_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "u": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed + 1, + "kend": grid.npz, + }, + "v": { + "istart": grid.isd, + "iend": grid.ied + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz, + }, + "w": { + "kend": grid.npz, + }, + "cosa_s": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed, + }, + "rsin2": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed, + }, + "hs": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed, + }, + "delz": {}, + } + self.write_vars = ["qvapor", "qliquid", "qice", "qrain", 
"qsnow", "qgraupel"] + for k, v in self.in_vars["data_vars"].items(): + # if k not in self.write_vars: + if k in self.write_vars: + v["axis"] = 1 + self.in_vars["parameters"] = ["grav"] + self.out_vars = { + "te_2d_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + } + + self.compute_func = stencil_factory.from_origin_domain( + moist_cv.moist_te, + origin=grid.compute_origin(), + domain=(grid.nic, 1, grid.npz + 1), + ) + + def compute_from_storage(self, inputs): + for name, value in inputs.items(): + if hasattr(value, "shape") and len(value.shape) > 1 and value.shape[1] == 1: + inputs[name] = self.make_storage_data( + pad_field_in_j( + value, self.grid.njd, backend=self.stencil_factory.backend + ) + ) + + self.compute_func( + inputs["qvapor"], + inputs["qliquid"], + inputs["qrain"], + inputs["qsnow"], + inputs["qice"], + inputs["qgraupel"], + inputs["u"], + inputs["v"], + inputs["w"], + inputs["te_2d_"], + inputs["pt"], + inputs["phis_"], + inputs["delp"], + inputs["rsin2"], + inputs["cosa_s"], + inputs["hs"], + inputs["delz"], + inputs["grav"], + ) + + return inputs diff --git a/tests/savepoint/translate/translate_mpp_global_sum.py b/tests/savepoint/translate/translate_mpp_global_sum.py new file mode 100644 index 00000000..397b6d2c --- /dev/null +++ b/tests/savepoint/translate/translate_mpp_global_sum.py @@ -0,0 +1,36 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.stencils.testing import ParallelTranslate +from ndsl.stencils.testing.grid import Grid +from ndsl.typing import Communicator +from pyFV3.mpi.mpp_sum import MPPGlobalSum + + +class TranslateMpp_global_sum(ParallelTranslate): + def __init__( + self, + grid: Grid, + namelist: Namelist, + stencil_factory: StencilFactory, + ): + super().__init__(grid, namelist, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + + self._base.in_vars["data_vars"] = { + "inputArray": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "tesum": {}, + } + + self._base.out_vars = {"tesum": {}} + + def compute_parallel(self, inputs, communicator: Communicator): + mpp_sum = MPPGlobalSum(self.stencil_factory, communicator) + inputs["tesum"] = mpp_sum(inputs["inputArray"]) + + return inputs diff --git a/tests/savepoint/translate/translate_neg_adj3.py b/tests/savepoint/translate/translate_neg_adj3.py index f7ba63b1..b7f34ac6 100644 --- a/tests/savepoint/translate/translate_neg_adj3.py +++ b/tests/savepoint/translate/translate_neg_adj3.py @@ -1,7 +1,8 @@ from typing import Any, Dict import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import AdjustNegativeTracerMixingRatio from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_nh_p_grad.py b/tests/savepoint/translate/translate_nh_p_grad.py index 8220f83a..6ec3b424 100644 --- a/tests/savepoint/translate/translate_nh_p_grad.py +++ b/tests/savepoint/translate/translate_nh_p_grad.py @@ -1,5 +1,6 @@ import pyFV3.stencils.nh_p_grad as NH_P_Grad -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_pe_halo.py b/tests/savepoint/translate/translate_pe_halo.py index 0c5c2e09..a5ed4596 100644 --- a/tests/savepoint/translate/translate_pe_halo.py +++ 
b/tests/savepoint/translate/translate_pe_halo.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import pe_halo from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_pe_pk_delp_peln.py b/tests/savepoint/translate/translate_pe_pk_delp_peln.py new file mode 100644 index 00000000..bbf80271 --- /dev/null +++ b/tests/savepoint/translate/translate_pe_pk_delp_peln.py @@ -0,0 +1,152 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.remapping import pe_pk_delp_peln + + +class TranslatePE_pk_delp_peln(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pe_": { + "istart": grid.is_ - 1, + "iend": grid.ie + 1, + "jstart": grid.js - 1, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "peln_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pn2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pk2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "delp": { + # "istart": grid.isd, + # "iend": grid.ied, + # "jstart": grid.jsd, + # "jend": grid.jed, + # "kend": grid.npz, + }, + "pk": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "ak": {}, + "bk": {}, + } + self.in_vars["parameters"] = [ + "akap", + "ptop", + ] + + self.out_vars = { + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pe_": { + "istart": grid.is_ - 1, + "iend": grid.ie + 1, + "jstart": grid.js - 1, + "jend": grid.je + 1, + "kend": grid.npz + 1, + }, + "peln_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pn2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pk2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "delp": { + # "istart": grid.isd, + # "iend": grid.ied, + # "jstart": grid.jsd, + # "jend": grid.jed, + # "kend": grid.npz, + }, + "pk": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + } + + grid_indexing = stencil_factory.grid_indexing + self._domain_kextra = ( + grid_indexing.domain[0], + 1, + grid_indexing.domain[2] + 1, + ) + + self._pe_pk_delp_peln = stencil_factory.from_origin_domain( + pe_pk_delp_peln, + origin=grid_indexing.origin_compute(), + domain=self._domain_kextra, + ) + + def compute_from_storage(self, inputs): + self._pe_pk_delp_peln( + inputs["pe_"], + inputs["pk"], + inputs["delp"], + inputs["peln_"], + inputs["pe2_"], + inputs["pk2_"], + inputs["pn2_"], + inputs["ak"], + inputs["bk"], + inputs["akap"], + inputs["ptop"], + ) + return inputs diff --git 
a/tests/savepoint/translate/translate_pk3_halo.py b/tests/savepoint/translate/translate_pk3_halo.py index 4f13c02d..35475b1f 100644 --- a/tests/savepoint/translate/translate_pk3_halo.py +++ b/tests/savepoint/translate/translate_pk3_halo.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import PK3Halo from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_pn2_pk_delp.py b/tests/savepoint/translate/translate_pn2_pk_delp.py new file mode 100644 index 00000000..ba29adb4 --- /dev/null +++ b/tests/savepoint/translate/translate_pn2_pk_delp.py @@ -0,0 +1,123 @@ +from ndsl import StencilFactory +from ndsl.dsl.typing import Float, FloatField +from ndsl.stencils.testing import TranslateFortranData2Py +from pyFV3.stencils.remapping import pn2_pk_delp + + +class testClass: + """ + Class to test with DaCe orchestration. test class is PN2_PK_DelP + """ + + def __init__( + self, + stencil_factory: StencilFactory, + grid, + ): + # origin and domain are hard-coded for a C24, 72-level compute domain with a 3-point halo + self._pn2_pk_delp = stencil_factory.from_origin_domain( + func=pn2_pk_delp, + origin=(3, 3, 1), + domain=(24, 24, 71), + ) + + def __call__( + self, + dp2: FloatField, + delp: FloatField, + pe2: FloatField, + pn2: FloatField, + pk: FloatField, + akap: Float, + ): + self._pn2_pk_delp(dp2, delp, pe2, pn2, pk, akap) + + +class TranslatePN2_PK_DelP(TranslateFortranData2Py): + def __init__(self, grid, namelist, stencil_factory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.compute_func = testClass(self.stencil_factory, self.grid) # type: ignore + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pn2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pk_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + } + self.in_vars["parameters"] = [ + "akap", + ] + + self.out_vars = { + "pe2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pn2_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "pk_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + } + # scratch dp2/delp storages sized to the full C24 domain with 3-point halos (24 + 2*3 + 1 = 31) and npz + 1 levels + self._dp2 = self.quantity_factory._numpy.zeros( + ( + 31, + 31, + 73, + ), + dtype=Float, + ) + + self._delp = self.quantity_factory._numpy.zeros( + ( + 31, + 31, + 73, + ), + dtype=Float, + ) + + def compute_from_storage(self, inputs): + + self.compute_func( + self._dp2, + self._delp, + inputs["pe2_"], + inputs["pn2_"], + inputs["pk_"], + inputs["akap"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_pressureadjustedtemperature_nonhydrostatic.py 
b/tests/savepoint/translate/translate_pressureadjustedtemperature_nonhydrostatic.py index 6ed5e02d..c8e47a9d 100644 --- a/tests/savepoint/translate/translate_pressureadjustedtemperature_nonhydrostatic.py +++ b/tests/savepoint/translate/translate_pressureadjustedtemperature_nonhydrostatic.py @@ -1,6 +1,9 @@ +import numpy as np from typing import Any, Dict -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.dsl.typing import Float from pyFV3 import DynamicalCoreConfig from pyFV3.stencils import temperature_adjust from pyFV3.stencils.dyn_core import get_nk_heat_dissipation @@ -42,7 +45,9 @@ def __init__( self.stencil_factory = stencil_factory def compute_from_storage(self, inputs): - inputs["delt_time_factor"] = abs(inputs["bdt"] * self.namelist.delt_max) + inputs["delt_time_factor"] = np.abs( + inputs["bdt"] * self.namelist.delt_max, dtype=Float + ) del inputs["bdt"] self.compute_func(**inputs) return inputs diff --git a/tests/savepoint/translate/translate_qsinit.py b/tests/savepoint/translate/translate_qsinit.py index e1228ced..6b58393e 100644 --- a/tests/savepoint/translate/translate_qsinit.py +++ b/tests/savepoint/translate/translate_qsinit.py @@ -2,7 +2,8 @@ import ndsl.dsl.gt4py_utils as utils import pyFV3.stencils.saturation_adjustment as satadjust -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_ray_fast.py b/tests/savepoint/translate/translate_ray_fast.py index 32de3d55..70a1abf9 100644 --- a/tests/savepoint/translate/translate_ray_fast.py +++ b/tests/savepoint/translate/translate_ray_fast.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3.stencils import RayleighDamping from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_remapping.py b/tests/savepoint/translate/translate_remapping.py index 43ddb27d..f0dbdfde 100644 --- a/tests/savepoint/translate/translate_remapping.py +++ b/tests/savepoint/translate/translate_remapping.py @@ -1,9 +1,11 @@ import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import QuantityFactory, StencilFactory +from f90nml.namelist import Namelist from ndsl.constants import Z_DIM from pyFV3 import DynamicalCoreConfig from pyFV3.stencils import LagrangianToEulerian from pyFV3.testing import TranslateDycoreFortranData2Py +from pyFV3.tracers import Tracers, TracersType class TranslateRemapping(TranslateDycoreFortranData2Py): @@ -97,6 +99,10 @@ def __init__( self.ignore_near_zero_errors = {"q_con": True, "tracers": True} self.stencil_factory = stencil_factory self.namelist = DynamicalCoreConfig.from_namelist(namelist) + self._quantity_factory = QuantityFactory.from_backend( + sizer=stencil_factory.grid_indexing._sizer, + backend=stencil_factory.backend, + ) def compute_from_storage(self, inputs): wsd_2d = utils.make_storage_from_shape( @@ -104,19 +110,35 @@ def compute_from_storage(self, inputs): ) wsd_2d[:, :] = inputs["wsd"][:, :, 0] inputs["wsd"] = wsd_2d - inputs["q_cld"] = inputs["tracers"]["qcld"] + tracers = Tracers.make_from_4D_array( + quantity_factory=self._quantity_factory, + tracer_mapping=[ + "vapor", + "liquid", + "rain", + "ice", + "snow", + "graupel", + "qo3mr", + "qsgs_tke", + "cloud", + ], + tracer_data=inputs["tracers"], + ) inputs["last_step"] = 
bool(inputs["last_step"]) pfull = self.grid.quantity_factory.zeros([Z_DIM], units="Pa") pfull.data[:] = pfull.np.asarray(inputs.pop("pfull")) + inputs.pop("nq") + inputs["tracers"] = tracers l_to_e_obj = LagrangianToEulerian( self.stencil_factory, quantity_factory=self.grid.quantity_factory, config=DynamicalCoreConfig.from_namelist(self.namelist).remapping, area_64=self.grid.area_64, - nq=inputs.pop("nq"), pfull=pfull, tracers=inputs["tracers"], + exclude_tracers=["cloud"], ) l_to_e_obj(**inputs) - inputs.pop("q_cld") + inputs["tracers"] = tracers.as_4D_array() return inputs diff --git a/tests/savepoint/translate/translate_remapping_GEOS.py b/tests/savepoint/translate/translate_remapping_GEOS.py new file mode 100644 index 00000000..982ba391 --- /dev/null +++ b/tests/savepoint/translate/translate_remapping_GEOS.py @@ -0,0 +1,492 @@ +from types import SimpleNamespace + +from ndsl import StencilFactory, Quantity +from f90nml import Namelist +from ndsl.constants import ( + X_DIM, + X_INTERFACE_DIM, + Y_DIM, + Y_INTERFACE_DIM, + Z_DIM, + Z_INTERFACE_DIM, +) +from ndsl.dsl.typing import Float +from ndsl.stencils.testing import Grid, ParallelTranslateBaseSlicing +from pyFV3 import DynamicalCoreConfig +from pyFV3.stencils.remapping_GEOS import LagrangianToEulerian_GEOS +from pyFV3.tracers import TracersType, setup_tracers + + +class TranslateRemapping_GEOS(ParallelTranslateBaseSlicing): + inputs = { + "pe": { + "name": "pe", + "dims": [X_DIM, Y_DIM, Z_INTERFACE_DIM], + "units": "No Units", + }, + "delp": { + "name": "delp", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "delz": { + "name": "delz", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "q_con": { + "name": "q_con", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "pt": { + "name": "pt", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cappa": { + "name": "cappa", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "ps": { + "name": "ps", + "dims": [X_DIM, Y_DIM], + "units": "No Units", + }, + "peln": { + "name": "peln", + "dims": [X_DIM, Y_DIM, Z_INTERFACE_DIM], + "units": "No Units", + }, + "ak": { + "name": "ak", + "dims": [Z_INTERFACE_DIM], + "units": "No Units", + }, + "bk": { + "name": "bk", + "dims": [Z_INTERFACE_DIM], + "units": "No Units", + }, + "pk": { + "name": "pk", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "pkz": { + "name": "pkz", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "w": { + "name": "w", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "u": { + "name": "u", + "dims": [X_DIM, Y_INTERFACE_DIM, Z_DIM], + "units": "No Units", + }, + "v": { + "name": "v", + "dims": [X_INTERFACE_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "mfy_R4": { + "name": "mfy", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cy_R4": { + "name": "cy", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "mfx_R4": { + "name": "mfx", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cx_R4": { + "name": "cx", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "phis": { + "name": "phis", + "dims": [X_DIM, Y_DIM], + "units": "No Units", + }, + "te_2d": { + "name": "te_2d", + "dims": [X_DIM, Y_DIM], + "units": "No Units", + }, + "wsd": { + "name": "wsd", + "dims": [X_DIM, Y_DIM], + "units": "No Units", + }, + "dp1": { + "name": "dp1", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "pfull": { + "name": "pfull", + "dims": [Z_DIM], + "units": "No Units", + }, + } + 
outputs = { + "pt": { + "name": "pt", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cappa": { + "name": "cappa", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "delp": { + "name": "delp", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "delz": { + "name": "delz", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "w": { + "name": "w", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "u": { + "name": "u", + "dims": [X_DIM, Y_INTERFACE_DIM, Z_DIM], + "units": "No Units", + }, + "v": { + "name": "v", + "dims": [X_INTERFACE_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "mfy_R4": { + "name": "mfy", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cy_R4": { + "name": "cy", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "mfx_R4": { + "name": "mfx", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "cx_R4": { + "name": "cx", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "peln": { + "name": "peln", + "dims": [X_DIM, Y_DIM, Z_INTERFACE_DIM], + "units": "No Units", + }, + "pe": { + "name": "pe", + "dims": [X_DIM, Y_DIM, Z_INTERFACE_DIM], + "units": "No Units", + }, + "pk": { + "name": "pk", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "pkz": { + "name": "pkz", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "q_con": { + "name": "q_con", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "dp1": { + "name": "dp1", + "dims": [X_DIM, Y_DIM, Z_DIM], + "units": "No Units", + }, + "ps": { + "name": "ps", + "dims": [X_DIM, Y_DIM], + "units": "No Units", + }, + } + + def __init__( + self, + grid: Grid, + namelist: Namelist, + stencil_factory: StencilFactory, + ): + super().__init__(grid, namelist, stencil_factory) + + self._base.in_vars["data_vars"] = { + "tracers": {}, + "w": { + "kend": grid.npz - 1, + }, + "u": grid.y3d_domain_dict(), + "v": grid.x3d_domain_dict(), + "delz": {}, + "pt": {}, + "dp1": {}, + "delp": {}, + "cappa": {}, + "q_con": {}, + "pkz": grid.compute_dict(), + "pk": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz + 1, + }, + "peln": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kaxis": 1, + "kend": grid.npz, + }, + "pe": { + "istart": grid.is_ - 1, + "iend": grid.ie + 1, + "jstart": grid.js - 1, + "jend": grid.je + 1, + "kend": grid.npz + 1, + "kaxis": 1, + }, + "ps": {}, + "wsd": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "mfy_R4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz - 1, + }, + "cy_R4": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.js, + "jend": grid.je + 1, + "kend": grid.npz - 1, + }, + "mfx_R4": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "cx_R4": { + "istart": grid.is_, + "iend": grid.ie + 1, + "jstart": grid.jsd, + "jend": grid.jed, + "kend": grid.npz - 1, + }, + "phis": { + "istart": grid.isd, + "iend": grid.ied, + "jstart": grid.jsd, + "jend": grid.jed, + }, + "te_2d": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + # column variables... 
+ "ak": {}, + "bk": {}, + "pfull": grid.compute_buffer_k_dict(), + } + self._base.in_vars["parameters"] = [ + "ptop", + "akap", + "zvir", + "last_step", + "consv_te", + "mdt", + "nq", + ] + self._base.out_vars = {} + for k in [ + "tracers", + "pe", + "pkz", + "pk", + "peln", + "pt", + "cappa", + "delp", + "delz", + "q_con", + "u", + "v", + "w", + "ps", + "dp1", + "mfy_R4", + "cy_R4", + "mfx_R4", + "cx_R4", + ]: + self._base.out_vars[k] = self._base.in_vars["data_vars"][k] + + self.stencil_factory = stencil_factory + self.quantity_factory = grid.quantity_factory + + self.stencil_factory = stencil_factory + self.namelist = DynamicalCoreConfig.from_namelist(namelist) + self.grid = grid + + self._are_tracers_setup = False + + self._tracers = None + + def compute_sequential(self, inputs_list, communicator_list): + print("No serial test available") + + def state_from_inputs(self, inputs: dict, tracers: TracersType) -> SimpleNamespace: + input_storages = super().state_from_inputs(inputs) + # Rename fluxes and courant numbers + input_storages["mfx"] = input_storages.pop("mfx_R4") + input_storages["mfy"] = input_storages.pop("mfy_R4") + input_storages["cx"] = input_storages.pop("cx_R4") + input_storages["cy"] = input_storages.pop("cy_R4") + # Make tracers + input_storages["tracers"] = tracers + return SimpleNamespace(**input_storages) + + def outputs_from_state(self, state: dict): + if len(self.outputs) == 0: + return {} + outputs = {} + storages = {} + for name, properties in self.outputs.items(): + if name in ["mfx_R4", "mfy_R4", "cx_R4", "cy_R4"]: + storages[name] = state[name[:-3]] + elif isinstance(state[name], Quantity): + storages[name] = state[name].data + elif len(self.outputs[name]["dims"]) > 0: + storages[name] = state[name] # assume it's a storage + else: + outputs[name] = state[name] # scalar + # Put tracers + storages["tracers"] = state["tracers"].quantity.data[:-1, :-1, :-1, :] + outputs.update(self._base.slice_output(storages)) + return outputs + + def compute_parallel(self, inputs, communicator): + # tracers_mapping = Tracers.blind_mapping_from_data(inputs["tracers"]) + # tracers_mapping[0] = "vapor" + # tracers_mapping[1] = "liquid" + # tracers_mapping[2] = "rain" + # tracers_mapping[3] = "snow" + # tracers_mapping[4] = "ice" + # tracers_mapping[5] = "graupel" + # tracers_mapping[6] = "cloud" + # tracers = Tracers.make_from_4D_array( + # self.quantity_factory, + # tracers_mapping[0:7], + # inputs["tracers"], + # ) + + if not self._are_tracers_setup: + self._are_tracers_setup = True + self._tracers = setup_tracers( + number_of_tracers=inputs["tracers"].shape[3], + quantity_factory=self.quantity_factory, + mappings={ + "vapor": 0, + "liquid": 1, + "rain": 3, + "snow": 4, + "ice": 2, + "graupel": 5, + "cloud": 6, + }, + ) + + self._tracers.quantity.data[:-1, :-1, :-1, :] = inputs["tracers"] + + inputs["te_2d"] = inputs["te_2d"].astype(Float) + state = self.state_from_inputs(inputs, self._tracers) + + l_to_e = LagrangianToEulerian_GEOS( + self.stencil_factory, + self.quantity_factory, + DynamicalCoreConfig.from_namelist(self.namelist).remapping, + communicator, + self.grid.grid_data, + state.nq, + state.pfull, + state.tracers, + DynamicalCoreConfig.adiabatic, + ) + + l_to_e( + state.tracers, + state.pt, + state.delp, + state.delz, + state.peln, + state.u, + state.v, + state.w, + state.mfx, + state.mfy, + state.cx, + state.cy, + state.cappa, + state.q_con, + state.pkz, + state.pk, + state.pe, + state.phis, + state.te_2d, + state.ps, + state.wsd, + state.ak, + state.bk, + state.dp1, + 
state.ptop, + state.akap, + state.zvir, + state.last_step, + state.consv_te, + state.mdt, + ) + + outputs = self.outputs_from_state(vars(state)) + return outputs diff --git a/tests/savepoint/translate/translate_riem_solver3.py b/tests/savepoint/translate/translate_riem_solver3.py index 76c4a5c3..962ef56f 100644 --- a/tests/savepoint/translate/translate_riem_solver3.py +++ b/tests/savepoint/translate/translate_riem_solver3.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3 import _config as spec from pyFV3.stencils import NonhydrostaticVerticalSolver from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_riem_solver_c.py b/tests/savepoint/translate/translate_riem_solver_c.py index aa193d67..15d29b75 100644 --- a/tests/savepoint/translate/translate_riem_solver_c.py +++ b/tests/savepoint/translate/translate_riem_solver_c.py @@ -1,6 +1,10 @@ -from ndsl import Namelist, StencilFactory +import numpy as np +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM from pyFV3.stencils import NonhydrostaticVerticalSolverCGrid from pyFV3.testing import TranslateDycoreFortranData2Py +from pyFV3.utils.functional_validation import get_subset_func class TranslateRiem_Solver_C(TranslateDycoreFortranData2Py): @@ -31,3 +35,23 @@ def __init__( self.out_vars = {"pef": {"kend": grid.npz}, "gz": {"kend": grid.npz}} self.max_error = 5e-14 self.stencil_factory = stencil_factory + self._subset = get_subset_func( + self.grid.grid_indexing, + dims=[X_DIM, Y_DIM, Z_DIM], + n_halo=((3, 3), (3, 3)), + ) + + def compute(self, inputs): + outputs = super().compute(inputs) + outputs["gz"] = self.subset_output("gz", outputs["gz"]) + return outputs + + def subset_output(self, varname: str, output: np.ndarray) -> np.ndarray: + """ + Given an output array, return the slice of the array which we'd + like to validate against reference data + """ + if varname in ["gz", "pef"]: + return self._subset(output) + else: + return output diff --git a/tests/savepoint/translate/translate_satadjust3d.py b/tests/savepoint/translate/translate_satadjust3d.py index 99f33b55..f335e519 100644 --- a/tests/savepoint/translate/translate_satadjust3d.py +++ b/tests/savepoint/translate/translate_satadjust3d.py @@ -1,4 +1,5 @@ -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from pyFV3 import DynamicalCoreConfig from pyFV3.stencils import SatAdjust3d from pyFV3.testing import TranslateDycoreFortranData2Py diff --git a/tests/savepoint/translate/translate_scalar_profile.py b/tests/savepoint/translate/translate_scalar_profile.py new file mode 100644 index 00000000..9a9c8150 --- /dev/null +++ b/tests/savepoint/translate/translate_scalar_profile.py @@ -0,0 +1,119 @@ +from ndsl import StencilFactory +from f90nml import Namelist +from ndsl.constants import X_DIM, Y_DIM, Z_DIM +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.remap_profile import RemapProfile + + +class TranslateScalar_Profile(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist: Namelist, stencil_factory: StencilFactory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.in_vars["data_vars"] = { + "qs_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": 
grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_3": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "dp1_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + self.in_vars["parameters"] = [ + "q_min", + ] + + self.out_vars = { + "q4_1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_2": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_3": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + "q4_4": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz - 1, + }, + } + + # Value from GEOS + self.kord = 9 + + # mode / iv set to 1 from GEOS + self.mode = 1 + + self._compute_func = RemapProfile( + self.stencil_factory, + self.quantity_factory, + self.kord, + self.mode, + dims=[X_DIM, Y_DIM, Z_DIM], + ) + + def compute_from_storage(self, inputs): + self._compute_func( + inputs["qs_"], + inputs["q4_1"], + inputs["q4_2"], + inputs["q4_3"], + inputs["q4_4"], + inputs["dp1_"], + inputs["q_min"], + ) + return inputs diff --git a/tests/savepoint/translate/translate_te_zsum.py b/tests/savepoint/translate/translate_te_zsum.py new file mode 100644 index 00000000..2b39e1a5 --- /dev/null +++ b/tests/savepoint/translate/translate_te_zsum.py @@ -0,0 +1,70 @@ +from ndsl.stencils.testing import TranslateFortranData2Py +from pyFV3.stencils import moist_cv + + +class TranslateTe_Zsum(TranslateFortranData2Py): + def __init__(self, grid, namelist, stencil_factory): + super().__init__(grid, namelist, stencil_factory) + self.stencil_factory = stencil_factory + self.in_vars["data_vars"] = { + "delp": { + "kend": grid.npz, + }, + "te_2d_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "te0_2d_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "zsum1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "pkz": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + "kend": grid.npz, + }, + } + self.out_vars = { + "te_2d_": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + "zsum1": { + "istart": grid.is_, + "iend": grid.ie, + "jstart": grid.js, + "jend": grid.je, + }, + } + + self.compute_func = stencil_factory.from_origin_domain( + moist_cv.te_zsum, + origin=grid.compute_origin(), + domain=(grid.nic, 1, grid.npz), + ) + + def compute_from_storage(self, inputs): + + self.compute_func( + inputs["te_2d_"], + inputs["te0_2d_"], + inputs["delp"], + inputs["pkz"], + inputs["zsum1"], + ) + + return inputs diff --git a/tests/savepoint/translate/translate_tracer2d1l.py b/tests/savepoint/translate/translate_tracer2d1l.py index f3ad0f70..5be216b9 100644 --- a/tests/savepoint/translate/translate_tracer2d1l.py +++ b/tests/savepoint/translate/translate_tracer2d1l.py @@ -1,11 +1,13 @@ import 
pytest -import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import QuantityFactory, StencilFactory +from f90nml.namelist import Namelist from ndsl.constants import X_DIM, Y_DIM, Z_DIM from ndsl.stencils.testing import ParallelTranslate from pyFV3.stencils import FiniteVolumeTransport, TracerAdvection +from pyFV3.tracers import TracersType, setup_tracers from pyFV3.utils.functional_validation import get_subset_func +from pyFV3 import DynamicalCoreConfig class TranslateTracer2D1L(ParallelTranslate): @@ -26,38 +28,41 @@ def __init__( self._base.in_vars["data_vars"] = { "tracers": {}, "dp1": {}, - "mfxd": grid.x3d_compute_dict(), - "mfyd": grid.y3d_compute_dict(), - "cxd": grid.x3d_compute_domain_y_dict(), - "cyd": grid.y3d_compute_domain_x_dict(), + "mfxd_R4": grid.x3d_compute_dict(), + "mfyd_R4": grid.y3d_compute_dict(), + "cxd_R4": grid.x3d_compute_domain_y_dict(), + "cyd_R4": grid.y3d_compute_domain_x_dict(), } self._base.in_vars["parameters"] = ["nq"] self._base.out_vars = self._base.in_vars["data_vars"] self.stencil_factory = stencil_factory - self.namelist = namelist + self._quantity_factory = QuantityFactory.from_backend( + sizer=stencil_factory.grid_indexing._sizer, + backend=stencil_factory.backend, + ) self._subset = get_subset_func( self.grid.grid_indexing, dims=[X_DIM, Y_DIM, Z_DIM], n_halo=((0, 0), (0, 0)), ) - - def collect_input_data(self, serializer, savepoint): - input_data = self._base.collect_input_data(serializer, savepoint) - return input_data + self.config = DynamicalCoreConfig.from_f90nml(namelist) def compute_parallel(self, inputs, communicator): - self._base.make_storage_data_input_vars(inputs) - all_tracers = inputs["tracers"] - inputs["tracers"] = self.get_advected_tracer_dict( - inputs["tracers"], int(inputs.pop("nq")) + self._base.make_storage_data_input_vars(inputs, dict_4d=False) + tracers = setup_tracers( + number_of_tracers=inputs["tracers"].shape[3], + quantity_factory=self._quantity_factory, ) + tracers.quantity.data[:] = inputs["tracers"][:] + inputs.pop("tracers") + inputs.pop("nq") # Fortran NQ is intrinsic to Tracers (e.g Tracers.count) transport = FiniteVolumeTransport( stencil_factory=self.stencil_factory, quantity_factory=self.grid.quantity_factory, grid_data=self.grid.grid_data, damping_coefficients=self.grid.damping_coefficients, grid_type=self.grid.grid_type, - hord=self.namelist.hord_tr, + hord=self.config.hord_tr, ) self.tracer_advection = TracerAdvection( @@ -66,39 +71,25 @@ def compute_parallel(self, inputs, communicator): transport, self.grid.grid_data, communicator, - inputs["tracers"], + tracers, + update_mass_courant=False, ) - inputs["x_mass_flux"] = inputs.pop("mfxd") - inputs["y_mass_flux"] = inputs.pop("mfyd") - inputs["x_courant"] = inputs.pop("cxd") - inputs["y_courant"] = inputs.pop("cyd") - self.tracer_advection(**inputs) - inputs["mfxd"] = inputs.pop("x_mass_flux") - inputs["mfyd"] = inputs.pop("y_mass_flux") - inputs["cxd"] = inputs.pop("x_courant") - inputs["cyd"] = inputs.pop("y_courant") - inputs[ - "tracers" - ] = all_tracers # some aren't advected, still need to be validated - # need to convert tracers dict to [x, y, z, n_tracer] array before subsetting + inputs["x_mass_flux"] = inputs.pop("mfxd_R4") + inputs["y_mass_flux"] = inputs.pop("mfyd_R4") + inputs["x_courant"] = inputs.pop("cxd_R4") + inputs["y_courant"] = inputs.pop("cyd_R4") + self.tracer_advection(tracers=tracers, **inputs) + inputs["mfxd_R4"] = inputs.pop("x_mass_flux") + inputs["mfyd_R4"] = inputs.pop("y_mass_flux") + 
inputs["cxd_R4"] = inputs.pop("x_courant") + inputs["cyd_R4"] = inputs.pop("y_courant") + inputs["tracers"] = tracers.quantity.field outputs = self._base.slice_output(inputs) - outputs["tracers"] = self.subset_output("tracers", outputs["tracers"]) + # outputs["tracers"] = self.subset_output("tracers", outputs["tracers"]) + # outputs["tracers"] = tracers.quantity.field[:] return outputs - def get_advected_tracer_dict(self, all_tracers, nq): - all_tracers = {**all_tracers} # make a new dict so we don't modify the input - properties = self.inputs["tracers"] - for name in utils.tracer_variables: - self.grid.quantity_dict_update( - all_tracers, - name, - dims=properties["dims"], - units=properties["units"], - ) - tracer_names = utils.tracer_variables[:nq] - return {name: all_tracers[name + "_quantity"] for name in tracer_names} - - def compute_sequential(self, a, b): + def compute_sequential(self, inputs_list, communicator_list): pytest.skip( f"{self.__class__} only has a mpirun implementation, " "not running in mock-parallel" diff --git a/tests/savepoint/translate/translate_tracer2d1l_cmax.py b/tests/savepoint/translate/translate_tracer2d1l_cmax.py new file mode 100644 index 00000000..0cde88e1 --- /dev/null +++ b/tests/savepoint/translate/translate_tracer2d1l_cmax.py @@ -0,0 +1,88 @@ +from ndsl import StencilFactory, QuantityFactory, Quantity +from f90nml import Namelist +from ndsl.stencils.testing import ParallelTranslate2Py +from pyFV3.stencils.tracer_2d_1l import TracerCMax +from ndsl.constants import X_DIM, X_INTERFACE_DIM, Y_DIM, Y_INTERFACE_DIM, Z_DIM + + +def _quantity_wrap(storage, dims, grid_indexing): + origin, extent = grid_indexing.get_origin_domain(dims) + return Quantity( + storage, + dims=dims, + units="unknown", + origin=origin, + extent=extent, + ) + + +class TranslateTracerCMax(ParallelTranslate2Py): + inputs = { + "cx_R4": { + "name": "cx_R4", + "dims": [X_INTERFACE_DIM, Y_DIM, Z_DIM], + "units": "unitless", + }, + "cy_R4": { + "name": "cy_R4", + "dims": [X_DIM, Y_INTERFACE_DIM, Z_DIM], + "units": "unitless", + }, + "cmax": { + "name": "cmaxgrid", + "dims": [Z_DIM], + "units": "unitless", + }, + } + + def __init__( + self, + grid, + namelist: Namelist, + stencil_factory: StencilFactory, + ): + super().__init__(grid, namelist, stencil_factory) + self._base.in_vars["data_vars"] = { + "cx_R4": grid.x3d_compute_domain_y_dict(), + "cy_R4": grid.y3d_compute_domain_x_dict(), + "cmax": {}, + } + self._base.out_vars = { + "cmax": {}, + } + self._stencil_factory = stencil_factory + self._grid_data = grid + self._quantity_factory = QuantityFactory.from_backend( + sizer=stencil_factory.grid_indexing._sizer, + backend=stencil_factory.backend, + ) + + def compute_parallel(self, inputs, communicator): + self._base.make_storage_data_input_vars(inputs) + tracer_cmax = TracerCMax( + stencil_factory=self._stencil_factory, + quantity_factory=self._quantity_factory, + grid_data=self._grid_data, + comm=communicator, + ) + cx_quantity = _quantity_wrap( + inputs["cx_R4"], + self.inputs["cx_R4"]["dims"], + self.grid.grid_indexing, + ) + cy_quantity = _quantity_wrap( + inputs["cy_R4"], + self.inputs["cy_R4"]["dims"], + self.grid.grid_indexing, + ) + cmax_quantity = _quantity_wrap( + inputs["cmax"], + self.inputs["cmax"]["dims"], + self.grid.grid_indexing, + ) + tracer_cmax( + cx=cx_quantity, + cy=cy_quantity, + cmax=cmax_quantity, + ) + return self._base.slice_output(inputs) diff --git a/tests/savepoint/translate/translate_updatedzc.py b/tests/savepoint/translate/translate_updatedzc.py index 
9553a512..ded74351 100644 --- a/tests/savepoint/translate/translate_updatedzc.py +++ b/tests/savepoint/translate/translate_updatedzc.py @@ -1,6 +1,7 @@ import numpy as np -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import X_DIM, Y_DIM, Z_DIM from pyFV3.stencils import UpdateGeopotentialHeightOnCGrid from pyFV3.testing import TranslateDycoreFortranData2Py @@ -16,15 +17,17 @@ def __init__( ): super().__init__(grid, namelist, stencil_factory) self.stencil_factory = stencil_factory - update_gz_on_c_grid = UpdateGeopotentialHeightOnCGrid( - self.stencil_factory, - quantity_factory=self.grid.quantity_factory, - area=grid.grid_data.area, - dp_ref=grid.grid_data.dp_ref, - grid_type=namelist.grid_type, - ) def compute(**kwargs): + update_gz_on_c_grid = UpdateGeopotentialHeightOnCGrid( + self.stencil_factory, + quantity_factory=self.grid.quantity_factory, + area=grid.grid_data.area, + dp_ref=grid.grid_data.dp_ref, + grid_type=namelist.grid_type, + dz_min=kwargs.pop("dz_min"), + ) + kwargs["dt"] = kwargs.pop("dt2") update_gz_on_c_grid(**kwargs) @@ -36,7 +39,7 @@ def compute(**kwargs): "gz": {}, "ws": {}, } - self.in_vars["parameters"] = ["dt2"] + self.in_vars["parameters"] = ["dt2", "dz_min"] self.out_vars = { "gz": grid.default_buffer_k_dict(), "ws": {"kstart": -1, "kend": None}, @@ -44,12 +47,12 @@ def compute(**kwargs): self._subset = get_subset_func( self.grid.grid_indexing, dims=[X_DIM, Y_DIM, Z_DIM], - n_halo=((0, 0), (0, 0)), + n_halo=((3, 3), (3, 3)), ) self._subset_2d = get_subset_func( self.grid.grid_indexing, dims=[X_DIM, Y_DIM], - n_halo=((0, 0), (0, 0)), + n_halo=((3, 3), (3, 3)), ) def compute(self, inputs): diff --git a/tests/savepoint/translate/translate_updatedzd.py b/tests/savepoint/translate/translate_updatedzd.py index 75800895..1921914a 100644 --- a/tests/savepoint/translate/translate_updatedzd.py +++ b/tests/savepoint/translate/translate_updatedzd.py @@ -1,6 +1,7 @@ import numpy as np -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.constants import X_DIM, Y_DIM, Z_DIM from pyFV3 import DynamicalCoreConfig from pyFV3.stencils import UpdateHeightOnDGrid, d_sw @@ -33,7 +34,7 @@ def __init__( self.in_vars["data_vars"]["height"]["serialname"] = "zh" self.in_vars["data_vars"]["ws"]["serialname"] = "wsd" - self.in_vars["parameters"] = ["dt"] + self.in_vars["parameters"] = ["dt", "dz_min"] out_vars = [ "height", "courant_number_x", @@ -52,7 +53,7 @@ def __init__( self._subset = get_subset_func( self.grid.grid_indexing, dims=[X_DIM, Y_DIM, Z_DIM], - n_halo=((0, 0), (0, 0)), + n_halo=((3, 3), (3, 3)), ) self.ignore_near_zero_errors = {"zh": True, "wsd": True} self.near_zero = 1e-30 @@ -66,6 +67,7 @@ def compute(self, inputs): self.grid.grid_data, self.grid.grid_type, self.namelist.hord_tm, + dz_min=inputs.pop("dz_min"), column_namelist=d_sw.get_column_namelist( self.namelist, quantity_factory=self.grid.quantity_factory ), diff --git a/tests/savepoint/translate/translate_w_fix_consrv_moment.py b/tests/savepoint/translate/translate_w_fix_consrv_moment.py new file mode 100644 index 00000000..6a642bc2 --- /dev/null +++ b/tests/savepoint/translate/translate_w_fix_consrv_moment.py @@ -0,0 +1,71 @@ +from ndsl.dsl.typing import Float +from ndsl.stencils.testing import TranslateFortranData2Py +from ndsl.stencils.testing.grid import Grid +from pyFV3.stencils.w_fix_consrv_moment import W_fix_consrv_moment + + +class 
TranslateW_fix_consrv_moment(TranslateFortranData2Py): + def __init__(self, grid: Grid, namelist, stencil_factory): + super().__init__(grid, stencil_factory) + self.stencil_factory = stencil_factory + self.grid = grid + self.quantity_factory = grid.quantity_factory + + self.compute_func = stencil_factory.from_origin_domain( + func=W_fix_consrv_moment, + origin=grid.compute_origin(), + domain=(grid.nic, 1, grid.npz), + ) + + self.in_vars["data_vars"] = { + "w": { + "kend": grid.npz - 1, + }, + "dp2_W": grid.compute_dict(), + } + + self.in_vars["parameters"] = ["w_max", "w_min"] + + self.out_vars = { + "w": { + "kend": grid.npz - 1, + }, + } + self._gz = self.quantity_factory._numpy.zeros( + ( + grid.nid, + grid.njd, + ), + dtype=Float, + ) + + self._w2 = self.quantity_factory._numpy.zeros( + ( + grid.nid, + grid.njd, + grid.npz, + ), + dtype=Float, + ) + + self._compute_performed = self.quantity_factory._numpy.zeros( + ( + grid.nid, + grid.njd, + ), + dtype=bool, + ) + + def compute_from_storage(self, inputs): + + self.compute_func( + inputs["w"], + self._w2, + inputs["dp2_W"], + self._gz, + inputs["w_max"], + inputs["w_min"], + self._compute_performed, + ) + + return inputs diff --git a/tests/savepoint/translate/translate_xppm.py b/tests/savepoint/translate/translate_xppm.py index 44d8b368..b33325f6 100644 --- a/tests/savepoint/translate/translate_xppm.py +++ b/tests/savepoint/translate/translate_xppm.py @@ -1,5 +1,6 @@ import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.stencils.testing import TranslateGrid from pyFV3.stencils import XPiecewiseParabolic from pyFV3.testing import TranslateDycoreFortranData2Py @@ -14,12 +15,12 @@ def __init__( ): super().__init__(grid, namelist, stencil_factory) self.in_vars["data_vars"] = { - "q": {"serialname": "qx", "jstart": "jfirst"}, - "c": {"serialname": "cx", "istart": grid.is_}, + "q": {"serialname": "xppm_q", "jstart": "jfirst"}, + "c": {"serialname": "xppm_c", "istart": grid.is_}, } self.in_vars["parameters"] = ["iord", "jfirst", "jlast"] self.out_vars = { - "xflux": { + "xppm_flux": { "istart": grid.is_, "iend": grid.ie + 1, "jstart": "jfirst", @@ -40,7 +41,7 @@ def process_inputs(self, inputs): def compute(self, inputs): self.process_inputs(inputs) - inputs["xflux"] = utils.make_storage_from_shape( + inputs["xppm_flux"] = utils.make_storage_from_shape( inputs["q"].shape, backend=self.stencil_factory.backend ) origin = self.grid.grid_indexing.origin_compute() @@ -53,7 +54,7 @@ def compute(self, inputs): origin=(origin[0], int(inputs["jfirst"]), origin[2]), domain=(domain[0], int(inputs["jlast"] - inputs["jfirst"] + 1), domain[2]), ) - self.compute_func(inputs["q"], inputs["c"], inputs["xflux"]) + self.compute_func(inputs["q"], inputs["c"], inputs["xppm_flux"]) return self.slice_output(inputs) @@ -65,5 +66,5 @@ def __init__( stencil_factory: StencilFactory, ): super().__init__(grid, namelist, stencil_factory) - self.in_vars["data_vars"]["q"]["serialname"] = "q" - self.out_vars["xflux"]["serialname"] = "xflux_2" + self.in_vars["data_vars"]["q"]["serialname"] = "xppm_q2" + self.out_vars["xppm_flux"]["serialname"] = "xppm_flux_2" diff --git a/tests/savepoint/translate/translate_xtp_u.py b/tests/savepoint/translate/translate_xtp_u.py index 4a0cbbb6..60f64957 100644 --- a/tests/savepoint/translate/translate_xtp_u.py +++ b/tests/savepoint/translate/translate_xtp_u.py @@ -1,6 +1,7 @@ from gt4py.cartesian.gtscript import PARALLEL, computation, interval 
-from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.dsl.typing import FloatField, FloatFieldIJ from ndsl.grid import GridData from pyFV3.stencils import xtp_u diff --git a/tests/savepoint/translate/translate_yppm.py b/tests/savepoint/translate/translate_yppm.py index 4b9430d0..6d296da2 100644 --- a/tests/savepoint/translate/translate_yppm.py +++ b/tests/savepoint/translate/translate_yppm.py @@ -1,5 +1,6 @@ import ndsl.dsl.gt4py_utils as utils -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.dsl.typing import Float from ndsl.stencils.testing import TranslateGrid from pyFV3.stencils import YPiecewiseParabolic diff --git a/tests/savepoint/translate/translate_ytp_v.py b/tests/savepoint/translate/translate_ytp_v.py index 038f889f..cfb2cc57 100644 --- a/tests/savepoint/translate/translate_ytp_v.py +++ b/tests/savepoint/translate/translate_ytp_v.py @@ -1,6 +1,7 @@ from gt4py.cartesian.gtscript import PARALLEL, computation, interval -from ndsl import Namelist, StencilFactory +from ndsl import StencilFactory +from f90nml import Namelist from ndsl.dsl.typing import FloatField, FloatFieldIJ from ndsl.grid import GridData from pyFV3 import DynamicalCoreConfig diff --git a/tests/script/geos_fp/TEMP/run_yppm_xppm.sh b/tests/script/geos_fp/TEMP/run_yppm_xppm.sh new file mode 100755 index 00000000..9795418c --- /dev/null +++ b/tests/script/geos_fp/TEMP/run_yppm_xppm.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +THIS_DIR=$PWD +TEST_DATA_PATH="../../../../test_data/geos/TEMP_XPPM_YPMM" +mkdir -p $TEST_DATA_PATH +cd $TEST_DATA_PATH + +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/YPPM-In.nc +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/YPPM-Out.nc +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/XPPM-In.nc +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/XPPM-Out.nc +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/input.nml +wget https://portal.nccs.nasa.gov/datashare/astg/smt/geos-fp/translate/11.5.2/x86_GNU/Dycore/TBC_C24_L72_Debug/Grid-Info.nc + + +cd $THIS_DIR +rm -r ./.gt_cache_* + +export PACE_FLOAT_PRECISION=32 +export PACE_CONSTANTS=GEOS +export FV3_DACEMODE=Python + +python -m pytest -v -s -x \ + --data_path=$TEST_DATA_PATH \ + --backend=numpy \ + --which_modules=XPPM,YPPM \ + --multimodal_metric \ + ../../../savepoint
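+ +# The invocation above runs only the XPPM and YPPM savepoint tests against the +# downloaded C24/L72 GEOS data, using the numpy backend with 32-bit floats +# (PACE_FLOAT_PRECISION=32) and GEOS physical constants (PACE_CONSTANTS=GEOS).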