From ec845a60ffad1b11e8beac711d0602328153f04f Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <1116746+romanc@users.noreply.github.com>
Date: Fri, 8 Aug 2025 08:52:06 +0200
Subject: [PATCH 1/4] Update gt4py: support for literal precision

---
 external/gt4py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/gt4py b/external/gt4py
index 68eea74b..c678c31c 160000
--- a/external/gt4py
+++ b/external/gt4py
@@ -1 +1 @@
-Subproject commit 68eea74b748747ac5415c93e479d7964f3ec6947
+Subproject commit c678c31c111a15fa61743d2dc3d47cc4af7778d4

From 4e5946ca7b1004e80c68720bb84d929c7f3ae566 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <1116746+romanc@users.noreply.github.com>
Date: Mon, 11 Aug 2025 09:28:22 +0200
Subject: [PATCH 2/4] Actually forwarding literal precision to gt4py

---
 ndsl/dsl/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py
index b30d0a35..5fa508d2 100644
--- a/ndsl/dsl/__init__.py
+++ b/ndsl/dsl/__init__.py
@@ -41,7 +41,8 @@ def _get_literal_precision(default: Literal["32", "64"] = "64") -> Literal["32",
 
 
 NDSL_GLOBAL_PRECISION = int(_get_literal_precision())
-os.environ["GT4PY_LITERAL_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
+os.environ["GT4PY_LITERAL_INT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
+os.environ["GT4PY_LITERAL_FLOAT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
 
 
 # Set cache names for default gt backends workflow

From ed5187708b5b35f7f714946af805fc6fd597ec6c Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <1116746+romanc@users.noreply.github.com>
Date: Mon, 11 Aug 2025 09:28:52 +0200
Subject: [PATCH 3/4] Exposing type casts and new math functions

---
 ndsl/dsl/gt4py/__init__.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ndsl/dsl/gt4py/__init__.py b/ndsl/dsl/gt4py/__init__.py
index 3ae0dbd8..5843afb8 100644
--- a/ndsl/dsl/gt4py/__init__.py
+++ b/ndsl/dsl/gt4py/__init__.py
@@ -26,12 +26,18 @@
     computation,
     cos,
     cosh,
+    erf,
+    erfc,
     exp,
     externals,
+    float32,
+    float64,
     floor,
     function,
     gamma,
     horizontal,
+    int32,
+    int64,
     interval,
     isfinite,
     isinf,
@@ -80,12 +86,18 @@
     "computation",
     "cos",
     "cosh",
+    "erf",
+    "erfc",
     "exp",
     "externals",
+    "float32",
+    "float64",
     "floor",
     "function",
     "gamma",
     "horizontal",
+    "int32",
+    "int64",
     "interval",
     "isfinite",
     "isinf",

From 6b57b59a41759763623baa6db7aea5b413de4d56 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <1116746+romanc@users.noreply.github.com>
Date: Mon, 11 Aug 2025 10:45:20 +0200
Subject: [PATCH 4/4] Documentation update

---
 docs/index.md      | 101 +++------------------------------------------
 docs/quickstart.md |  36 ++++++++++++++++
 docs/user/index.md |  48 +++++++++++++++++++++
 mkdocs.yml         |   5 +++
 4 files changed, 95 insertions(+), 95 deletions(-)
 create mode 100644 docs/quickstart.md

diff --git a/docs/index.md b/docs/index.md
index a87616b0..779a8781 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,102 +1,13 @@
 # NDSL Documentation
 
-NDSL allows atmospheric scientists to write focus on what matters in model development and hides away the complexities of coding for a super computer.
+NDSL is a middleware for climate and weather modelling developed jointly by NOAA and NASA. It allows atmospheric scientists to focus on what matters in model development and essentially decouples performance engineering from model development.
 
-## Quick Start
+## Portable performance
 
-Python `3.11.x` is required for NDSL and all its third party dependencies for installation.
+NDSL brings together [GT4Py](https://github.com/GridTools/gt4py/) and [DaCe](https://github.com/spcl/dace/), two libraries developed for high performance and portability. On top of those pillars, NDSL deploys a series of optimized APIs for common operations, e.g. halo exchange or domain decomposition, and tools to port existing models.
 
-NDSL submodules `gt4py` and `dace` to point to vetted versions, use `git clone --recurse-submodule` to update the git submodules.
+## Batteries-included for FV-based models
 
-NDSL is **NOT** available on `pypi`. Installation of the package has to be local, via `pip install ./NDSL` (`-e` supported). The packages have a few options:
+Historically, NDSL was developed to port the FV3 dynamical core on the cubed-sphere. Therefore, the middleware ships with ready-to-execute specialization for models based on cubed-sphere grids and FV-based models in particular.
 
-- `ndsl[test]`: installs the test packages (based on `pytest`)
-- `ndsl[develop]`: installs tools for development and tests.
-
-NDSL uses pytest for its unit tests, the tests are available via:
-
-- `pytest -x test`: running CPU serial tests (GPU as well if `cupy` is installed)
-- `mpirun -np 6 pytest -x test/mpi`: running CPU parallel tests (GPU as well if `cupy` is installed)
-
-## Requirements & supported compilers
-
-For CPU backends:
-
-- 3.11.x >= Python < 3.12.x
-- Compilers:
-  - GNU 11.2+
-
-For GPU backends (the above plus):
-
-- CUDA 11.2+
-- Python package:
-  - `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))
-- Libraries:
-  - MPI compiled with cuda support
-
-## NDSL installation and testing
-
-NDSL is not available at `pypi`, it uses
-
-```bash
-pip install NDSL
-```
-
-to install NDSL locally.
-
-NDSL has a few options:
-
-- `ndsl[test]`: installs the test packages (based on `pytest`)
-- `ndsl[develop]`: installs tools for development and tests.
-
-Tests are available via:
-
-- `pytest -x test`: running CPU serial tests (GPU as well if `cupy` is installed)
-- `mpirun -np 6 pytest -x test/mpi`: running CPU parallel tests (GPU as well if `cupy` is installed)
-
-## Configurations for Pace
-
-Configurations for Pace to use NDSL with different backend:
-
-- FV3_DACEMODE=Python[Build|BuildAndRun|Run] controls the full program optimizer behavior
-
-  - Python: default, use stencil only, no full program optimization
-
-  - Build: will build the program then exit. This _build no matter what_. (backend must be `dace:gpu` or `dace:cpu`)
-
-  - BuildAndRun: same as above but after build the program will keep executing (backend must be `dace:gpu` or `dace:cpu`)
-
-  - Run: load pre-compiled program and execute, fail if the .so is not present (_no hash check!_) (backend must be `dace:gpu` or `dace:cpu`)
-
-- NDSL_LITERAL_PRECISION=64 controls the floating point precision throughout the program.
-
-Install Pace with different NDSL backend:
-
-- Shell scripts to install Pace using NDSL backend on specific machines such as Gaea can be found in `examples/build_scripts/`.
-- When cloning Pace you will need to update the repository's submodules as well:
-
-```bash
-git clone --recursive https://github.com/ai2cm/pace.git
-```
-
-  or if you have already cloned the repository:
-
-```bash
-git submodule update --init --recursive
-```
-
-- Pace requires GCC > 9.2, MPI, and Python 3.8 on your system, and CUDA is required to run with a GPU backend.
-- We recommend creating a python `venv` or conda environment specifically for Pace.
-
-```bash
-python3 -m venv venv_name
-source venv_name/bin/activate
-```
-
-- Inside of your pace `venv` or conda environment pip install the Python requirements, GT4Py, and Pace:
-
-```bash
-pip3 install -r requirements_dev.txt -c constraints.txt
-```
-
-- There are also separate requirements files which can be installed for linting (`requirements_lint.txt`) and building documentation   (`requirements_docs.txt`).
+Next: get [up and running](./quickstart.md).
diff --git a/docs/quickstart.md b/docs/quickstart.md
new file mode 100644
index 00000000..a0125181
--- /dev/null
+++ b/docs/quickstart.md
@@ -0,0 +1,36 @@
+# Quickstart
+
+Alright - let's get you up and running!
+
+NDSL requires Python version `3.11` and a GNU compiler. We strongly recommend using a conda or virtual environment.
+
+```shell
+# We have submodules for GT4Py and DaCe. Don't forget to pull them
+git clone --recurse-submodules git@github.com:NOAA-GFDL/NDSL.git
+
+cd NDSL/
+
+# We strongly recommend using conda or a virtual environment
+python -m venv .venv/
+source .venv/bin/activate
+
+# [optional] Install MPI if you don't have a system installation.
+pip install openmpi
+
+# Finally, install NDSL
+pip install .[demos]
+```
+
+Now you can run through the Jupyter notebooks in `examples/NDSL` :rocket:.
+
+Read on in the [user manual](./user/index.md).
+
+!!! note "Supported compilers"
+
+    NDSL currently only works with the GNU compiler. Using `clang` will result in errors related to undefined OpenMP flags.
+
+    For MacOS users, we know that `gcc` version 14 from homebrew works.
+
+!!! question "Why clone the repository?"
+
+    We are cloning the repository because NDSL is not available on `pypi`.
diff --git a/docs/user/index.md b/docs/user/index.md
index 292d3953..de57ad93 100644
--- a/docs/user/index.md
+++ b/docs/user/index.md
@@ -1,3 +1,51 @@
 # Usage documentation
 
 This part of the documentation is geared towards users of NDSL.
+
+## Up and running
+
+See our [quickstart guide](../quickstart.md) on how to get up and running.
+
+## Configuration
+
+NDSL tries to have sensible defaults. In case you want to tweak something, here are some pointers:
+
+### Literal precision (float/int)
+
+Unspecified integer and floating point literals (e.g. `42` and `3.1415`) default to 64-bit precision. This can be changed with the environment variable `PACE_FLOAT_PRECISION`.
+
+For mixed precision code, you can specify the "hard coded" precision with type hints and casts, e.g.
+
+```python
+with computation(PARALLEL), interval(...):
+    # Either 32-bit or 64-bit depending on `PACE_FLOAT_PRECISION`
+    my_int = 42
+    my_float = 3.1415
+
+    # Always 32-bit
+    my_int32: int32 = 42
+    my_float32: float32 = 3.1415
+
+    # Explicit 64-bit cast within otherwise unspecified calculation
+    factor = 0.5 * float64(3.1415 + 2.71828)
+```
+
+### Full program optimizer
+
+The behavior of the full program optimizer is controlled by `FV3_DACEMODE`. Valid values are:
+
+`Python`
+
+:   The default. Disables full program optimization and only accelerates stencil code.
+
+`Build`
+
+:   Build the program, then exit. This mode is only available for backends `dace:gpu` and `dace:cpu`.
+
+`BuildAndRun`
+
+:   Build the program, then run it immediately. This mode is only available for backends `dace:gpu` and `dace:cpu`.
+
+`Run`
+
+:   Load a pre-compiled program and run it. Fails if the pre-compiled program cannot be found. This mode is only available for backends `dace:gpu` and `dace:cpu`.
diff --git a/mkdocs.yml b/mkdocs.yml
index e931b342..7f9f4f70 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,6 +9,7 @@ theme:
 
 nav:
   - Home: index.md
+  - Quickstart: quickstart.md
   - User documentation: user/index.md
   - Porting:
       - General Concepts: porting/index.md
@@ -24,8 +25,12 @@ markdown_extensions:
   - abbr
   # support for colored notes / warnings / tips / examples
   - admonition
+  # support for "definition lists" (<dl>)
+  - def_list
   # support for footnotes
   - footnotes
+  # support for emojis
+  - pymdownx.emoji
   # support for syntax highlighting
   - pymdownx.highlight:
       anchor_linenums: true