diff --git a/docs/images/rh_l63.png b/docs/images/rh_l63.png new file mode 100644 index 00000000..c2cb54a0 Binary files /dev/null and b/docs/images/rh_l63.png differ diff --git a/docs/images/rh_l63_halfev.png b/docs/images/rh_l63_halfev.png new file mode 100644 index 00000000..41aaf9ab Binary files /dev/null and b/docs/images/rh_l63_halfev.png differ diff --git a/docs/userguide/dartworkflow.rst b/docs/userguide/dartworkflow.rst new file mode 100644 index 00000000..4893eefb --- /dev/null +++ b/docs/userguide/dartworkflow.rst @@ -0,0 +1,231 @@ +.. _dartworkflow: + +================================================================= +An example of using pyDARTdiags with DART and the Lorenz 63 model +================================================================= + +This page will demonstrate how to use pyDARTdiags to enhance your data assimilation projects +with DART by showing you how to manipulate DART observation sequences and analyze DART results. + +We assume you have downloaded and built DART and successfully run the +`Lorenz 63 model `__. + +In this guide, we'll step through: + +* Using pyDARTdiags to change the observations input to DART's filter program +* Using pyDARTdiags to plot rank histograms of the DART Lorenz 63 obs_seq.final files, + both for the original run of filter and for the run with the modified observations + + +Manipulating Observation Sequences with pyDARTdiags +--------------------------------------------------- + +This section focuses on how pyDARTdiags fits into a DART workflow, guiding you through the process +of running a Python program in an interactive session thats uses pyDARTdiags functions to read in +the observation sequence file, modify the observation error variances, and write out a new +observation sequence file with the altered data. + +You will be working through the examples in this guide in the Lorenz 63 model working directory. +Navigate to your DART installation directory and then to the ``DART/models/lorenz_63/work`` +directory in your terminal. + +First, ensure you have pyDARTdiags installed in your Python environment. If you haven't +installed it yet, follow the instructions in the :ref:`installguide`. + +Start an interactive Python session by entering the command ``python`` or ``python3`` +in your terminal. + +Enter the following lines of code one by one. You can copy and paste them from below. + +#. Import the necessary modules. + + .. code-block:: python + + import pydartdiags.obs_sequence.obs_sequence as obsq + +#. Specify the name of the observation sequence file to be read. This will be the + Lorenz 63 ``obs_seq.out`` file that is included in DART and is already located + in your current working directory. + + .. code-block:: python + + file_name = "obs_seq.out" + +#. Read the obs_seq file into an obs_seq object. + + .. code-block:: python + + obs_seq = obsq.ObsSequence(file_name) + +#. Halve the observation error variances. + + .. code-block:: python + + obs_seq.df['obs_err_var'] = obs_seq.df['obs_err_var'] / 2.0 + +#. Create a new file name for the modified observation sequence. + + .. code-block:: python + + output_file = file_name + ".half_error_variance" + +#. Write out the modified observation sequence to a new file. + + .. code-block:: python + + obs_seq.write_obs_seq(output_file) + +#. Exit the interactive session. + + .. code-block:: bash + + exit() + +A new observation sequence file with the modified error variances should be saved in your +current directory. The new file will have be the name of the observation sequence file you +specified with ``.half_error_variance`` appended to the end. + +You can now use this new observation sequence file as input to a new DART data assimilation +experiment with Lorenz 63. Ensure you have made the necessary changes to the &filter_nml +section of the DART ``input.nml`` namelist file to point the ``obs_sequence_in_name`` namelist item +to this new input observation sequence file and give a corresponding name for the output +observation sequence file (i.e ``obs_seq.final.half_error_variance``) in the ``obs_sequence_out_name`` +item. Rerun the filter program. + + +Analyzing DART Results with pyDARTdiags +--------------------------------------- + +You have now completed two DART data assimilation experiments, each producing an +final observation sequence file, or ``obs_seq.final``. This file contains the actual +observations as assimilated as well as the ensemble forward-operator expected values +and any quality-control values. + +You can now use the pyDARTdiags library to read in and analyze the observation +space results of your DART assimilation experiment. In this example, we'll plot the +:ref:`stats-rank-hist` of the DART Lorenz 63 obs_seq.final files. + +The observation sequence files you created for Lorenz 63 contain only identity +observations. You can read about identity observations in +`this section `__ +of the pyDARTdiags documentation. + +Identity observations do not get listed in the header of the observation sequence file; they +are instead given a special value for the observation type (kind) in the observation sequence +to indicate that they are identity observations. They are denoted in a given observation by an +observation type of -x where x is the index in the DART state vector that the observation +corresponds to. + +In the ObsSequence DataFrame, the type of identity observations is stored as this negative integer +of the index in the DART state vector. Therefore, when writing the program to plot the rank +histograms, you will specify the observation type as ``IDENTITY_OBS``. + +Start an interactive Python session by entering the command ``python`` or ``python3`` +in your terminal. + +Enter the following lines of code one by one. You can copy and paste them from below. + +#. Import the obs_sequence module. + + .. code-block:: python + + import pydartdiags.obs_sequence.obs_sequence as obsq + +#. Import the matplots module. + + .. code-block:: python + + import pydartdiags.matplots.matplots as mp + +#. Specify the path to and name of the final observation sequence file from + your first DART data assimilation experiment. + + .. code-block:: python + + file_name = "obs_seq.final" + +#. Specify the path to and name of the final observation sequence file from + your second DART data assimilation experiment. + + .. code-block:: python + + file_name2 = "obs_seq.final.half_error_variance" + + +#. Read the first observation sequence file into an obs_seq object. + + .. code-block:: python + + obs_seq = obsq.ObsSequence(file_name) + +#. Read the second observation sequence file into an obs_seq object. + + .. code-block:: python + + obs_seq_half_ev = obsq.ObsSequence(file_name2) + +#. Choose an observation type to plot on the rank histograms. + + .. code-block:: python + + obs_type = "IDENTITY_OBS" + +#. Set the ensemble size used in your DART experiments. + For the Lorenz 63 model, the default ensemble size is 20. + + .. code-block:: python + + ens_size = 20 + +#. Plot the rank histograms for the first obs_seq.final. + The dataframe has prior and posterior information so both the prior and posterior rank + histograms are plotted. + + Once plotted, save the figure and close it. + + .. code-block:: python + + fig = mp.plot_rank_histogram(obs_seq, obs_type, ens_size) + +#. Plot the rank histograms for the second obs_seq.final with halved error variance. + + Once plotted, save the figure and close it. + + .. code-block:: python + + fig2 = mp.plot_rank_histogram(obs_seq_half_ev, obs_type, ens_size) + +#. Exit the interactive session. + + .. code-block:: bash + + exit() + +You should now have rank histogram plots for both experiments and can now compare the two +to see how the change in observation error variances affected the results of your data +assimilation experiments. Look for differences in the shape of the histograms, which supply +information on the model bias. The results should look like similar to the images below. + +Rank Histogram for First Lorenz 63 Experiment: + +.. image:: ../images/rh_l63.png + :alt: Rank Histogram for First Lorenz 63 Experiment + :width: 800px + :align: center + +Rank Histogram for Second Lorenz 63 Experiment with Halved Error Variance: + +.. image:: ../images/rh_l63_halfev.png + :alt: Rank Histogram for Second Lorenz 63 Experiment with Halved Error Variance + :width: 800px + :align: center + +The rank histogram for the initial experiment should be generally flatter and more evenly spread +across the ranks, indicating a more reliable forecast (the observed distribution is well represented +by the ensemble and all ensemble members represent equally likely scenarios). + +By following this workflow, you have learned how pyDARTdiags can be easily integrated into your +DART data assimilation experiments, allowing you to effectively manipulate observation sequences +and analyze assimilation results. Please refer to the :ref:`userguide` and +:ref:`examples-index` sections of the documentation for more detailed information and additional +examples of using pyDARTdiags. \ No newline at end of file diff --git a/docs/userguide/index.rst b/docs/userguide/index.rst index 7c6d2510..d1bf6cb4 100644 --- a/docs/userguide/index.rst +++ b/docs/userguide/index.rst @@ -13,6 +13,7 @@ User Guide quickstart working-with-obsq statistics + dartworkflow Welcome to the pyDARTdiags User Guide! diff --git a/docs/userguide/installguide.rst b/docs/userguide/installguide.rst index ada1c51f..f6f60a63 100644 --- a/docs/userguide/installguide.rst +++ b/docs/userguide/installguide.rst @@ -1,3 +1,5 @@ +.. _installguide: + ============= Install Guide ============= diff --git a/docs/userguide/statistics.rst b/docs/userguide/statistics.rst index efb7c429..241374e2 100644 --- a/docs/userguide/statistics.rst +++ b/docs/userguide/statistics.rst @@ -110,6 +110,9 @@ are calculated as follows: .. math:: \sigma_{T,s} \equiv \sqrt{\frac{1}{N} \sum_{n=1}^{N} (\sigma^2_u + \sigma^2_v + \epsilon^2_u + \epsilon^2_v )} + +.. _stats-rank-hist: + Rank Histogram -------------- diff --git a/examples/03_diagnostics/plot_rank_histogram.py b/examples/03_diagnostics/plot_rank_histogram.py index b84e2730..892e702e 100644 --- a/examples/03_diagnostics/plot_rank_histogram.py +++ b/examples/03_diagnostics/plot_rank_histogram.py @@ -51,4 +51,4 @@ # The dataframe has prior and posterior information so both the prior # and posterior rank histograms are plotted. ens_size = 80 -fig = mp.plot_rank_histogram(obs_seq, levels, obs_type, ens_size) +fig = mp.plot_rank_histogram(obs_seq, obs_type, ens_size, levels) diff --git a/src/pydartdiags/matplots/matplots.py b/src/pydartdiags/matplots/matplots.py index 8b086755..76bd65d8 100644 --- a/src/pydartdiags/matplots/matplots.py +++ b/src/pydartdiags/matplots/matplots.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from pydartdiags.stats import stats import matplotlib.pyplot as plt +import pandas as pd # HK @todo color scheme class dacolors = ["green", "magenta", "orange", "red"] @@ -10,7 +11,7 @@ def plot_profile( obs_seq, levels, type, bias=True, rmse=True, totalspread=True, depth=False ): """ - plot_profile on the levels for prior and posterior if present + plot_profile on the levels for prior and posterior if present - bias - rmse - totalspread @@ -227,33 +228,47 @@ def plot_profile( return fig -def plot_rank_histogram(obs_seq, levels, type, ens_size): +def plot_rank_histogram(obs_seq, type, ens_size, levels=None): + """ + Plot the rank histograms of the requested observation type, ensemble + size, and levels (if applicable). Rank histograms are plotted for prior + and posterior if present. + + Args: + obs_seq: The observation sequence object. + type (str): The type of observation to filter by. For identity + observations, use "IDENTITY_OBS" or a negative integer + ens_size (int): The ensemble size. + levels (list, optional): The levels to bin by. If None, no binning by level. + + Returns: + fig: The matplotlib figure object. + """ qc0 = stats.select_used_qcs(obs_seq.df) # filter only qc=0, qc=2 - qc0 = qc0[qc0["type"] == type] # filter by type - if qc0.empty: - print(f"No rows found for type: {type}") - return None - if qc0["vert_unit"].nunique() > 1: + if (isinstance(type, int) and type < 0) or (type == "IDENTITY_OBS"): + type = "IDENTITY_OBS" print( - f"Multiple vertical units found in the data: {qc0['vert_unit'].unique()} for type: {type}" - ) - return None + "Observation type is for identity observations." + ) # Filter on types < 0 to get identity observations - vert_unit = qc0.iloc[0]["vert_unit"] - conversion, unit = _get_plot_unit(vert_unit) # multiplier and unit for y-axis - - stats.bin_by_layer(qc0, levels, verticalUnit=vert_unit) # bin by level + # Only keep rows where 'type' is numeric before comparing + qc0 = qc0[pd.to_numeric(qc0["type"], errors="coerce").notnull()] + qc0 = qc0[qc0["type"].astype(int) < 0] + if qc0.empty: + print(f"No rows found for IDENTITY_OBS") + return None - midpoints = qc0["midpoint"].unique() + else: + qc0 = qc0[qc0["type"] == type] # filter by type - for level in sorted(midpoints): + if qc0.empty: + print(f"No rows found for type: {type}") + return None - df = qc0[qc0["midpoint"] == level] - # convert to hPa only for Pressure (Pa) - df["midpoint"] = df["midpoint"].astype(float) - df["midpoint"] = df["midpoint"] * conversion + if levels is None: + print(f"No levels given. Proceeding without level binning.") df = stats.calculate_rank(qc0) @@ -284,6 +299,62 @@ def plot_rank_histogram(obs_seq, levels, type, ens_size): ax2.set_xlabel("Observation Rank (among ensemble members)") ax2.set_ylabel("Count") + fig.suptitle(f"{type}", fontsize=14) + + plt.tight_layout(rect=[0, 0.03, 1, 0.95]) + plt.show() + return None + + elif qc0["vert_unit"].nunique() > 1: + print( + f"Multiple vertical units found in the data: {qc0['vert_unit'].unique()} for type: {type}" + ) + return None + + else: + vert_unit = qc0.iloc[0]["vert_unit"] + conversion, unit = _get_plot_unit(vert_unit) # multiplier and unit for y-axis + + stats.bin_by_layer(qc0, levels, verticalUnit=vert_unit) # bin by level + + midpoints = qc0["midpoint"].unique() + + for level in sorted(midpoints): + + df = qc0[qc0["midpoint"] == level] + # convert to hPa only for Pressure (Pa) + df["midpoint"] = df["midpoint"].astype(float) + df["midpoint"] = df["midpoint"] * conversion + + df = stats.calculate_rank(qc0) + + if "posterior_rank" in df.columns: + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6)) + else: + fig, ax1 = plt.subplots() + + # Plot the prior rank histogram + bins = list(range(1, ens_size + 2)) + ax1.hist( + df["prior_rank"], bins=bins, color="blue", alpha=0.5, label="prior rank" + ) + ax1.set_title("Prior Rank Histogram") + ax1.set_xlabel("Observation Rank (among ensemble members)") + ax1.set_ylabel("Count") + + # Plot the posterior rank histogram if it exists + if "posterior_rank" in df.columns: + ax2.hist( + df["posterior_rank"], + bins=bins, + color="green", + alpha=0.5, + label="posterior rank", + ) + ax2.set_title("Posterior Rank Histogram") + ax2.set_xlabel("Observation Rank (among ensemble members)") + ax2.set_ylabel("Count") + fig.suptitle(f"{type} at Level {round(level, 1)} {unit}", fontsize=14) plt.tight_layout(rect=[0, 0.03, 1, 0.95])