diff --git a/src/CSET/cset_workflow/flow.cylc b/src/CSET/cset_workflow/flow.cylc index ddf05ecfe..ec042e9e1 100644 --- a/src/CSET/cset_workflow/flow.cylc +++ b/src/CSET/cset_workflow/flow.cylc @@ -101,6 +101,7 @@ final cycle point = {{CSET_TRIAL_END_DATE}} {% if SKIP_WRITE|default(False) %} SKIP_WRITE = True {% endif %} + HISTOGRAM_METHOD = {{HISTOGRAM_METHOD|default("frequency")}} [[PROCESS]] diff --git a/src/CSET/cset_workflow/meta/rose-meta.conf b/src/CSET/cset_workflow/meta/rose-meta.conf index 99a67b25a..07feab990 100644 --- a/src/CSET/cset_workflow/meta/rose-meta.conf +++ b/src/CSET/cset_workflow/meta/rose-meta.conf @@ -180,6 +180,28 @@ type=python_boolean compulsory=true sort-key=setup-h-out4 +[template variables=HISTOGRAM_METHOD] +ns=Setup +title=Histogram method +description=Histogram method to use. +help=This selects the method used to compute histograms. The options are: + + "Density" Computes the density histogram, i.e. the total area + under the histogram is 1. This is the closest + approximation to the continuous probability density + function. This histogram shape is also invariant to + the bin width. + + "Frequency" Computes the frequency histogram. This plots the + number of counts in each bin. + + "Normalised frequency" Plots the number of counts in each bin normalised by the + total number of counts. The sum of the counts is 1. +values="density", "frequency", "normalised_frequency" +value-titles=Density, Frequency, Normalised frequency +compulsory=true +sort-key=setup-h-out5 + [template variables=HOUSEKEEPING_MODE] ns=Setup title=Housekeeping mode diff --git a/src/CSET/cset_workflow/meta/rose-meta.conf.jinja2 b/src/CSET/cset_workflow/meta/rose-meta.conf.jinja2 index a2ec063ac..71273ba3f 100644 --- a/src/CSET/cset_workflow/meta/rose-meta.conf.jinja2 +++ b/src/CSET/cset_workflow/meta/rose-meta.conf.jinja2 @@ -180,6 +180,28 @@ type=python_boolean compulsory=true sort-key=setup-h-out4 +[template variables=HISTOGRAM_METHOD] +ns=Setup +title=Histogram method +description=Histogram method to use. +help=This selects the method used to compute histograms. The options are: + + "Density" Computes the density histogram, i.e. the total area + under the histogram is 1. This is the closest + approximation to the continuous probability density + function. This histogram shape is also invariant to + the bin width. + + "Frequency" Computes the frequency histogram. This plots the + number of counts in each bin. + + "Normalised frequency" Plots the number of counts in each bin normalised by the + total number of counts. The sum of the counts is 1. +values="density", "frequency", "normalised_frequency" +value-titles=Density, Frequency, Normalised frequency +compulsory=true +sort-key=setup-h-out5 + [template variables=HOUSEKEEPING_MODE] ns=Setup title=Housekeeping mode diff --git a/src/CSET/cset_workflow/opt/rose-suite-RAL3LFRIC.conf b/src/CSET/cset_workflow/opt/rose-suite-RAL3LFRIC.conf index b4dedd837..2a73d28a4 100644 --- a/src/CSET/cset_workflow/opt/rose-suite-RAL3LFRIC.conf +++ b/src/CSET/cset_workflow/opt/rose-suite-RAL3LFRIC.conf @@ -20,6 +20,7 @@ DETERMINISTIC_PLOT_CAPE_RATIO=False DETERMINISTIC_PLOT_INFLOW_PROPERTIES=False EXTRACT_MLEVEL_TRANSECT=False EXTRACT_PLEVEL_TRANSECT=False +HISTOGRAM_METHOD="density" HISTOGRAM_MLEVEL_FIELD=False HISTOGRAM_MLEVEL_FIELD_AGGREGATION=False,False,False,False HISTOGRAM_MLEVEL_FIELD_SEQUENCE=False diff --git a/src/CSET/cset_workflow/rose-suite.conf.example b/src/CSET/cset_workflow/rose-suite.conf.example index 2c6b93a72..f5899f6da 100644 --- a/src/CSET/cset_workflow/rose-suite.conf.example +++ b/src/CSET/cset_workflow/rose-suite.conf.example @@ -22,6 +22,7 @@ DETERMINISTIC_PLOT_CAPE_RATIO=False DETERMINISTIC_PLOT_INFLOW_PROPERTIES=False EXTRACT_MLEVEL_TRANSECT=False EXTRACT_PLEVEL_TRANSECT=False +HISTOGRAM_METHOD="density" HISTOGRAM_MLEVEL_FIELD=False HISTOGRAM_MLEVEL_FIELD_AGGREGATION=False,False,False,False HISTOGRAM_MLEVEL_FIELD_SEQUENCE=False diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 499ca7e7a..d2c265d70 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -1062,13 +1062,107 @@ def _plot_and_save_vector_plot( plt.close(fig) +def _plot_histogram( + data, method: str = "density", title: str = "", units: str = "", **kwargs +) -> dict: + """Plot a histogram using a specified method to compute the histogram. + + Histograms can be computed using any of three different methods: + + frequency - This is constructed from the raw counts within each bin. + + normalised_frequency - The frequency counts are normalised so that the total + counts is 1. + + density - The counts per bin are normalised by both the total + counts and the bin widths. The sum of the areas of the + bins is 1. This is the discrete sampling analogue of + the continuous probability density function. + + This function returns a dictionary of plotting labels to use on the histogram plot. + Keys available: + + "ylabel" - label for the y-axis indicating the method used to compute the histogram. + + + It is possible to specify both the range of data values to consider for the + histogram and the number of bins to use. Using these options will set the bin + width to ( value_maximum - value_minimum ) / number of bins. See the documentation + for matplotlib.pyplot.hist for more details at: + https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html + + Parameters + ---------- + data: array + Input values of the data to use for computing the histogram. + method: "frequency" | "normalised_frequency" | "density", optional + The histogram method to use. Default is "density". + title: str + Title of the histogram. + units: str + Units of the data. + **kwargs: optional keyword arguments used by matplotlib.pyplot.hist + The keywords "density" and "weights" are set by the input "method", so + values should not be assigned to these keywords in the call to this function. + """ + # Define the parameter settings in matplotlib.pyplot.hist + # required to implement the histogram methods "density", + # "normalised frequency" and "frequency". + hist_method = { + "density": {"method_args": {"density": True}, "labels": {"ylabel": "Density"}}, + "normalised_frequency": { + "method_args": { + "density": False, + "weights": 1.0 / len(data) * np.ones(len(data)), + }, + "labels": {"ylabel": "Normalised frequency"}, + }, + "frequency": { + "method_args": {"density": False}, + "labels": {"ylabel": "Frequency"}, + }, + } + + # Grab the parameter settings required for the histogram method + try: + args_method = hist_method[method]["method_args"] + except KeyError as err: + raise ValueError( + f"Unrecognised histogram method: {method}\n" + f"Method should be one of {', '.join(hist_method.keys())}" + ) from err + + # If surface microphysical precipitation amounts calculate different histogram. + if "surface_microphysical" in title and "amount" in title: + bins = kwargs["bins"] + x, y = np.histogram( + data, bins=bins, density=args_method["method_args"]["density"] + ) + bin_mean = (bins[:-1] + bins[1:]) / 2.0 + x = x * bin_mean / x.sum() + x = x[1:] + y = y[1:] + + ax = plt.gca() + ax.plot(y[:-1], x, marker="o", markersize=6, **kwargs) + + args_method["labels"]["ylabel"] = f"Contribution to mean ({units})" + else: + # Use the function matplotlib.pyplot.hist to plot the histogram. + plt.hist(data, **args_method, **kwargs) + + # Return from this function _plot_histogram, passing back the + # label to print on the y-axis of the histogram. + return hist_method[method]["labels"] + + def _plot_and_save_histogram_series( cubes: iris.cube.Cube | iris.cube.CubeList, filename: str, title: str, vmin: float, vmax: float, - **kwargs, + histogram_method: str, ): """Plot and save a histogram series. @@ -1084,16 +1178,14 @@ def _plot_and_save_histogram_series( minimum for colorbar vmax: float maximum for colorbar + histogram_method: str + Histogram method to use i.e. frequency, normalised_frequency or density. """ fig = plt.figure(figsize=(10, 10), facecolor="w", edgecolor="k") ax = plt.gca() model_colors_map = _get_model_colors_map(cubes) - # Set default that histograms will produce probability density function - # at each bin (integral over range sums to 1). - density = True - for cube in iter_maybe(cubes): # Easier to check title (where var name originates) # than seeing if long names exist etc. @@ -1103,7 +1195,6 @@ def _plot_and_save_histogram_series( # Compute histogram following Klingaman et al. (2017): ASoP bin2 = np.exp(np.log(0.02) + 0.1 * np.linspace(0, 99, 100)) bins = np.pad(bin2, (1, 0), "constant", constant_values=0) - density = False else: bins = 10.0 ** ( np.arange(-10, 27, 1) / 10.0 @@ -1133,30 +1224,26 @@ def _plot_and_save_histogram_series( if model_colors_map: label = cube.attributes.get("model_name") color = model_colors_map[label] - x, y = np.histogram(cube_data_1d, bins=bins, density=density) - - # Compute area under curve. - if "surface_microphysical" in title and "amount" in title: - bin_mean = (bins[:-1] + bins[1:]) / 2.0 - x = x * bin_mean / x.sum() - x = x[1:] - y = y[1:] - - ax.plot( - y[:-1], x, color=color, linewidth=3, marker="o", markersize=6, label=label + # Plot the histogram.x, y = np.histogram(cube_data_1d, bins=bins, density=density) + y = _plot_histogram( + cube_data_1d, + method=histogram_method, + title=title, + units=cube.units, + bins=bins, + color=color, + linewidth=2, + histtype="step", + label=label, ) # Add some labels and tweak the style. - ax.set_title(title, fontsize=16) - ax.set_xlabel( - f"{iter_maybe(cubes)[0].name()} / {iter_maybe(cubes)[0].units}", fontsize=14 + ax.set( + title=title, + xlabel=f"{iter_maybe(cubes)[0].name()} / {iter_maybe(cubes)[0].units}", + ylabel=y["ylabel"], + xlim=(vmin, vmax), ) - ax.set_ylabel("Normalised probability density", fontsize=14) - if "surface_microphysical" in title and "amount" in title: - ax.set_ylabel( - f"Contribution to mean ({iter_maybe(cubes)[0].units})", fontsize=14 - ) - ax.set_xlim(vmin, vmax) ax.tick_params(axis="both", labelsize=12) # Overlay grid-lines onto histogram plot. @@ -1177,7 +1264,7 @@ def _plot_and_save_postage_stamp_histogram_series( stamp_coordinate: str, vmin: float, vmax: float, - **kwargs, + histogram_method: str, ): """Plot and save postage (ensemble members) stamps for a histogram series. @@ -1195,6 +1282,9 @@ def _plot_and_save_postage_stamp_histogram_series( minimum for pdf x-axis vmax: float maximum for pdf x-axis + histogram_method: str + Histogram method to use i.e. frequency, normalised_frequency or density. + """ # Use the smallest square grid that will fit the members. grid_size = int(math.ceil(math.sqrt(len(cube.coord(stamp_coordinate).points)))) @@ -1230,7 +1320,7 @@ def _plot_and_save_postage_stamps_in_single_plot_histogram_series( stamp_coordinate: str, vmin: float, vmax: float, - **kwargs, + histogram_method: str, ): fig, ax = plt.subplots(figsize=(10, 10), facecolor="w", edgecolor="k") ax.set_title(title, fontsize=16) @@ -1914,7 +2004,7 @@ def scatter_plot( def vector_plot( cube_u: iris.cube.Cube, cube_v: iris.cube.Cube, - filename: str = None, + filename: str | None = None, sequence_coordinate: str = "time", **kwargs, ) -> iris.cube.CubeList: @@ -1969,10 +2059,11 @@ def vector_plot( def plot_histogram_series( cubes: iris.cube.Cube | iris.cube.CubeList, - filename: str = None, + filename: str | None = None, sequence_coordinate: str = "time", stamp_coordinate: str = "realization", single_plot: bool = False, + histogram_method: str = "frequency", **kwargs, ) -> iris.cube.Cube | iris.cube.CubeList: """Plot a histogram plot for each vertical level provided. @@ -2005,6 +2096,8 @@ def plot_histogram_series( If True, all postage stamp plots will be plotted in a single plot. If False, each postage stamp plot will be plotted separately. Is only valid if stamp_coordinate exists and has more than a single point. + histogram_method: str + Histogram method to use i.e. frequency, normalised_frequency or density. Returns ------- @@ -2137,6 +2230,7 @@ def plot_histogram_series( title=title, vmin=vmin, vmax=vmax, + histogram_method=histogram_method, ) plot_index.append(plot_filename) diff --git a/src/CSET/recipes/generic_level_histogram_series.yaml b/src/CSET/recipes/generic_level_histogram_series.yaml index c6275b07d..e0f16a474 100644 --- a/src/CSET/recipes/generic_level_histogram_series.yaml +++ b/src/CSET/recipes/generic_level_histogram_series.yaml @@ -34,6 +34,7 @@ steps: # stamp_coordinate and single_plot optional and only required for ensemble data stamp_coordinate: "realization" single_plot: False + histogram_method: $HISTOGRAM_METHOD - operator: write.write_cube_to_nc overwrite: True diff --git a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_all.yaml b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_all.yaml index a82b5c0d9..09eaca7d8 100644 --- a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_all.yaml +++ b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_all.yaml @@ -35,6 +35,8 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: realization + histogram_method: $HISTOGRAM_METHOD + - operator: write.write_cube_to_nc overwrite: True diff --git a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_hour_of_day.yaml b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_hour_of_day.yaml index 23dc7a59b..79ae3d977 100644 --- a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_hour_of_day.yaml +++ b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_hour_of_day.yaml @@ -41,6 +41,7 @@ steps: # stamp_coordinate and single_plot optional and only required for ensemble data stamp_coordinate: "realization" single_plot: False + histogram_method: $HISTOGRAM_METHOD - operator: write.write_cube_to_nc overwrite: True diff --git a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_lead_time.yaml b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_lead_time.yaml index 93cd960bf..78c8e0732 100644 --- a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_lead_time.yaml +++ b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_lead_time.yaml @@ -38,6 +38,7 @@ steps: # stamp_coordinate and single_plot optional and only required for ensemble data stamp_coordinate: "realization" single_plot: False + histogram_method: $HISTOGRAM_METHOD - operator: write.write_cube_to_nc overwrite: True diff --git a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_validity_time.yaml b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_validity_time.yaml index 611e929c2..5eb6034e7 100644 --- a/src/CSET/recipes/generic_level_histogram_series_case_aggregation_validity_time.yaml +++ b/src/CSET/recipes/generic_level_histogram_series_case_aggregation_validity_time.yaml @@ -38,6 +38,7 @@ steps: # stamp_coordinate and single_plot optional and only required for ensemble data stamp_coordinate: "realization" single_plot: False + histogram_method: $HISTOGRAM_METHOD - operator: write.write_cube_to_nc overwrite: True diff --git a/src/CSET/recipes/generic_surface_histogram_series.yaml b/src/CSET/recipes/generic_surface_histogram_series.yaml index 0ded153ca..8758910b2 100644 --- a/src/CSET/recipes/generic_surface_histogram_series.yaml +++ b/src/CSET/recipes/generic_surface_histogram_series.yaml @@ -1,11 +1,22 @@ category: Histogram title: Histogram $VARNAME description: | - Extracts and plots the probability density of surface `$VARNAME`. It uses - [`plt.hist`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html). + Extracts and plots a histogram for `$VARNAME`. + + It uses [`plt.hist`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html). + Three methods are possible for constructing the histogram (a) `frequency` histogram plots + the raw frequency counts in each bin (b) for a `normalised frequency` plot, the + bins are normalised by the total counts so the plotted frequencies sum to 1 and + (c) a `density` histogram is the closest analogue + to the probability density function of the data with the bins being normalised by both + the total counts and the bin widths. The area under a density histogram integrates to 1. + + Some fields may exhibit numerical beats as a consequence of the low precision used to archive + the data. These beats will manifest as seemingly spurious, regular peaks in the plotted histogram. + For example the cloud area fractions from the UM are stored using 6-bit precision + which allows just 64 values over the range [0, 1] and plotting histograms using 50 bins generates + numerical beats. - The default method generates a probability density so that the area under the histogram - normalized to 1. Histograms of rainfall or snowfall rate are plotted using a logarithmic scale. Histograms of rainfall or snowfall amount are based on the [Klingaman et al. 2017](https://gmd.copernicus.org/articles/10/57/2017/gmd-10-57-2017.html) @@ -37,3 +48,4 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: $SEQUENCE + histogram_method: $HISTOGRAM_METHOD diff --git a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_all.yaml b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_all.yaml index 801a8e620..da245ed44 100644 --- a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_all.yaml +++ b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_all.yaml @@ -39,3 +39,4 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: realization + histogram_method: $HISTOGRAM_METHOD diff --git a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_hour_of_day.yaml b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_hour_of_day.yaml index 1685dd932..3091c8aaa 100644 --- a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_hour_of_day.yaml +++ b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_hour_of_day.yaml @@ -42,3 +42,4 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: hour + histogram_method: $HISTOGRAM_METHOD diff --git a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_lead_time.yaml b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_lead_time.yaml index 7bb0e2d4d..d128ef2c4 100644 --- a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_lead_time.yaml +++ b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_lead_time.yaml @@ -39,3 +39,4 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: forecast_period + histogram_method: $HISTOGRAM_METHOD diff --git a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_validity_time.yaml b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_validity_time.yaml index bcfed7647..7a88320a2 100644 --- a/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_validity_time.yaml +++ b/src/CSET/recipes/generic_surface_histogram_series_case_aggregation_validity_time.yaml @@ -39,3 +39,4 @@ steps: - operator: plot.plot_histogram_series sequence_coordinate: time + histogram_method: $HISTOGRAM_METHOD diff --git a/tests/operators/test_plot.py b/tests/operators/test_plot.py index 53f7604e7..9299f1d26 100644 --- a/tests/operators/test_plot.py +++ b/tests/operators/test_plot.py @@ -691,6 +691,52 @@ def test_plot_and_save_histogram_series_bins_lightning( assert Path("test.png").is_file() +def test_get_histogram_method(tmp_working_dir): + """Test getting the histogram method.""" + with open("meta.json", "wt", encoding="UTF-8") as fp: + fp.write('{"histogram_method": "density"}') + method = plot._get_histogram_method() + assert method == "density" + + +def test_plot_histogram(histogram_cube, tmp_working_dir): + """Plot a histogram using the normalised_frequency method using plot._plot_histogram.""" + cube_data_1d = (histogram_cube.data).flatten() + + fig = mpl.pyplot.figure(figsize=(10, 10), facecolor="w", edgecolor="k") + y = plot._plot_histogram( + cube_data_1d, + method="normalised_frequency", + bins=50, + color="black", + linewidth=2, + histtype="step", + label="histogram", + ) + fig.savefig("test_plot_histogram.png", bbox_inches="tight", dpi=100) + assert y["ylabel"] == "Normalised frequency" + assert Path("test_plot_histogram.png").is_file() + + +def test_plot_and_save_histogram_series(histogram_cube, tmp_working_dir): + """Plot a histogram using the frequency method via plot._plot_histogram.""" + with open("meta.json", "wt", encoding="UTF-8") as fp: + fp.write('{"histogram_method": "frequency"}') + plot._plot_and_save_histogram_series( + histogram_cube, + vmin=250, + vmax=350, + title="test_histogram", + filename="test_plot_and_save_histogram_series.png", + bins=50, + color="black", + linewidth=2, + histtype="step", + label="frequency", + ) + assert Path("test_plot_and_save_histogram_series.png").is_file() + + def test_plot_and_save_postage_stamp_histogram_series(histogram_cube, tmp_working_dir): """Test plotting a postage stamp histogram.""" plot._plot_and_save_postage_stamp_histogram_series(