primap-community · mikapfl · Mar 26, 2025 · Mar 24, 2025 · Mar 24, 2025 · Mar 25, 2025
diff --git a/changelog/324.fix.md b/changelog/324.fix.md
@@ -0,0 +1 @@
+Drop encoding of data sets when merging or saving to netCDF to avoid truncation of coordinate values
diff --git a/primap2/_data_format.py b/primap2/_data_format.py
@@ -240,8 +240,22 @@ def to_netcdf(
             ones ``{"compression": "gzip", "compression_opts": 9}``.
             This allows using any compression plugin installed in the HDF5
             library, e.g. LZF.
+
+            Note that we drop encoding information that's already present beforehand
+            and only apply the encoding that is explicitly passed here. For example,
+            if your coordinate has a specified data type in the encoding attribute, it
+            will be dropped and the encoding specified will be applied. If you don't specify
+            encoding, zlib compression at level 9 is applied to all data variables.
         """
         ds = self._ds.pint.dequantify()
+
+        if encoding is None:
+            # use the zlib compression algorithm and compression level 9,
+            # 0 (no compression) - larger files, shorter processing
+            # 9 (maximum compression) - smaller files, longer processing
+            compression = dict(zlib=True, complevel=9)
+            encoding = {var: compression for var in ds.data_vars}
+
         if "publication_date" in ds.attrs:
             ds.attrs["publication_date"] = ds.attrs["publication_date"].isoformat()
         for entity in ds:
@@ -251,6 +265,8 @@ def to_netcdf(
                 and ds[entity].data.dtype == object
             ):
                 ds[entity].data = np.vectorize(lambda x: x.serialize())(ds[entity].data)
+
+        ds = ds.drop_encoding()
         return ds.to_netcdf(
             path=path,
             mode=mode,

diff --git a/primap2/_merge.py b/primap2/_merge.py
@@ -215,6 +215,11 @@ def merge(
 
         ds_start = self._ds
 
+        # Remove the encoding from the dataset
+        # Only the encoding of ds_start is considered in the merge,
+        # so we don't have to remove it from ds_merge
+        ds_start = ds_start.drop_encoding()
+
         with contextlib.suppress(xr.MergeError, ValueError):
             # if there are no conflicts just merge using xr.merge
             return xr.merge(

diff --git a/primap2/tests/test_data_format.py b/primap2/tests/test_data_format.py
@@ -28,6 +28,20 @@ def test_io_roundtrip(self, any_ds: xr.Dataset, caplog, tmp_path):
         assert attrs_before == ds.attrs
         assert attrs_before == nds.attrs
 
+    def test_io_roundtrip_after_merge(self, minimal_ds, tmp_path):
+        other_ds = minimal_ds.copy(deep=True)
+        new_areas = np.array(["COL", "G20", "G7", "UMBRELLA"], dtype="<U8")
+        other_ds = other_ds.assign_coords(
+            {"area (ISO3)": xr.DataArray(new_areas, coords={"area (ISO3)": new_areas})}
+        )
+        assert "UMBRELLA" in other_ds["area (ISO3)"]
+        minimal_ds["area (ISO3)"].encoding = {"dtype": np.dtype("<U3")}
+        merged_ds = minimal_ds.pr.merge(other_ds)
+        assert "UMBRELLA" in merged_ds["area (ISO3)"]
+        merged_ds.pr.to_netcdf(tmp_path / "temp.nc")
+        nds = primap2.open_dataset(tmp_path / "temp.nc")
+        assert "UMBRELLA" in nds["area (ISO3)"]
+
 
 class TestEnsureValid:
     def test_something_else_entirely(self, caplog):

diff --git a/primap2/tests/test_merge.py b/primap2/tests/test_merge.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 """Tests for _merge.py"""
 
+import numpy as np
 import pandas as pd
 import pytest
 import xarray as xr
@@ -234,3 +235,19 @@ def test_log_formatting_single_date(minimal_ds, caplog):
         "for time=2000, area (ISO3)=ARG, source=RAND2020:" in caplog.text
     )
     assert "(CO2)\n0.09" in caplog.text
+
+
+def test_merge_str_encoding(minimal_ds):
+    # start data set
+    minimal_ds["area (ISO3)"].encoding = {"dtype": np.dtype("<U3")}
+
+    # other data set to merge with
+    other_ds = minimal_ds.copy(deep=True)
+    new_areas = np.array(["COL", "G20", "G7", "UMBRELLA"], dtype="<U8")
+    other_ds = other_ds.assign_coords(
+        {"area (ISO3)": xr.DataArray(new_areas, coords={"area (ISO3)": new_areas})}
+    )
+    other_ds["area (ISO3)"].encoding = {"dtype": np.dtype("<U8")}
+
+    merged_ds = minimal_ds.pr.merge(other_ds)
+    assert "dtype" not in merged_ds["area (ISO3)"].encoding
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Drop encoding of data sets when merging or saving to netCDF to avoid truncation of coordinate values