fix(datasets): Fix polars.CSVDataset save on Windows (#979)

* test csv win
  Signed-off-by: ravi_kumar_pilla <[email protected]>
* change ci yaml for testing
  Signed-off-by: ravi_kumar_pilla <[email protected]>
* change ci yaml for testing
  Signed-off-by: ravi_kumar_pilla <[email protected]>
* add default encoding when opening file
* revert workflow tests
  Signed-off-by: ravi_kumar_pilla <[email protected]>
* fix lint
  Signed-off-by: ravi_kumar_pilla <[email protected]>
* update release note
* update release note

---------

Signed-off-by: ravi_kumar_pilla <[email protected]>
kedro-org · Jan 13, 2025 · 630f4ea
1 parent 6f0ffa9
commit 630f4ea
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 12 deletions.
diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
@@ -1,10 +1,13 @@
 # Upcoming Release
 ## Major features and improvements
+
 ## Bug fixes and other changes
 
+- Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding.
+
 ## Breaking Changes
 
-- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`
+- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`.
 
 ## Community contributions
 

diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py
@@ -72,7 +72,9 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]):
 
     DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True}
     DEFAULT_SAVE_ARGS: dict[str, Any] = {}
-    DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}}
+    DEFAULT_FS_ARGS: dict[str, Any] = {
+        "open_args_save": {"mode": "w", "encoding": "utf-8"}
+    }
 
     def __init__(  # noqa: PLR0913
         self,

diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py
@@ -88,14 +88,12 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame):
 
 
 class TestCSVDataset:
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_save_and_load(self, csv_dataset, dummy_dataframe):
         """Test saving and reloading the dataset."""
         csv_dataset.save(dummy_dataframe)
         reloaded = csv_dataset.load()
         assert_frame_equal(dummy_dataframe, reloaded)
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_exists(self, csv_dataset, dummy_dataframe):
         """Test `exists` method invocation for both existing and
         nonexistent dataset."""
@@ -204,15 +202,13 @@ def test_version_str_repr(self, load_version, save_version):
         assert "load_args={'rechunk': True}" in str(ds)
         assert "load_args={'rechunk': True}" in str(ds_versioned)
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe):
         """Test that saved and reloaded data matches the original one for
         the versioned dataset."""
         versioned_csv_dataset.save(dummy_dataframe)
         reloaded_df = versioned_csv_dataset.load()
         assert_frame_equal(dummy_dataframe, reloaded_df)
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_csv):
         """Test that if a new version is created mid-run, by an
         external system, it won't be loaded in the current run."""
@@ -236,7 +232,6 @@ def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_c
             ds_new.resolve_load_version() == v_new
         )  # new version is discoverable by a new instance
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_multiple_saves(self, dummy_dataframe, filepath_csv):
         """Test multiple cycles of save followed by load for the same dataset"""
         ds_versioned = CSVDataset(filepath=filepath_csv, version=Version(None, None))
@@ -259,7 +254,6 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv):
         ds_new = CSVDataset(filepath=filepath_csv, version=Version(None, None))
         assert ds_new.resolve_load_version() == second_load_version
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
         """Test that cache invalidation does not affect other instances"""
         ds_a = CSVDataset(filepath=filepath_csv, version=Version(None, None))
@@ -288,14 +282,12 @@ def test_no_versions(self, versioned_csv_dataset):
         with pytest.raises(DatasetError, match=pattern):
             versioned_csv_dataset.load()
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_exists(self, versioned_csv_dataset, dummy_dataframe):
         """Test `exists` method invocation for versioned dataset."""
         assert not versioned_csv_dataset.exists()
         versioned_csv_dataset.save(dummy_dataframe)
         assert versioned_csv_dataset.exists()
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe):
         """Check the error when attempting to override the dataset if the
         corresponding CSV file for a given save version already exists."""
@@ -307,7 +299,6 @@ def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe):
         with pytest.raises(DatasetError, match=pattern):
             versioned_csv_dataset.save(dummy_dataframe)
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     @pytest.mark.parametrize(
         "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True
     )
@@ -334,7 +325,6 @@ def test_http_filesystem_no_versioning(self):
                 filepath="https://example.com/file.csv", version=Version(None, None)
             )
 
-    @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
     def test_versioning_existing_dataset(
         self, csv_dataset, versioned_csv_dataset, dummy_dataframe
     ):