Skip to content

Commit

Permalink
fix(datasets): Fix polars.CSVDataset save on Windows (#979)
Browse files Browse the repository at this point in the history
* test csv win

Signed-off-by: ravi_kumar_pilla <[email protected]>

* change ci yaml for testing

Signed-off-by: ravi_kumar_pilla <[email protected]>

* change ci yaml for testing

Signed-off-by: ravi_kumar_pilla <[email protected]>

* add default encoding when opening file

* revert workflow tests

Signed-off-by: ravi_kumar_pilla <[email protected]>

* fix lint

Signed-off-by: ravi_kumar_pilla <[email protected]>

* update release note

* update release note

---------

Signed-off-by: ravi_kumar_pilla <[email protected]>
  • Loading branch information
ravi-kumar-pilla authored Jan 13, 2025
1 parent 6f0ffa9 commit 630f4ea
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 12 deletions.
5 changes: 4 additions & 1 deletion kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Upcoming Release
## Major features and improvements

## Bug fixes and other changes

- Fix `polars.CSVDataset` `save` method on Windows by using `utf-8` as the default encoding when opening the file for writing.

## Breaking Changes

- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`
- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`.

## Community contributions

Expand Down
4 changes: 3 additions & 1 deletion kedro-datasets/kedro_datasets/polars/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]):

DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True}
DEFAULT_SAVE_ARGS: dict[str, Any] = {}
DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}}
DEFAULT_FS_ARGS: dict[str, Any] = {
"open_args_save": {"mode": "w", "encoding": "utf-8"}
}

def __init__( # noqa: PLR0913
self,
Expand Down
10 changes: 0 additions & 10 deletions kedro-datasets/tests/polars/test_csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,12 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame):


class TestCSVDataset:
@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_save_and_load(self, csv_dataset, dummy_dataframe):
    """Check that a frame written through the dataset is read back unchanged."""
    csv_dataset.save(dummy_dataframe)
    # Compare the original frame against whatever the dataset loads back.
    assert_frame_equal(dummy_dataframe, csv_dataset.load())

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_exists(self, csv_dataset, dummy_dataframe):
"""Test `exists` method invocation for both existing and
nonexistent dataset."""
Expand Down Expand Up @@ -204,15 +202,13 @@ def test_version_str_repr(self, load_version, save_version):
assert "load_args={'rechunk': True}" in str(ds)
assert "load_args={'rechunk': True}" in str(ds_versioned)

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe):
    """Check the save/load round trip for the versioned dataset:
    the reloaded frame must equal the frame that was saved."""
    versioned_csv_dataset.save(dummy_dataframe)
    # Compare the original frame against whatever the dataset loads back.
    assert_frame_equal(dummy_dataframe, versioned_csv_dataset.load())

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_csv):
"""Test that if a new version is created mid-run, by an
external system, it won't be loaded in the current run."""
Expand All @@ -236,7 +232,6 @@ def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_c
ds_new.resolve_load_version() == v_new
) # new version is discoverable by a new instance

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_multiple_saves(self, dummy_dataframe, filepath_csv):
"""Test multiple cycles of save followed by load for the same dataset"""
ds_versioned = CSVDataset(filepath=filepath_csv, version=Version(None, None))
Expand All @@ -259,7 +254,6 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv):
ds_new = CSVDataset(filepath=filepath_csv, version=Version(None, None))
assert ds_new.resolve_load_version() == second_load_version

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
"""Test that cache invalidation does not affect other instances"""
ds_a = CSVDataset(filepath=filepath_csv, version=Version(None, None))
Expand Down Expand Up @@ -288,14 +282,12 @@ def test_no_versions(self, versioned_csv_dataset):
with pytest.raises(DatasetError, match=pattern):
versioned_csv_dataset.load()

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_exists(self, versioned_csv_dataset, dummy_dataframe):
    """Check that `exists` is falsy before the versioned dataset is
    saved and truthy once a save has happened."""
    present_before_save = versioned_csv_dataset.exists()
    assert not present_before_save

    versioned_csv_dataset.save(dummy_dataframe)

    present_after_save = versioned_csv_dataset.exists()
    assert present_after_save

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe):
"""Check the error when attempting to override the dataset if the
corresponding CSV file for a given save version already exists."""
Expand All @@ -307,7 +299,6 @@ def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe):
with pytest.raises(DatasetError, match=pattern):
versioned_csv_dataset.save(dummy_dataframe)

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
@pytest.mark.parametrize(
"load_version", ["2019-01-01T23.59.59.999Z"], indirect=True
)
Expand All @@ -334,7 +325,6 @@ def test_http_filesystem_no_versioning(self):
filepath="https://example.com/file.csv", version=Version(None, None)
)

@pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8")
def test_versioning_existing_dataset(
self, csv_dataset, versioned_csv_dataset, dummy_dataframe
):
Expand Down

0 comments on commit 630f4ea

Please sign in to comment.