From 52c25636e013685eafef1ccd1841b1f436049309 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>
Date: Mon, 22 Jan 2024 16:25:25 +0000
Subject: [PATCH] ci(datasets): Unpin dask (#522)

* Unpin dask

Signed-off-by: Ankita Katiyar

* Update doctest

Signed-off-by: Ankita Katiyar

* Update doctest

Signed-off-by: Ankita Katiyar

* Update kedro-datasets/setup.py

Co-authored-by: Nok Lam Chan
Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>

---------

Signed-off-by: Ankita Katiyar
Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>
Co-authored-by: Nok Lam Chan
---
 kedro-datasets/kedro_datasets/dask/parquet_dataset.py | 10 ++++++----
 kedro-datasets/setup.py                               |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py
index eb6f11a78..31de106ec 100644
--- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py
+++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py
@@ -39,9 +39,9 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]):
         >>> import dask.dataframe as dd
         >>> import pandas as pd
         >>> from kedro_datasets.dask import ParquetDataset
-        >>> from pandas.testing import assert_frame_equal
+        >>> import numpy as np
         >>>
-        >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [[5, 6], [7, 8]]})
+        >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [6, 7]})
         >>> ddf = dd.from_pandas(data, npartitions=2)
         >>>
         >>> dataset = ParquetDataset(
@@ -50,7 +50,7 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]):
         >>> dataset.save(ddf)
         >>> reloaded = dataset.load()
         >>>
-        >>> assert_frame_equal(ddf.compute(), reloaded.compute())
+        >>> assert np.array_equal(ddf.compute(), reloaded.compute())

         The output schema can also be explicitly specified using
         `Triad dd.DataFrame:

     def _save(self, data: dd.DataFrame) -> None:
         self._process_schema()
-        data.to_parquet(self._filepath, storage_options=self.fs_args, **self._save_args)
+        data.to_parquet(
+            path=self._filepath, storage_options=self.fs_args, **self._save_args
+        )

     def _process_schema(self) -> None:
         """This method processes the schema in the catalog.yml or the API, if provided.

diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py
index f908df897..7ed988431 100644
--- a/kedro-datasets/setup.py
+++ b/kedro-datasets/setup.py
@@ -180,7 +180,7 @@ def _collect_requirements(requires):
         "cloudpickle<=2.0.0",
         "compress-pickle[lz4]~=2.1.0",
         "coverage[toml]",
-        "dask[complete]~=2021.10", # pinned by Snyk to avoid a vulnerability
+        "dask[complete]>=2021.10",
         "delta-spark>=1.0, <3.0",
         "deltalake>=0.10.0",
         "dill~=0.3.1",
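
Note: the patch swaps the doctest's strict pandas.testing.assert_frame_equal for a value-only comparison with numpy. A minimal standalone sketch of the updated doctest flow is below; the "data/test.parquet" filepath is illustrative and not taken from the patch, and it assumes a local filesystem.

    # Sketch of the updated doctest, under the assumptions stated above.
    import dask.dataframe as dd
    import numpy as np
    import pandas as pd

    from kedro_datasets.dask import ParquetDataset

    data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [6, 7]})
    ddf = dd.from_pandas(data, npartitions=2)

    # Hypothetical local path; the doctest in the patch constructs the dataset
    # with its own (partly elided) arguments.
    dataset = ParquetDataset(filepath="data/test.parquet")
    dataset.save(ddf)
    reloaded = dataset.load()

    # np.array_equal compares values only, so the check tolerates index or
    # partition metadata that may differ across dask versions.
    assert np.array_equal(ddf.compute(), reloaded.compute())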