From 159e0a3e45ac81e6465c6bb010492f33f7e98064 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Tue, 7 Jan 2025 18:48:56 -0600 Subject: [PATCH 1/4] chore(datasets): Remove tracking datasets which are used in Kedro Viz Experiment Tracking (#969) * remove et related kedro datasets * update release note and static json schema * temporary doc fix --- kedro-datasets/RELEASE.md | 4 + .../docs/source/api/kedro_datasets.rst | 2 - kedro-datasets/kedro_datasets/_typing.py | 5 - .../kedro_datasets/dask/csv_dataset.py | 4 +- .../kedro_datasets/dask/parquet_dataset.py | 4 +- .../kedro_datasets/tracking/__init__.py | 26 --- .../kedro_datasets/tracking/json_dataset.py | 56 ----- .../tracking/metrics_dataset.py | 76 ------- kedro-datasets/pyproject.toml | 4 - .../static/jsonschema/kedro-catalog-0.18.json | 72 ------- .../static/jsonschema/kedro-catalog-0.19.json | 72 ------- kedro-datasets/tests/tracking/__init__.py | 0 .../tests/tracking/test_json_dataset.py | 195 ----------------- .../tests/tracking/test_metrics_dataset.py | 204 ------------------ 14 files changed, 8 insertions(+), 716 deletions(-) delete mode 100644 kedro-datasets/kedro_datasets/tracking/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/json_dataset.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/metrics_dataset.py delete mode 100644 kedro-datasets/tests/tracking/__init__.py delete mode 100644 kedro-datasets/tests/tracking/test_json_dataset.py delete mode 100644 kedro-datasets/tests/tracking/test_metrics_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index a477dca5e..16fa5b18a 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,7 +1,11 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes + ## Breaking Changes + +- Removed `tracking.MetricsDataset` and `tracking.JSONDataset` + ## Community contributions # Release 6.0.0 diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst index 0cbd3bc4e..63142220a 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets.rst @@ -62,6 +62,4 @@ kedro_datasets svmlight.SVMLightDataset tensorflow.TensorFlowModelDataset text.TextDataset - tracking.JSONDataset - tracking.MetricsDataset yaml.YAMLDataset diff --git a/kedro-datasets/kedro_datasets/_typing.py b/kedro-datasets/kedro_datasets/_typing.py index feb6d91b7..aa083f514 100644 --- a/kedro-datasets/kedro_datasets/_typing.py +++ b/kedro-datasets/kedro_datasets/_typing.py @@ -9,8 +9,3 @@ ImagePreview = NewType("ImagePreview", str) PlotlyPreview = NewType("PlotlyPreview", dict) JSONPreview = NewType("JSONPreview", str) - - -# experiment tracking datasets types -MetricsTrackingPreview = NewType("MetricsTrackingPreview", dict) -JSONTrackingPreview = NewType("JSONTrackingPreview", dict) diff --git a/kedro-datasets/kedro_datasets/dask/csv_dataset.py b/kedro-datasets/kedro_datasets/dask/csv_dataset.py index 053da6b00..bc5b5764b 100644 --- a/kedro-datasets/kedro_datasets/dask/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/csv_dataset.py @@ -67,9 +67,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a CSV file CSV collection or the directory of a multipart CSV. 
load_args: Additional loading options `dask.dataframe.read_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_csv.html save_args: Additional saving options for `dask.dataframe.to_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_csv.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 1acfe7cda..3b2dff73e 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -97,9 +97,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a parquet file parquet collection or the directory of a multipart parquet. load_args: Additional loading options `dask.dataframe.read_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html save_args: Additional saving options for `dask.dataframe.to_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_parquet.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/tracking/__init__.py b/kedro-datasets/kedro_datasets/tracking/__init__.py deleted file mode 100644 index 1b1a5c70d..000000000 --- a/kedro-datasets/kedro_datasets/tracking/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Dataset implementations to save data for Kedro Experiment Tracking.""" - -import warnings -from typing import Any - -import lazy_loader as lazy - -from kedro_datasets import KedroDeprecationWarning - -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -JSONDataset: Any -MetricsDataset: Any - -__getattr__, __dir__, __all__ = lazy.attach( - __name__, - submod_attrs={ - "json_dataset": ["JSONDataset"], - "metrics_dataset": ["MetricsDataset"], - }, -) - -warnings.warn( - "`tracking.JSONDataset` and `tracking.MetricsDataset` are deprecated. These datasets will be removed in kedro-datasets 7.0.0", - KedroDeprecationWarning, - stacklevel=2, -) diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py deleted file mode 100644 index d73df1b10..000000000 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ /dev/null @@ -1,56 +0,0 @@ -"""``JSONDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``JSONDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default. -""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import JSONTrackingPreview -from kedro_datasets.json import json_dataset - - -class JSONDataset(json_dataset.JSONDataset): - """``JSONDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). 
It uses native json to handle the JSON file. - The ``JSONDataset`` is part of Kedro Experiment Tracking. - The dataset is write-only and it is versioned by default. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.JSONDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import JSONDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = JSONDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def preview(self) -> JSONTrackingPreview: # type: ignore[override] - "Load the JSON tracking dataset used in Kedro-viz experiment tracking." - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return JSONTrackingPreview(json.load(fs_file)) diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py deleted file mode 100644 index 6202acf34..000000000 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -"""``MetricsDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default -and only takes metrics of numeric values. -""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import MetricsTrackingPreview -from kedro_datasets.json import json_dataset - - -class MetricsDataset(json_dataset.JSONDataset): - """``MetricsDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. The - ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is write-only, - it is versioned by default and only takes metrics of numeric values. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.MetricsDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import MetricsDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = MetricsDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def save(self, data: dict[str, float]) -> None: - """Converts all values in the data from a ``MetricsDataset`` to float to make sure - they are numeric values which can be displayed in Kedro Viz and then saves the dataset. - """ - try: - for key, value in data.items(): - data[key] = float(value) - except ValueError as exc: - raise DatasetError( - f"The MetricsDataset expects only numeric values. 
{exc}" - ) from exc - - save_path = get_filepath_str(self._get_save_path(), self._protocol) - - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - json.dump(data, fs_file, **self._save_args) - - self._invalidate_cache() - - def preview(self) -> MetricsTrackingPreview: # type: ignore[override] - "Load the Metrics tracking dataset used in Kedro-viz experiment tracking" - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return json.load(fs_file) diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 91b938c19..3ee8eb9e9 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -163,10 +163,6 @@ tensorflow = ["kedro-datasets[tensorflow-tensorflowmodeldataset]"] text-textdataset = [] text = ["kedro-datasets[text-textdataset]"] -tracking-jsondataset = [] -tracking-metricsdataset = [] -tracking = ["kedro-datasets[tracking-jsondataset, tracking-metricsdataset]"] - yaml-yamldataset = ["kedro-datasets[pandas-base]", "PyYAML>=4.2, <7.0"] yaml = ["kedro-datasets[yaml-yamldataset]"] diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json index 195f0234a..b9fa61d14 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json @@ -42,8 +42,6 @@ "spark.SparkJDBCDataSet", "tensorflow.TensorFlowModelDataset", "text.TextDataSet", - "tracking.JSONDataSet", - "tracking.MetricsDataSet", "yaml.YAMLDataSet" ] } @@ -1312,76 +1310,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
- } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json index f19266812..087725710 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -41,8 +41,6 @@ "spark.SparkJDBCDataset", "tensorflow.TensorFlowModelDataset", "text.TextDataset", - "tracking.JSONDataset", - "tracking.MetricsDataset", "yaml.YAMLDataset" ] } @@ -1277,76 +1275,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
- } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/tests/tracking/__init__.py b/kedro-datasets/tests/tracking/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py deleted file mode 100644 index de24ba9b9..000000000 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ /dev/null @@ -1,195 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import JSONDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def json_dataset(filepath_json, save_args, fs_args): - return JSONDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_json_dataset(filepath_json, load_version, save_version): - return JSONDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": "mystring"} - - -class TestJSONDataset: - def test_save(self, filepath_json, dummy_data, tmp_path, save_version): - """Test saving and reloading the dataset.""" - json_dataset = JSONDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - json_dataset.save(dummy_data) - - actual_filepath = Path(json_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert json_dataset._fs_open_args_load == {} - assert json_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, json_dataset, dummy_data): - json_dataset.save(dummy_data) - pattern = r"Loading not supported for 'JSONDataset'" - with pytest.raises(DatasetError, match=pattern): - json_dataset.load() - - def test_exists(self, json_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not json_dataset.exists() - json_dataset.save(dummy_data) - assert json_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, json_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert json_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, json_dataset, fs_args): - assert json_dataset._fs_open_args_load == fs_args["open_args_load"] - assert json_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - 
], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = JSONDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = JSONDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = JSONDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "JSONDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = JSONDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "JSONDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_json_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_json_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for JSONDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, - explicit_versioned_json_dataset, - load_version, - save_version, - dummy_data, - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"JSONDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." 
- - with pytest.raises(DatasetError, match=pattern): - JSONDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, json_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": "mystring"} - json_dataset.save(dummy_data) - preview = json_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(json_dataset.preview).return_annotation.__name__ - == "JSONTrackingPreview" - ) diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py deleted file mode 100644 index b638fcdfd..000000000 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ /dev/null @@ -1,204 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import MetricsDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def metrics_dataset(filepath_json, save_args, fs_args): - return MetricsDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_metrics_dataset(filepath_json, load_version, save_version): - return MetricsDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": 3} - - -class TestMetricsDataset: - def test_save_data( - self, - dummy_data, - tmp_path, - filepath_json, - save_version, - ): - """Test saving and reloading the dataset.""" - metrics_dataset = MetricsDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - metrics_dataset.save(dummy_data) - - actual_filepath = Path(metrics_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert metrics_dataset._fs_open_args_load == {} - assert metrics_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, metrics_dataset, dummy_data): - metrics_dataset.save(dummy_data) - pattern = r"Loading not supported for 'MetricsDataset'" - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.load() - - def test_exists(self, metrics_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not metrics_dataset.exists() - metrics_dataset.save(dummy_data) - assert metrics_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, metrics_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert metrics_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, metrics_dataset, fs_args): - assert 
metrics_dataset._fs_open_args_load == fs_args["open_args_load"] - assert metrics_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = MetricsDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = MetricsDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_fail_on_saving_non_numeric_value(self, metrics_dataset): - data = {"col1": 1, "col2": 2, "col3": "hello"} - - pattern = "The MetricsDataset expects only numeric values." - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.save(data) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = MetricsDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "MetricsDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = MetricsDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "MetricsDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_metrics_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_metrics_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for MetricsDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, explicit_versioned_metrics_dataset, load_version, save_version, dummy_data - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"MetricsDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." 
- - with pytest.raises(DatasetError, match=pattern): - MetricsDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, metrics_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": 3} - metrics_dataset.save(dummy_data) - preview = metrics_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(metrics_dataset.preview).return_annotation.__name__ - == "MetricsTrackingPreview" - ) From 6f0ffa9e7f7ca95058aca9993a7c16aa2ebbbad5 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:15:32 +0000 Subject: [PATCH 2/4] docs(datasets): Move to linkcode extension (#985) Move to linkcode extension Signed-off-by: Ankita Katiyar --- kedro-datasets/docs/source/conf.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py index f62e80104..039658936 100644 --- a/kedro-datasets/docs/source/conf.py +++ b/kedro-datasets/docs/source/conf.py @@ -14,6 +14,8 @@ from __future__ import annotations import importlib +import inspect +import os import re import sys from inspect import getmembers, isclass, isfunction @@ -22,6 +24,8 @@ from click import secho, style from kedro import __version__ as release +import kedro_datasets + # -- Project information ----------------------------------------------------- project = "kedro-datasets" @@ -47,7 +51,7 @@ "sphinx_autodoc_typehints", "sphinx.ext.doctest", "sphinx.ext.ifconfig", - "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinxcontrib.jquery", "sphinx_copybutton", "myst_parser", @@ -452,3 +456,25 @@ def setup(app): user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0" myst_heading_anchors = 5 + +def linkcode_resolve(domain, info): + """Resolve a GitHub URL corresponding to a Python object.""" + if domain != 'py': + return None + + try: + mod = sys.modules[info['module']] + obj = mod + for attr in info['fullname'].split('.'): + obj = getattr(obj, attr) + obj = inspect.unwrap(obj) + + filename = inspect.getsourcefile(obj) + source, lineno = inspect.getsourcelines(obj) + relpath = os.path.relpath(filename, start=os.path.dirname( + kedro_datasets.__file__)) + + return f'https://github.com/kedro-org/kedro-plugins/blob/main/kedro-datasets/kedro_datasets/{relpath}#L{lineno}#L{lineno + len(source) - 1}' + + except (KeyError, ImportError, AttributeError, TypeError, OSError, ValueError): + return None From 630f4ea3f032ea62b7b5834762579975a2b44ebc Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Mon, 13 Jan 2025 09:48:50 -0600 Subject: [PATCH 3/4] fix(datasets): Fix polars.CSVDataset `save` on Windows (#979) * test csv win Signed-off-by: ravi_kumar_pilla * change ci yaml for testing Signed-off-by: ravi_kumar_pilla * change ci yaml for testing Signed-off-by: ravi_kumar_pilla * add default encoding when opening file * revert workflow tests Signed-off-by: ravi_kumar_pilla * fix lint Signed-off-by: ravi_kumar_pilla * update release note * update release note --------- Signed-off-by: ravi_kumar_pilla --- kedro-datasets/RELEASE.md | 5 ++++- kedro-datasets/kedro_datasets/polars/csv_dataset.py | 4 +++- kedro-datasets/tests/polars/test_csv_dataset.py | 10 ---------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 16fa5b18a..27df63f78 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md 
@@ -1,10 +1,13 @@ # Upcoming Release ## Major features and improvements + ## Bug fixes and other changes +- Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. + ## Breaking Changes -- Removed `tracking.MetricsDataset` and `tracking.JSONDataset` +- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`. ## Community contributions diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 6d8a988a5..9e6f35846 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -72,7 +72,9 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True} DEFAULT_SAVE_ARGS: dict[str, Any] = {} - DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "w", "encoding": "utf-8"} + } def __init__( # noqa: PLR0913 self, diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index e03f192cc..5312e9b48 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -88,14 +88,12 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): class TestCSVDataset: - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_save_and_load(self, csv_dataset, dummy_dataframe): """Test saving and reloading the dataset.""" csv_dataset.save(dummy_dataframe) reloaded = csv_dataset.load() assert_frame_equal(dummy_dataframe, reloaded) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_exists(self, csv_dataset, dummy_dataframe): """Test `exists` method invocation for both existing and nonexistent dataset.""" @@ -204,7 +202,6 @@ def test_version_str_repr(self, load_version, save_version): assert "load_args={'rechunk': True}" in str(ds) assert "load_args={'rechunk': True}" in str(ds_versioned) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for the versioned dataset.""" @@ -212,7 +209,6 @@ def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): reloaded_df = versioned_csv_dataset.load() assert_frame_equal(dummy_dataframe, reloaded_df) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_csv): """Test that if a new version is created mid-run, by an external system, it won't be loaded in the current run.""" @@ -236,7 +232,6 @@ def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_c ds_new.resolve_load_version() == v_new ) # new version is discoverable by a new instance - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_saves(self, dummy_dataframe, filepath_csv): """Test multiple cycles of save followed by load for the same dataset""" ds_versioned = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -259,7 +254,6 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv): ds_new = CSVDataset(filepath=filepath_csv, version=Version(None, None)) assert ds_new.resolve_load_version() == second_load_version - @pytest.mark.xfail(sys.platform == "win32", 
reason="file encoding is not UTF-8") def test_release_instance_cache(self, dummy_dataframe, filepath_csv): """Test that cache invalidation does not affect other instances""" ds_a = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -288,14 +282,12 @@ def test_no_versions(self, versioned_csv_dataset): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.load() - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_exists(self, versioned_csv_dataset, dummy_dataframe): """Test `exists` method invocation for versioned dataset.""" assert not versioned_csv_dataset.exists() versioned_csv_dataset.save(dummy_dataframe) assert versioned_csv_dataset.exists() - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): """Check the error when attempting to override the dataset if the corresponding CSV file for a given save version already exists.""" @@ -307,7 +299,6 @@ def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.save(dummy_dataframe) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") @pytest.mark.parametrize( "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True ) @@ -334,7 +325,6 @@ def test_http_filesystem_no_versioning(self): filepath="https://example.com/file.csv", version=Version(None, None) ) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_versioning_existing_dataset( self, csv_dataset, versioned_csv_dataset, dummy_dataframe ): From bf0c407edb237b1a9ce7993c7cf90796246209a6 Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:55:09 +0000 Subject: [PATCH 4/4] feat(all): Replace trufflehog with detect-secrets (#983) * Removed trufflehog Signed-off-by: Elena Khaustova * Updated github actions per plugin Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Updated validate-pr check scopes Signed-off-by: Elena Khaustova * Updated lint command Signed-off-by: Elena Khaustova * Added key to trigger check Signed-off-by: Elena Khaustova * Updated GH action to track per plugin Signed-off-by: Elena Khaustova * Removed secret Signed-off-by: Elena Khaustova * Updated GH for kedro-datasets Signed-off-by: Elena Khaustova * Updated secrets baseline Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova --- .github/workflows/detect-secrets.yml | 46 +++ .github/workflows/kedro-airflow.yml | 7 + .github/workflows/kedro-datasets.yml | 7 + .github/workflows/kedro-docker.yml | 7 + .github/workflows/kedro-telemetry.yml | 7 + .github/workflows/validate-pr-title.yaml | 1 + .pre-commit-config.yaml | 12 +- .secrets.baseline | 494 +++++++++++++++++++++++ Makefile | 5 +- kedro-airflow/RELEASE.md | 1 + kedro-airflow/pyproject.toml | 2 +- kedro-datasets/RELEASE.md | 2 + kedro-datasets/pyproject.toml | 2 +- kedro-docker/RELEASE.md | 1 + kedro-docker/pyproject.toml | 2 +- kedro-telemetry/RELEASE.md | 1 + kedro-telemetry/pyproject.toml | 2 +- trufflehog-ignore.txt | 3 - 18 files changed, 585 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/detect-secrets.yml create mode 100644 .secrets.baseline delete mode 100644 trufflehog-ignore.txt diff --git a/.github/workflows/detect-secrets.yml b/.github/workflows/detect-secrets.yml new file mode 100644 index 
000000000..bd360b52b --- /dev/null +++ b/.github/workflows/detect-secrets.yml @@ -0,0 +1,46 @@ +name: Detect secrets on plugins + +on: + workflow_call: + inputs: + plugin: + type: string + os: + type: string + python-version: + type: string + +jobs: + detect-secrets: + defaults: + run: + shell: bash + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + - name: Cache python packages + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{inputs.os}}-python-${{inputs.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install uv + run: | + python -m pip install "uv==0.2.21" + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + uv pip install --system "kedro @ git+https://github.com/kedro-org/kedro@main" + uv pip install --system "${{inputs.plugin}}[lint] @ ." + uv pip freeze --system + - name: Install pre-commit hooks + run: | + pre-commit install --install-hooks + pre-commit install --hook-type pre-push + - name: Scan all tracked files + run: git ls-files ":(glob)*" ${{ inputs.plugin }} -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index 85e7ca62d..92c269ea2 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -46,3 +46,10 @@ jobs: plugin: kedro-airflow os: ${{ matrix.os }} python-version: ${{ matrix.python-version }} + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-airflow + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index d5aae0282..010115b73 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -61,3 +61,10 @@ jobs: - name: Documentation check for kedro-datasets run: | make check-datasets-docs + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-datasets + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 66783b3b5..16ffcbafe 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -46,3 +46,10 @@ jobs: plugin: kedro-docker os: ${{ matrix.os }} python-version: ${{ matrix.python-version }} + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-docker + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index 5584ac775..aac47914e 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -35,3 +35,10 @@ jobs: plugin: kedro-telemetry os: ubuntu-latest python-version: "3.11" + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-telemetry + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/validate-pr-title.yaml b/.github/workflows/validate-pr-title.yaml index b6e6fc808..cb1e65327 100644 --- a/.github/workflows/validate-pr-title.yaml +++ b/.github/workflows/validate-pr-title.yaml @@ -19,5 +19,6 @@ jobs: datasets docker telemetry + all env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f9706a34..9d2eb8de3 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -24,6 +24,12 @@ repos: additional_dependencies: - black==22.12.0 + - repo: https://github.com/Yelp/detect-secrets + rev: v1.5.0 + hooks: + - id: detect-secrets + args: [ '--baseline', '.secrets.baseline' ] + - repo: local hooks: - id: ruff-kedro-datasets @@ -86,12 +92,6 @@ repos: pass_filenames: false entry: black kedro-telemetry/kedro_telemetry kedro-telemetry/tests - - id: secret_scan - name: "Secret scan" - language: system - pass_filenames: false - entry: make secret-scan - - id: bandit name: "Bandit security check" language: system diff --git a/.secrets.baseline b/.secrets.baseline new file mode 100644 index 000000000..ce3799e06 --- /dev/null +++ b/.secrets.baseline @@ -0,0 +1,494 @@ +{ + "version": "1.5.0", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "DiscordBotTokenDetector" + }, + { + "name": "GitHubTokenDetector" + }, + { + "name": "GitLabTokenDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "IPPublicDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "OpenAIDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "PypiTokenDetector" + }, + { + "name": "SendGridDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TelegramBotTokenDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + "path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_lock_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + } + ], + "results": { + "kedro-datasets/kedro_datasets/dask/parquet_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/dask/parquet_dataset.py", + "hashed_secret": "6e1d66a1596528c308e601c10aa0b92d53606ab9", + "is_verified": false, + "line_number": 71 + } + ], + "kedro-datasets/kedro_datasets/pandas/sql_dataset.py": [ + { + "type": "Basic Auth Credentials", + "filename": "kedro-datasets/kedro_datasets/pandas/sql_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 130 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/pandas/sql_dataset.py", + "hashed_secret": "e026e197bb77b12d16ab6986e068751f016d0ea5", + "is_verified": false, + 
"line_number": 382 + } + ], + "kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py", + "hashed_secret": "a761ce3a45d97e41840a788495e85a70d1bb3815", + "is_verified": false, + "line_number": 83 + } + ], + "kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 57 + } + ], + "kedro-datasets/kedro_datasets_experimental/langchain/_anthropic.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/langchain/_anthropic.py", + "hashed_secret": "b60d121b438a380c343d5ec3c2037564b82ffef3", + "is_verified": false, + "line_number": 44 + } + ], + "kedro-datasets/kedro_datasets_experimental/langchain/_cohere.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/langchain/_cohere.py", + "hashed_secret": "b60d121b438a380c343d5ec3c2037564b82ffef3", + "is_verified": false, + "line_number": 45 + } + ], + "kedro-datasets/kedro_datasets_experimental/tests/netcdf/test_netcdf_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/tests/netcdf/test_netcdf_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 17 + } + ], + "kedro-datasets/kedro_datasets_experimental/tests/video/test_video_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/tests/video/test_video_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 16 + } + ], + "kedro-datasets/tests/dask/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 14 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 27 + } + ], + "kedro-datasets/tests/dask/test_parquet_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_parquet_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 16 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_parquet_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 29 + } + ], + "kedro-datasets/tests/holoviews/test_holoviews_writer.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/holoviews/test_holoviews_writer.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 108 + } + ], + "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py", + "hashed_secret": "dc724af18fbdd4e59189f5fe768a5f8311527050", + "is_verified": false, + "line_number": 16 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py", + "hashed_secret": 
"727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 59 + } + ], + "kedro-datasets/tests/pandas/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 66 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 213 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 405 + } + ], + "kedro-datasets/tests/pandas/test_generic_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_generic_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 126 + } + ], + "kedro-datasets/tests/pandas/test_json_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_json_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 140 + } + ], + "kedro-datasets/tests/pandas/test_sql_dataset.py": [ + { + "type": "Basic Auth Credentials", + "filename": "kedro-datasets/tests/pandas/test_sql_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 19 + } + ], + "kedro-datasets/tests/pandas/test_xml_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_xml_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 117 + } + ], + "kedro-datasets/tests/partitions/test_incremental_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_incremental_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 440 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_incremental_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 460 + } + ], + "kedro-datasets/tests/partitions/test_partitioned_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "76f747de912e8682e29a23cb506dd5bf0de080d2", + "is_verified": false, + "line_number": 415 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "9027cc5a2c1321de60a2d71ccde6229d1152d6d3", + "is_verified": false, + "line_number": 416 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "5dcbdf371f181b9b7a41a4be7be70f8cbee67da7", + "is_verified": false, + "line_number": 452 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 503 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + 
"line_number": 523 + } + ], + "kedro-datasets/tests/plotly/test_html_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_html_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 70 + } + ], + "kedro-datasets/tests/plotly/test_json_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_json_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 83 + } + ], + "kedro-datasets/tests/plotly/test_plotly_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_plotly_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 81 + } + ], + "kedro-datasets/tests/polars/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 65 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 159 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 351 + } + ], + "kedro-datasets/tests/polars/test_eager_polars_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_eager_polars_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 126 + } + ], + "kedro-datasets/tests/polars/test_lazy_polars_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_lazy_polars_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 93 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_lazy_polars_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 198 + } + ], + "kedro-datasets/tests/snowflake/test_snowpark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/snowflake/test_snowpark_dataset.py", + "hashed_secret": "1365dbfe676a193420ed7981184720b426ef2b7a", + "is_verified": false, + "line_number": 32 + } + ], + "kedro-datasets/tests/spark/test_spark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 42 + } + ], + "kedro-datasets/tests/spark/test_spark_jdbc_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_jdbc_dataset.py", + "hashed_secret": "4f4fa638cf19a2919f12e0105085c123ca5c5172", + "is_verified": false, + "line_number": 15 + } + ], + "kedro-datasets/tests/spark/test_spark_streaming_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_streaming_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 17 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_streaming_dataset.py", + 
"hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 64 + } + ] + }, + "generated_at": "2025-01-13T16:27:46Z" +} diff --git a/Makefile b/Makefile index c7946d605..e8c8a4e08 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ install-pip-setuptools: python -m pip install -U pip setuptools wheel lint: - pre-commit run -a --hook-stage manual ruff-$(plugin) && pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run secret_scan --all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual + pre-commit run -a --hook-stage manual ruff-$(plugin) && pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual $(MAKE) mypy mypy: @@ -21,9 +21,6 @@ test: e2e-tests: cd $(plugin) && behave -secret-scan: - trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt . - install-test-requirements: cd $(plugin) && uv pip install ".[test]" diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 6bd0b7163..348945ac9 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming Release +* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.9.2 * Removed support for Python 3.8 diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index ec7563cdd..6ef8a8b40 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -38,9 +38,9 @@ test = [ lint = [ "bandit", "black~=22.0", + "detect-secrets~=1.5.0", "mypy~=1.0", "pre-commit>=2.9.2", - "trufflehog>=2.1.0, <3.0", "ruff~=0.0.290", # mypy requirements "types-PyYAML", diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 27df63f78..15c13da84 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,8 @@ # Upcoming Release ## Major features and improvements +- Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. + ## Bug fixes and other changes - Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. 
diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml
index 3ee8eb9e9..1fcde25c6 100644
--- a/kedro-datasets/pyproject.toml
+++ b/kedro-datasets/pyproject.toml
@@ -270,11 +270,11 @@ lint = [
     "bandit>=1.6.2, <2.0",
     "blacken-docs==1.9.2",
     "black~=22.0",
+    "detect-secrets~=1.5.0",
     "import-linter[toml]==1.2.6",
     "mypy~=1.0",
     "pre-commit>=2.9.2",
     "ruff~=0.0.290",
-    "trufflehog~=2.1",
     # mypy related dependencies
     "types-cachetools",
     "types-PyYAML",
diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md
index f81181579..b7bab9313 100644
--- a/kedro-docker/RELEASE.md
+++ b/kedro-docker/RELEASE.md
@@ -1,4 +1,5 @@
 # Upcoming Release
+* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base.

 # Release 0.6.2

diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml
index 15c8d04fc..b669a0e2d 100644
--- a/kedro-docker/pyproject.toml
+++ b/kedro-docker/pyproject.toml
@@ -39,9 +39,9 @@ test = [
 lint = [
     "bandit",
     "black~=22.0",
+    "detect-secrets~=1.5.0",
     "mypy~=1.0",
     "pre-commit>=2.9.2",
-    "trufflehog>=2.1.0, <3.0",
     "ruff~=0.0.290",
 ]

diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md
index df7bb603a..1b4fce80f 100644
--- a/kedro-telemetry/RELEASE.md
+++ b/kedro-telemetry/RELEASE.md
@@ -1,4 +1,5 @@
 # Upcoming release
+* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base.

 # Release 0.6.2
 * Removed support for Python 3.8
diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml
index 45f9d995d..1f43f2315 100644
--- a/kedro-telemetry/pyproject.toml
+++ b/kedro-telemetry/pyproject.toml
@@ -35,9 +35,9 @@ test = [
 lint = [
     "bandit>=1.6.2, <2.0",
     "black~=22.0",
+    "detect-secrets~=1.5.0",
     "mypy~=1.0",
     "pre-commit>=2.9.2",
-    "trufflehog>=2.1.0, <3.0",
     "ruff~=0.0.290",
     # mypy requirements
     "types-requests",
diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt
deleted file mode 100644
index 1929a2634..000000000
--- a/trufflehog-ignore.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-kedro-telemetry/README.md
-kedro-telemetry/RELEASE.md
-kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py
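To regenerate or re-audit the baseline after a change like this, detect-secrets ships a CLI (`detect-secrets scan > .secrets.baseline` rebuilds it, `detect-secrets audit .secrets.baseline` reviews flagged entries interactively) as well as a Python API. A short sketch of the programmatic route, assuming detect-secrets ~=1.5.0 as pinned above; the target path is illustrative, taken from one of the files flagged in the baseline:

    import json

    from detect_secrets import SecretsCollection
    from detect_secrets.settings import default_settings

    # Scan a single file with the default plugin set and dump the findings
    # in the same JSON shape used by the "results" block of .secrets.baseline.
    secrets = SecretsCollection()
    with default_settings():
        secrets.scan_file("kedro-datasets/tests/pandas/test_csv_dataset.py")

    print(json.dumps(secrets.json(), indent=2))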