diff --git a/.gitpod.yml b/.gitpod.yml index 70738f4c0..f01e52544 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,6 +1,4 @@ -# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart -image: gitpod/workspace-python-3.10:2023-04-20-16-32-37 - +image: gitpod/workspace-python-3.11 tasks: # We want packages installed during the pre-build init steps to go to /workspace @@ -12,22 +10,16 @@ tasks: echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no init: | make sign-off + pip install uv + uv venv + echo source .venv/bin/activate >> ~/.bashrc + source ~/.bashrc + make install-test-requirements plugin=kedro-datasets command: | pre-commit install --install-hooks clear - -github: - prebuilds: - # enable for the master/default branch (defaults to true) - master: true - # enable for all branches in this repo (defaults to false) - branches: true - # enable for pull requests coming from this repo (defaults to true) - pullRequests: true - # enable for pull requests coming from forks (defaults to false) - pullRequestsFromForks: true - # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) - addComment: false - # add a "Review in Gitpod" button to pull requests (defaults to false) - addBadge: true + - name: system + init: | + sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make + sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils diff --git a/Makefile b/Makefile index 22bc17816..324332972 100644 --- a/Makefile +++ b/Makefile @@ -5,13 +5,6 @@ package: rm -Rf dist;\ python -m build -pypi: - python -m pip install twine -U - python -m twine upload $(plugin)/dist/* - -install: package - cd $(plugin) && pip install -U dist/*.whl - install-pip-setuptools: python -m pip install -U pip setuptools wheel @@ -25,46 +18,14 @@ mypy: test: cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile -# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite -dataset-tests: dataset-doctests - cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow - cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov - -extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark -extra_pytest_args= -dataset-doctest%: - if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \ - echo "make: *** No rule to make target \`${@}\`. Stop."; \ - exit 2; \ - fi; \ - \ - # The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples. 
- cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \ - --ignore kedro_datasets/pandas/gbq_dataset.py \ - --ignore kedro_datasets/partitions/partitioned_dataset.py \ - --ignore kedro_datasets/redis/redis_dataset.py \ - --ignore kedro_datasets/snowflake/snowpark_dataset.py \ - --ignore kedro_datasets/spark/spark_hive_dataset.py \ - --ignore kedro_datasets/spark/spark_jdbc_dataset.py \ - $(extra_pytest_arg${*}) - -test-sequential: - cd $(plugin) && pytest tests --cov-config pyproject.toml - e2e-tests: cd $(plugin) && behave secret-scan: trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt . -clean: - cd $(plugin);\ - rm -rf build dist pip-wheel-metadata .pytest_cache;\ - find . -regex ".*/__pycache__" -exec rm -rf {} +;\ - find . -regex ".*\.egg-info" -exec rm -rf {} +;\ - install-test-requirements: - cd $(plugin) && pip install ".[test]" + cd $(plugin) && uv pip install ".[test]" install-pre-commit: pre-commit install --install-hooks @@ -79,12 +40,12 @@ sign-off: echo '--in-place "$$1"' >> .git/hooks/commit-msg chmod +x .git/hooks/commit-msg +## kedro-datasets specific + # kedro-datasets related only test-no-spark: dataset-doctests-no-spark cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile -test-no-spark-sequential: dataset-doctests-no-spark - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only: @@ -93,3 +54,26 @@ test-snowflake-only: check-datasets-docs: cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck + +# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite +dataset-tests: dataset-doctests + cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow + cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov + +extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark +extra_pytest_args= +dataset-doctest%: + if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \ + echo "make: *** No rule to make target \`${@}\`. Stop."; \ + exit 2; \ + fi; \ + \ + # The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples. 
+ cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \ + --ignore kedro_datasets/pandas/gbq_dataset.py \ + --ignore kedro_datasets/partitions/partitioned_dataset.py \ + --ignore kedro_datasets/redis/redis_dataset.py \ + --ignore kedro_datasets/snowflake/snowpark_dataset.py \ + --ignore kedro_datasets/spark/spark_hive_dataset.py \ + --ignore kedro_datasets/spark/spark_jdbc_dataset.py \ + $(extra_pytest_arg${*}) diff --git a/kedro-airflow/kedro_airflow/grouping.py b/kedro-airflow/kedro_airflow/grouping.py index 26c931f8d..3890804ae 100644 --- a/kedro-airflow/kedro_airflow/grouping.py +++ b/kedro-airflow/kedro_airflow/grouping.py @@ -4,6 +4,11 @@ from kedro.pipeline.node import Node from kedro.pipeline.pipeline import Pipeline +try: + from kedro.io import CatalogProtocol +except ImportError: # pragma: no cover + pass + def _is_memory_dataset(catalog, dataset_name: str) -> bool: if dataset_name not in catalog: @@ -11,7 +16,9 @@ def _is_memory_dataset(catalog, dataset_name: str) -> bool: return False -def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]: +def get_memory_datasets( + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline +) -> set[str]: """Gather all datasets in the pipeline that are of type MemoryDataset, excluding 'parameters'.""" return { dataset_name @@ -21,7 +28,7 @@ def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]: def create_adjacency_list( - catalog: DataCatalog, pipeline: Pipeline + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline ) -> tuple[dict[str, set], dict[str, set]]: """ Builds adjacency list (adj_list) to search connected components - undirected graph, @@ -48,7 +55,7 @@ def create_adjacency_list( def group_memory_nodes( - catalog: DataCatalog, pipeline: Pipeline + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline ) -> tuple[dict[str, list[Node]], dict[str, list[str]]]: """ Nodes that are connected through MemoryDatasets cannot be distributed across diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 109d4e6fe..95b7a9dca 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -5,6 +5,8 @@ | Type | Description | Location | |-------------------------------------|-----------------------------------------------------------|-----------------------------------------| | `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` | +| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` | + * Added the following new core datasets: @@ -14,12 +16,19 @@ ## Bug fixes and other changes * Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. +* Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`. 
+* Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib +* Fixed incorrect `pandas` optional dependency ## Breaking Changes ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [Brandon Meek](https://github.com/bpmeek) * [yury-fedotov](https://github.com/yury-fedotov) +* [gitgud5000](https://github.com/gitgud5000) +* [janickspirig](https://github.com/janickspirig) +* [Galen Seilis](https://github.com/galenseilis) +* [Mariusz Wojakowski](https://github.com/mariusz89016) # Release 4.1.0 diff --git a/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst b/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst index 0eb76c739..219510954 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst @@ -16,5 +16,6 @@ kedro_datasets_experimental langchain.ChatOpenAIDataset langchain.OpenAIEmbeddingsDataset netcdf.NetCDFDataset + prophet.ProphetModelDataset pytorch.PyTorchDataset rioxarray.GeoTIFFDataset diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py index 70c6be3ae..09524612a 100644 --- a/kedro-datasets/docs/source/conf.py +++ b/kedro-datasets/docs/source/conf.py @@ -140,6 +140,8 @@ "xarray.core.dataset.Dataset", "xarray.core.dataarray.DataArray", "torch.nn.modules.module.Module", + "prophet.forecaster.Prophet", + "Prophet", ), "py:data": ( "typing.Any", diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index f16f828f7..e7ed3c2df 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -10,6 +10,7 @@ import fsspec import pandas as pd +import pandas_gbq as pd_gbq from google.cloud import bigquery from google.cloud.exceptions import NotFound from google.oauth2.credentials import Credentials @@ -138,16 +139,17 @@ def _describe(self) -> dict[str, Any]: def _load(self) -> pd.DataFrame: sql = f"select * from {self._dataset}.{self._table_name}" # nosec - self._load_args.setdefault("query", sql) - return pd.read_gbq( + self._load_args.setdefault("query_or_table", sql) + return pd_gbq.read_gbq( project_id=self._project_id, credentials=self._credentials, **self._load_args, ) def _save(self, data: pd.DataFrame) -> None: - data.to_gbq( - f"{self._dataset}.{self._table_name}", + pd_gbq.to_gbq( + dataframe=data, + destination_table=f"{self._dataset}.{self._table_name}", project_id=self._project_id, credentials=self._credentials, **self._save_args, @@ -176,7 +178,7 @@ def _validate_location(self): class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]): """``GBQQueryDataset`` loads data from a provided SQL query from Google - BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq`` + BigQuery. It uses ``pandas_gbq.read_gbq`` which itself uses ``pandas-gbq`` internally to read from BigQuery table. Therefore it supports all allowed pandas options on ``read_gbq``. 
@@ -274,7 +276,7 @@ def __init__( # noqa: PLR0913 # load sql query from arg or from file if sql: - self._load_args["query"] = sql + self._load_args["query_or_table"] = sql self._filepath = None else: # filesystem for loading sql file @@ -291,7 +293,7 @@ def __init__( # noqa: PLR0913 def _describe(self) -> dict[str, Any]: load_args = copy.deepcopy(self._load_args) desc = {} - desc["sql"] = str(load_args.pop("query", None)) + desc["sql"] = str(load_args.pop("query_or_table", None)) desc["filepath"] = str(self._filepath) desc["load_args"] = str(load_args) @@ -303,9 +305,9 @@ def _load(self) -> pd.DataFrame: if self._filepath: load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol) with self._fs.open(load_path, mode="r") as fs_file: - load_args["query"] = fs_file.read() + load_args["query_or_table"] = fs_file.read() - return pd.read_gbq( + return pd_gbq.read_gbq( project_id=self._project_id, credentials=self._credentials, **load_args, diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 5c5dc27a1..e4492161d 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -144,7 +144,7 @@ def _load(self) -> tf.keras.Model: # We assume .keras path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE) # noqa: PLW2901 - self._fs.copy(load_path, path) + self._fs.get(load_path, path) # Pass the local temporary directory/file path to keras.load_model device_name = self._load_args.pop("tf_device", None) @@ -169,7 +169,7 @@ def _save(self, data: tf.keras.Model) -> None: # Use fsspec to take from local tempfile directory/file and # put in ArbitraryFileSystem - self._fs.copy(path, save_path) + self._fs.put(path, save_path) def _exists(self) -> bool: try: diff --git a/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py b/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py new file mode 100644 index 000000000..93cd66d99 --- /dev/null +++ b/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py @@ -0,0 +1,11 @@ +"""``JSONDataset`` implementation to load/save data from/to a Prophet model file.""" + +from typing import Any + +import lazy_loader as lazy + +ProphetDataset: Any + +__getattr__, __dir__, __all__ = lazy.attach( + __name__, submod_attrs={"prophet_dataset": ["ProphetModelDataset"]} +) diff --git a/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py b/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py new file mode 100644 index 000000000..ca2cd1e75 --- /dev/null +++ b/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from typing import Any + +from kedro.io.core import Version, get_filepath_str +from prophet import Prophet +from prophet.serialize import model_from_json, model_to_json + +from kedro_datasets.json import JSONDataset + + +class ProphetModelDataset(JSONDataset): + """``ProphetModelDataset`` loads/saves Facebook Prophet models to a JSON file using an + underlying filesystem (e.g., local, S3, GCS). It uses Prophet's built-in + serialization to handle the JSON file. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + model: + type: custom_datasets.ProphetModelDataset + filepath: gcs://your_bucket/model.json + fs_args: + project: my-project + credentials: my_gcp_credentials + + Example usage for the + `Python API `_: + + .. 
code-block:: pycon + + >>> from kedro_datasets_experimental.prophet import ProphetModelDataset + >>> from prophet import Prophet + >>> import pandas as pd + >>> + >>> df = pd.DataFrame({ + >>> "ds": ["2024-01-01", "2024-01-02", "2024-01-03"], + >>> "y": [100, 200, 300] + >>> }) + >>> + >>> model = Prophet() + >>> model.fit(df) + >>> dataset = ProphetModelDataset(filepath="path/to/model.json") + >>> dataset.save(model) + >>> reloaded_model = dataset.load() + + """ + + def __init__( # noqa: PLR0913 + self, + *, + filepath: str, + save_args: dict[str, Any] | None = None, + version: Version | None = None, + credentials: dict[str, Any] | None = None, + fs_args: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + """Creates a new instance of ``ProphetModelDataset`` pointing to a concrete JSON file + on a specific filesystem. + + Args: + filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + Note: `http(s)` doesn't support versioning. + save_args: json options for saving JSON files (arguments passed + into ```json.dump``). Here you can find all available arguments: + https://docs.python.org/3/library/json.html + All defaults are preserved, but "default_flow_style", which is set to False. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + """ + super().__init__( + filepath=filepath, + save_args=save_args, + version=version, + credentials=credentials, + fs_args=fs_args, + metadata=metadata, + ) + + def _load(self) -> Prophet: + """Loads a Prophet model from a JSON file. + + Returns: + Prophet: A deserialized Prophet model. + """ + load_path = get_filepath_str(self._get_load_path(), self._protocol) + + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + return model_from_json(fs_file.read()) + + def _save(self, data: Prophet) -> None: + """Saves a Prophet model to a JSON file. + + Args: + data: The Prophet model instance to be serialized and saved. 
+ """ + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + fs_file.write(model_to_json(data)) + + self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py b/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py index 914fdb6b7..15c10a93d 100644 --- a/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py +++ b/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py @@ -96,11 +96,11 @@ def _describe(self) -> dict[str, Any]: def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) - return torch.load(load_path, **self._fs_open_args_load) + return torch.load(load_path, **self._fs_open_args_load) #nosec: B614 def _save(self, data: torch.nn.Module) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - torch.save(data.state_dict(), save_path, **self._fs_open_args_save) + torch.save(data.state_dict(), save_path, **self._fs_open_args_save) #nosec: B614 self._invalidate_cache() diff --git a/kedro-datasets/kedro_datasets_experimental/tests/conftest.py b/kedro-datasets/kedro_datasets_experimental/tests/conftest.py new file mode 100644 index 000000000..91d19f646 --- /dev/null +++ b/kedro-datasets/kedro_datasets_experimental/tests/conftest.py @@ -0,0 +1,34 @@ +""" +This file contains the fixtures that are reusable by any tests within +this directory. You don't need to import the fixtures as pytest will +discover them automatically. More info here: +https://docs.pytest.org/en/latest/fixture.html +""" + +from kedro.io.core import generate_timestamp +from pytest import fixture + + +@fixture(params=[None]) +def load_version(request): + return request.param + + +@fixture(params=[None]) +def save_version(request): + return request.param or generate_timestamp() + + +@fixture(params=[None]) +def load_args(request): + return request.param + + +@fixture(params=[None]) +def save_args(request): + return request.param + + +@fixture(params=[None]) +def fs_args(request): + return request.param diff --git a/kedro-datasets/tests/kedro_datasets_experimental/__init__.py b/kedro-datasets/kedro_datasets_experimental/tests/prophet/__init__.py similarity index 100% rename from kedro-datasets/tests/kedro_datasets_experimental/__init__.py rename to kedro-datasets/kedro_datasets_experimental/tests/prophet/__init__.py diff --git a/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py b/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py new file mode 100644 index 000000000..88510a99b --- /dev/null +++ b/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py @@ -0,0 +1,209 @@ +from pathlib import Path, PurePosixPath + +import pandas as pd +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version +from prophet import Prophet +from s3fs.core import S3FileSystem + +from kedro_datasets_experimental.prophet import ProphetModelDataset + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "test_model.json").as_posix() + + +@pytest.fixture +def prophet_model_dataset(filepath_json, save_args, fs_args): + return ProphetModelDataset( + filepath=filepath_json, save_args=save_args, fs_args=fs_args + ) + + +@pytest.fixture +def 
versioned_prophet_model_dataset(filepath_json, load_version, save_version): + return ProphetModelDataset( + filepath=filepath_json, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_model(): + df = pd.DataFrame({"ds": ["2024-01-01", "2024-01-02", "2024-01-03"], "y": [100, 200, 300]}) + model = Prophet() + # Fit the model with dummy data + model.fit(df) + return model + + +class TestProphetModelDataset: + def test_save_and_load(self, prophet_model_dataset, dummy_model): + """Test saving and reloading the Prophet model.""" + prophet_model_dataset.save(dummy_model) + reloaded = prophet_model_dataset.load() + assert isinstance(reloaded, Prophet) + assert prophet_model_dataset._fs_open_args_load == {} + assert prophet_model_dataset._fs_open_args_save == {"mode": "w"} + + def test_exists(self, prophet_model_dataset, dummy_model): + """Test `exists` method invocation for both existing and + nonexistent dataset.""" + assert not prophet_model_dataset.exists() + prophet_model_dataset.save(dummy_model) + assert prophet_model_dataset.exists() + + @pytest.mark.parametrize("save_args", [{"k1": "v1", "indent": 4}], indirect=True) + def test_save_extra_params(self, prophet_model_dataset, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert prophet_model_dataset._save_args[key] == value + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, prophet_model_dataset, fs_args): + assert prophet_model_dataset._fs_open_args_load == fs_args["open_args_load"] + assert prophet_model_dataset._fs_open_args_save == { + "mode": "w" + } # default unchanged + + def test_load_missing_file(self, prophet_model_dataset): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set ProphetModelDataset\(.*\)" + with pytest.raises(DatasetError, match=pattern): + prophet_model_dataset.load() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/model.json", S3FileSystem), + ("file:///tmp/test_model.json", LocalFileSystem), + ("/tmp/test_model.json", LocalFileSystem), #nosec: B108 + ("gcs://bucket/model.json", GCSFileSystem), + ("https://example.com/model.json", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + dataset = ProphetModelDataset(filepath=filepath) + assert isinstance(dataset._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(dataset._filepath) == path + assert isinstance(dataset._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test_model.json" + dataset = ProphetModelDataset(filepath=filepath) + dataset.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestProphetModelDatasetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test_model.json" + ds = ProphetModelDataset(filepath=filepath) + ds_versioned = ProphetModelDataset( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert 
"ProphetModelDataset" in str(ds_versioned) + assert "ProphetModelDataset" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + # Default save_args + assert "save_args={'indent': 2}" in str(ds) + assert "save_args={'indent': 2}" in str(ds_versioned) + + def test_save_and_load(self, versioned_prophet_model_dataset, dummy_model): + """Test that saved and reloaded data matches the original one for + the versioned dataset.""" + versioned_prophet_model_dataset.save(dummy_model) + reloaded = versioned_prophet_model_dataset.load() + assert isinstance(reloaded, Prophet) + + def test_no_versions(self, versioned_prophet_model_dataset): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for ProphetModelDataset\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_prophet_model_dataset.load() + + def test_exists(self, versioned_prophet_model_dataset, dummy_model): + """Test `exists` method invocation for versioned dataset.""" + assert not versioned_prophet_model_dataset.exists() + versioned_prophet_model_dataset.save(dummy_model) + assert versioned_prophet_model_dataset.exists() + + def test_prevent_overwrite(self, versioned_prophet_model_dataset, dummy_model): + """Check the error when attempting to override the dataset if the + corresponding json file for a given save version already exists.""" + versioned_prophet_model_dataset.save(dummy_model) + pattern = ( + r"Save path \'.+\' for ProphetModelDataset\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DatasetError, match=pattern): + versioned_prophet_model_dataset.save(dummy_model) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_prophet_model_dataset, load_version, save_version, dummy_model + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " + r"ProphetModelDataset\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_prophet_model_dataset.save(dummy_model) + + def test_http_filesystem_no_versioning(self): + pattern = "Versioning is not supported for HTTP protocols." 
+ + with pytest.raises(DatasetError, match=pattern): + ProphetModelDataset( + filepath="https://example.com/model.json", version=Version(None, None) + ) + + def test_versioning_existing_dataset( + self, prophet_model_dataset, versioned_prophet_model_dataset, dummy_model + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + prophet_model_dataset.save(dummy_model) + assert prophet_model_dataset.exists() + assert ( + prophet_model_dataset._filepath == versioned_prophet_model_dataset._filepath + ) + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_prophet_model_dataset._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_prophet_model_dataset.save(dummy_model) + + # Remove non-versioned dataset and try again + Path(prophet_model_dataset._filepath.as_posix()).unlink() + versioned_prophet_model_dataset.save(dummy_model) + assert versioned_prophet_model_dataset.exists() diff --git a/kedro-datasets/tests/kedro_datasets_experimental/pytorch/__init__.py b/kedro-datasets/kedro_datasets_experimental/tests/pytorch/__init__.py similarity index 100% rename from kedro-datasets/tests/kedro_datasets_experimental/pytorch/__init__.py rename to kedro-datasets/kedro_datasets_experimental/tests/pytorch/__init__.py diff --git a/kedro-datasets/tests/kedro_datasets_experimental/pytorch/test_pytorch_dataset.py b/kedro-datasets/kedro_datasets_experimental/tests/pytorch/test_pytorch_dataset.py similarity index 100% rename from kedro-datasets/tests/kedro_datasets_experimental/pytorch/test_pytorch_dataset.py rename to kedro-datasets/kedro_datasets_experimental/tests/pytorch/test_pytorch_dataset.py diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index e1180f2f2..56e40fd4d 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -33,8 +33,9 @@ api = ["kedro-datasets[api-apidataset]"] biosequence-biosequencedataset = ["biopython~=1.73"] biosequence = ["kedro-datasets[biosequence-biosequencedataset]"] +dask-csvdataset = ["dask[dataframe]>=2021.10"] dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"] -dask = ["kedro-datasets[dask-parquetdataset]"] +dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"] databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base,hdfs-base,s3fs-base]"] databricks = ["kedro-datasets[databricks-managedtabledataset]"] @@ -92,7 +93,7 @@ pandas-featherdataset = ["kedro-datasets[pandas-base]"] pandas-gbqtabledataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"] pandas-gbqquerydataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"] pandas-genericdataset = ["kedro-datasets[pandas-base]"] -pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables~=3.6"] +pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables>=3.6"] pandas-jsondataset = ["kedro-datasets[pandas-base]"] pandas-parquetdataset = ["kedro-datasets[pandas-base]", "pyarrow>=6.0"] pandas-sqltabledataset = ["kedro-datasets[pandas-base]", "SQLAlchemy>=1.4, <3.0"] @@ -104,7 +105,7 @@ pandas = [ pandas-exceldataset,\ pandas-featherdataset,\ pandas-gbqquerydataset,\ - pandas-gbqtabledataset.\ + pandas-gbqtabledataset,\ pandas-genericdataset,\ pandas-hdfdataset,\ pandas-jsondataset,\ @@ -127,9 +128,12 @@ plotly = ["kedro-datasets[plotly-htmldataset,plotly-jsondataset,plotly-plotlydat polars-csvdataset = ["kedro-datasets[polars-base]"] 
polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"] -polars-genericdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"] polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2"] -polars = ["kedro-datasets[polars-genericdataset]"] +polars = [ + """kedro-datasets[polars-csvdataset,\ + polars-eagerpolarsdataset,\ + polars-lazypolarsdataset]""" +] redis-pickledataset = ["redis~=4.1"] redis = ["kedro-datasets[redis-pickledataset]"] @@ -140,8 +144,15 @@ snowflake = ["kedro-datasets[snowflake-snowparktabledataset]"] spark-deltatabledataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base,delta-base]"] spark-sparkdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] spark-sparkhivedataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -spark-sparkjdbcdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -spark = ["kedro-datasets[spark-deltatabledataset]"] +spark-sparkjdbcdataset = ["kedro-datasets[spark-base]"] +spark-sparkstreamingdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] +spark = [ + """kedro-datasets[spark-deltatabledataset,\ + spark-sparkdataset,\ + spark-sparkhivedataset,\ + spark-sparkjdbcdataset,\ + spark-sparkstreamingdataset]""" +] svmlight-svmlightdataset = ["scikit-learn>=1.0.2", "scipy~=1.7.3"] svmlight = ["kedro-datasets[svmlight-svmlightdataset]"] @@ -172,6 +183,8 @@ langchain = ["kedro-datasets[langchain-chatopenaidataset,langchain-openaiembeddi netcdf-netcdfdataset = ["h5netcdf>=1.2.0","netcdf4>=1.6.4","xarray>=2023.1.0"] netcdf = ["kedro-datasets[netcdf-netcdfdataset]"] +prophet-dataset = ["prophet>=1.1.5"] +prophet = ["kedro-datasets[prophet]"] pytorch-dataset = ["torch"] pytorch = ["kedro-datasets[pytorch-dataset]"] @@ -209,7 +222,7 @@ test = [ "ibis-framework[duckdb,examples]", "import-linter[toml]==1.2.6", "ipython>=7.31.1, <8.0", - "Jinja2<3.1.0", + "Jinja2<3.2.0", "joblib>=0.14", "jupyterlab>=3.0", "jupyter~=1.0", @@ -248,8 +261,7 @@ test = [ "scipy>=1.7.3", "packaging", "SQLAlchemy>=1.2", - "tables>=3.8.0; platform_system == 'Windows'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593 - "tables~=3.6; platform_system != 'Windows'", + "tables>=3.6", "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", "triad>=0.6.7, <1.0", @@ -280,7 +292,8 @@ experimental = [ "netcdf4>=1.6.4", "xarray>=2023.1.0", "rioxarray", - "torch" + "torch", + "prophet>=1.1.5", ] # All requirements diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index a797708ae..63095b74e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -95,7 +95,9 @@ def test_save_extra_params(self, gbq_dataset, save_args): def test_load_missing_file(self, gbq_dataset, mocker): """Check the error when trying to load missing table.""" pattern = r"Failed while loading data from data set GBQTableDataset\(.*\)" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.side_effect = ValueError with pytest.raises(DatasetError, match=pattern): gbq_dataset.load() @@ -133,30 +135,43 @@ def 
test_save_load_data(self, gbq_dataset, dummy_dataframe, mocker): """Test saving and reloading the data set.""" sql = f"select * from {DATASET}.{TABLE_NAME}" table_id = f"{DATASET}.{TABLE_NAME}" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_to_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd_gbq.to_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe mocked_df = mocker.Mock() gbq_dataset.save(mocked_df) loaded_data = gbq_dataset.load() - mocked_df.to_gbq.assert_called_once_with( - table_id, project_id=PROJECT, credentials=None, progress_bar=False + mocked_to_gbq.assert_called_once_with( + dataframe=mocked_df, + destination_table=table_id, + project_id=PROJECT, + credentials=None, + progress_bar=False, ) mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=sql + project_id=PROJECT, credentials=None, query_or_table=sql ) assert_frame_equal(dummy_dataframe, loaded_data) - @pytest.mark.parametrize("load_args", [{"query": "Select 1"}], indirect=True) + @pytest.mark.parametrize( + "load_args", [{"query_or_table": "Select 1"}], indirect=True + ) def test_read_gbq_with_query(self, gbq_dataset, dummy_dataframe, mocker, load_args): """Test loading data set with query in the argument.""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=load_args["query"] + project_id=PROJECT, + credentials=None, + query_or_table=load_args["query_or_table"], ) assert_frame_equal(dummy_dataframe, loaded_data) @@ -239,26 +254,30 @@ def test_credentials_propagation(self, mocker): def test_load(self, mocker, gbq_sql_dataset, dummy_dataframe): """Test `load` method invocation""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_sql_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=SQL_QUERY + project_id=PROJECT, credentials=None, query_or_table=SQL_QUERY ) assert_frame_equal(dummy_dataframe, loaded_data) def test_load_query_file(self, mocker, gbq_sql_file_dataset, dummy_dataframe): """Test `load` method invocation using a file as input query""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_sql_file_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=SQL_QUERY + project_id=PROJECT, credentials=None, query_or_table=SQL_QUERY ) assert_frame_equal(dummy_dataframe, loaded_data) diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index 9d0868b91..725ee0f96 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,5 +1,8 @@ # Upcoming Release +# Release 0.6.1 +* Unpinned pip version requirement + # Release 0.6.0 ## Major features and improvements * Added support for Python 3.12 diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature 
index 2d54de7f3..f3682fd30 100644 --- a/kedro-docker/features/docker.feature +++ b/kedro-docker/features/docker.feature @@ -100,7 +100,7 @@ Feature: Docker commands in new projects Scenario: Execute docker run target without building image When I execute the kedro command "docker run" - Then I should get an error exit code + Then I should get a successful exit code And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally." Scenario: Execute docker dive target @@ -118,5 +118,5 @@ Feature: Docker commands in new projects Scenario: Execute docker dive without building image When I execute the kedro command "docker dive" - Then I should get an error exit code + Then I should get a successful exit code And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally." diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index ca98b4d44..f504c522b 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -305,7 +305,7 @@ def check_status_code(context): print(context.result.stderr) assert ( False - ), f"Expected exit code {OK_EXIT_CODE} but got {context.result.returncode}" + ), f"Expected exit code /= {OK_EXIT_CODE} but got {context.result.returncode}" @then("I should get an error exit code") diff --git a/kedro-docker/kedro_docker/__init__.py b/kedro-docker/kedro_docker/__init__.py index 3cb3da72d..95b77ddf9 100644 --- a/kedro-docker/kedro_docker/__init__.py +++ b/kedro-docker/kedro_docker/__init__.py @@ -1,3 +1,3 @@ """Kedro plugin for packaging a project with Docker.""" -__version__ = "0.6.0" +__version__ = "0.6.1" diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index db3fb8053..27b620e64 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,5 +1,10 @@ # Upcoming release +# Release 0.6.1 +* Changed Kedro CLI loading method to improve loading times. +* Changed logging level from error to debug for most logging messages. +* Set default value for the `identity` parameter, to prevent errors due to it being empty. + # Release 0.6.0 * Moved to an opt-out model for telemetry, enabling it by default without requiring prior consent. * Added `DO_NOT_TRACK` and `KEDRO_DISABLE_TELEMETRY` environment variables to skip telemetry. 
diff --git a/kedro-telemetry/kedro_telemetry/__init__.py b/kedro-telemetry/kedro_telemetry/__init__.py index 3194e2c22..39a11503d 100644 --- a/kedro-telemetry/kedro_telemetry/__init__.py +++ b/kedro-telemetry/kedro_telemetry/__init__.py @@ -1,7 +1,7 @@ """Kedro plugin for collecting Kedro usage data.""" -__version__ = "0.6.0" +__version__ = "0.6.1" import logging -logging.getLogger(__name__).setLevel(logging.INFO) +logging.getLogger(__name__).setLevel(logging.DEBUG) diff --git a/kedro-telemetry/kedro_telemetry/masking.py b/kedro-telemetry/kedro_telemetry/masking.py index ea432f455..9308dc771 100644 --- a/kedro-telemetry/kedro_telemetry/masking.py +++ b/kedro-telemetry/kedro_telemetry/masking.py @@ -1,7 +1,7 @@ """Module containing command masking functionality.""" from __future__ import annotations -from typing import Any, Iterator +from typing import Any import click @@ -81,16 +81,19 @@ def _get_cli_structure( return output -def _mask_kedro_cli( - cli_struct: dict[str | None, Any], command_args: list[str] -) -> list[str]: +def _mask_kedro_cli(cli: click.CommandCollection, command_args: list[str]) -> list[str]: """Takes a dynamic vocabulary (based on `KedroCLI`) and returns a masked CLI input""" output = [] - - # Preserve the initial part of the command until parameters sections begin arg_index = 0 - current_CLI = cli_struct.get("kedro", {}) + cmd = command_args[0] if command_args else "" + if cmd in {"--help", "--version", "-h", "-v", ""}: + return command_args + click_cmd = cli.get_command(ctx=None, cmd_name=cmd) # type: ignore + if click_cmd is None: + return [MASK] + + current_CLI = _get_cli_structure(click_cmd) while ( arg_index < len(command_args) and not command_args[arg_index].startswith("-") @@ -116,13 +119,3 @@ def _mask_kedro_cli( output.append(MASK) return output - - -def _recursive_items(dictionary: dict[Any, Any]) -> Iterator[Any]: - for key, value in dictionary.items(): - if isinstance(value, dict): - yield key - yield from _recursive_items(value) - else: - yield key - yield value diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index a342f6e66..136201c3d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -26,7 +26,7 @@ from kedro.pipeline import Pipeline from kedro_telemetry import __version__ as TELEMETRY_VERSION -from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli +from kedro_telemetry.masking import _mask_kedro_cli HEAP_APPID_PROD = "2388822444" HEAP_ENDPOINT = "https://heapanalytics.com/api/track" @@ -49,6 +49,7 @@ CONFIG_FILENAME = "telemetry.toml" PYPROJECT_CONFIG_NAME = "pyproject.toml" UNDEFINED_PACKAGE_NAME = "undefined_package_name" +MISSING_USER_IDENTITY = "missing_user_identity" logger = logging.getLogger(__name__) @@ -78,7 +79,7 @@ def _get_or_create_uuid() -> str: return new_uuid except Exception as e: - logging.error(f"Failed to retrieve UUID: {e}") + logging.debug(f"Failed to retrieve UUID: {e}") return "" @@ -104,7 +105,7 @@ def _get_or_create_project_id(pyproject_path: Path) -> str | None: file.write(toml_string) return project_id except KeyError: - logging.error( + logging.debug( f"Failed to retrieve project id or save project id: " f"{str(pyproject_path)} does not contain a [tool.kedro] section" ) @@ -148,7 +149,7 @@ def _generate_new_uuid(full_path: str) -> str: return new_uuid except Exception as e: - logging.error(f"Failed to create UUID: {e}") + logging.debug(f"Failed to create UUID: {e}") return "" @@ -176,10 +177,7 @@ def 
before_command_run( # get KedroCLI and its structure from actual project root cli = KedroCLI(project_path=project_path if project_path else Path.cwd()) - cli_struct = _get_cli_structure(cli_obj=cli, get_help=False) - masked_command_args = _mask_kedro_cli( - cli_struct=cli_struct, command_args=command_args - ) + masked_command_args = _mask_kedro_cli(cli, command_args=command_args) self._user_uuid = _get_or_create_uuid() @@ -200,13 +198,15 @@ def after_command_run(self): @hook_impl def after_context_created(self, context): - """Hook implementation to send project statistics data to Heap""" + """Hook implementation to read metadata""" self._consent = _check_for_telemetry_consent(context.project_path) self._project_path = context.project_path @hook_impl def after_catalog_created(self, catalog): + """Hook implementation to send project statistics data to Heap""" + if self._consent is False: return @@ -241,12 +241,12 @@ def _send_telemetry_heap_event(self, event_name: str): try: _send_heap_event( event_name=event_name, - identity=self._user_uuid, + identity=self._user_uuid if self._user_uuid else MISSING_USER_IDENTITY, properties=self._event_properties, ) self._sent = True except Exception as exc: - logger.warning( + logger.debug( "Something went wrong in hook implementation to send command run data to Heap. " "Exception: %s", exc, @@ -324,22 +324,21 @@ def _send_heap_event( "event": event_name, "timestamp": datetime.now().strftime(TIMESTAMP_FORMAT), "properties": properties or {}, + "identity": identity, } - if identity: - data["identity"] = identity try: resp = requests.post( url=HEAP_ENDPOINT, headers=HEAP_HEADERS, data=json.dumps(data), timeout=10 ) if resp.status_code != 200: # noqa: PLR2004 - logger.warning( + logger.debug( "Failed to send data to Heap. Response code returned: %s, Response reason: %s", resp.status_code, resp.reason, ) except requests.exceptions.RequestException as exc: - logger.warning( + logger.debug( "Failed to send data to Heap. Exception of type '%s' was raised.", type(exc).__name__, ) diff --git a/kedro-telemetry/tests/integration/dummy-project/.gitignore b/kedro-telemetry/tests/integration/dummy-project/.gitignore new file mode 100644 index 000000000..51a4444c6 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/.gitignore @@ -0,0 +1,151 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** + +# except their sub-folders +!data/**/ + +# also keep all .gitkeep files +!.gitkeep + +# keep also the example dataset +!data/01_raw/* + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/README.md b/kedro-telemetry/tests/integration/dummy-project/conf/README.md new file mode 100644 index 000000000..b135e80c2 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/conf/README.md @@ -0,0 +1,20 @@ +# What is this for? + +This folder should be used to store configuration files used by Kedro or by separate tools. + +This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the section titled **Instructions**. + +## Local configuration + +The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). + +> *Note:* Please do not check in any local configuration to version control. + +## Base configuration + +The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. + +WARNING: Please do not put access credentials in the base configuration folder. + +## Find out more +You can find out more about configuration from the [user guide documentation](https://docs.kedro.org/en/stable/configuration/configuration_basics.html). 
diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/base/catalog.yml b/kedro-telemetry/tests/integration/dummy-project/conf/base/catalog.yml new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/base/parameters.yml b/kedro-telemetry/tests/integration/dummy-project/conf/base/parameters.yml new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/local/.gitkeep b/kedro-telemetry/tests/integration/dummy-project/conf/local/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/pyproject.toml b/kedro-telemetry/tests/integration/dummy-project/pyproject.toml new file mode 100644 index 000000000..ec07f1b99 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = [ "setuptools",] +build-backend = "setuptools.build_meta" + +[project] +name = "dummy_project" +readme = "README.md" +dynamic = [ "dependencies", "version",] + +[project.scripts] +dummy-project = "dummy_project.__main__:main" + +[tool.kedro] +package_name = "dummy_project" +project_name = "dummy_project" +kedro_init_version = "0.19.6" +tools = [ "None",] +example_pipeline = "True" +source_dir = "src" + +[project.entry-points."kedro.hooks"] + +[tool.setuptools.dynamic.dependencies] +file = "requirements.txt" + +[tool.setuptools.dynamic.version] +attr = "dummy_project.__version__" + +[tool.setuptools.packages.find] +where = [ "src",] +namespaces = false + +[tool.kedro_telemetry] +project_id = "KEDRO_TELEMETRY_TEST" diff --git a/kedro-telemetry/tests/integration/dummy-project/requirements.txt b/kedro-telemetry/tests/integration/dummy-project/requirements.txt new file mode 100644 index 000000000..1c5f8e218 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/requirements.txt @@ -0,0 +1,9 @@ +ipython>=8.10 +jupyterlab>=3.0 +kedro~=0.19.6 +kedro-datasets[pandas-csvdataset, pandas-exceldataset, pandas-parquetdataset]>=3.0; python_version >= "3.9" +kedro-datasets[pandas.CSVDataset, pandas.ExcelDataset, pandas.ParquetDataset]>=1.0; python_version < "3.9" +kedro-telemetry>=0.3.1 +kedro-viz>=6.7.0 +notebook +scikit-learn~=1.0 diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py new file mode 100644 index 000000000..11d70edcb --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py @@ -0,0 +1,4 @@ +"""dummy_project +""" + +__version__ = "0.1" diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py new file mode 100644 index 000000000..56cef4b26 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py @@ -0,0 +1,47 @@ +"""dummy_project file for ensuring the package is executable +as `dummy-project` and `python -m dummy_project` +""" +import importlib +from pathlib import Path + +from kedro.framework.cli.utils import KedroCliError, load_entry_points +from kedro.framework.project import configure_project + + +def _find_run_command(package_name): + try: + project_cli = importlib.import_module(f"{package_name}.cli") + # fail gracefully if cli.py does not exist + except ModuleNotFoundError as exc: + if f"{package_name}.cli" not in str(exc): + raise + plugins = 
load_entry_points("project") + run = _find_run_command_in_plugins(plugins) if plugins else None + if run: + # use run command from installed plugin if it exists + return run + # use run command from the framework project + from kedro.framework.cli.project import run + + return run + # fail badly if cli.py exists, but has no `cli` in it + if not hasattr(project_cli, "cli"): + raise KedroCliError(f"Cannot load commands from {package_name}.cli") + return project_cli.run + + +def _find_run_command_in_plugins(plugins): + for group in plugins: + if "run" in group.commands: + return group.commands["run"] + + +def main(*args, **kwargs): + package_name = Path(__file__).parent.name + configure_project(package_name) + run = _find_run_command(package_name) + run(*args, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py new file mode 100644 index 000000000..2d4272e31 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py @@ -0,0 +1,16 @@ +"""Project pipelines.""" +from typing import Dict + +from kedro.framework.project import find_pipelines +from kedro.pipeline import Pipeline + + +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. + """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py new file mode 100755 index 000000000..ddfdfdea5 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py @@ -0,0 +1,3 @@ +"""Complete Data Processing pipeline for the spaceflights tutorial""" + +from .pipeline import create_pipeline # NOQA diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py new file mode 100755 index 000000000..6fddf34d9 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py @@ -0,0 +1,9 @@ +from kedro.pipeline import Pipeline, node, pipeline + + +def one(): + return "dummy" + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(one, [], "dummy_output")]) diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py new file mode 100644 index 000000000..fc96f56e7 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py @@ -0,0 +1,46 @@ +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. 
For further information, including these default values, see +https://docs.kedro.org/en/stable/kedro_project_setup/settings.html.""" + +# Instantiated project hooks. +# For example, after creating a hooks.py and defining a ProjectHooks class there, do +# from dummy_project.hooks import ProjectHooks + +# Hooks are executed in a Last-In-First-Out (LIFO) order. +# HOOKS = (ProjectHooks(),) + +# Installed plugins for which to disable hook auto-registration. +# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) + +# Class that manages storing KedroSession data. +# from kedro.framework.session.store import BaseSessionStore +# SESSION_STORE_CLASS = BaseSessionStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. +# SESSION_STORE_ARGS = { +# "path": "./sessions" +# } + +# Directory that holds configuration. +# CONF_SOURCE = "conf" + +# Class that manages how configuration is loaded. +from kedro.config import OmegaConfigLoader # noqa: E402 + +CONFIG_LOADER_CLASS = OmegaConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. +CONFIG_LOADER_ARGS = { + "base_env": "base", + "default_run_env": "local", + # "config_patterns": { + # "spark" : ["spark*/"], + # "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + # } +} + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext +# CONTEXT_CLASS = KedroContext + +# Class that manages the Data Catalog. +# from kedro.io import DataCatalog +# DATA_CATALOG_CLASS = DataCatalog diff --git a/kedro-telemetry/tests/integration/test_telemetry.py b/kedro-telemetry/tests/integration/test_telemetry.py new file mode 100644 index 000000000..33d84547a --- /dev/null +++ b/kedro-telemetry/tests/integration/test_telemetry.py @@ -0,0 +1,60 @@ +from pathlib import Path + +from click.testing import CliRunner +from kedro.framework.cli.cli import KedroCLI +from kedro.framework.session import KedroSession +from kedro.framework.startup import bootstrap_project +from pytest import fixture + + +@fixture +def dummy_project_path(): + return Path(__file__).parent / "dummy-project" + + +class TestKedroTelemetryHookIntegration: + def test_telemetry_sent_once_with_kedro_run(self, mocker, dummy_project_path): + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + "kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + kedro_cli = KedroCLI(dummy_project_path) + CliRunner().invoke(kedro_cli, ["run"]) + mocked_heap_call.assert_called_once() + + def test_telemetry_sent_once_with_other_kedro_command( + self, mocker, dummy_project_path + ): + from kedro_telemetry.plugin import telemetry_hook + + telemetry_hook.consent = None + telemetry_hook._sent = False + telemetry_hook.event_properties = None + telemetry_hook.project_path = None + + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + "kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + kedro_cli = KedroCLI(dummy_project_path) + CliRunner().invoke(kedro_cli, ["run"]) + mocked_heap_call.assert_called_once() + + def test_telemetry_sent_once_with_session_run(self, mocker, dummy_project_path): + from kedro_telemetry.plugin import telemetry_hook + + telemetry_hook.consent = None + telemetry_hook._sent = False + telemetry_hook.event_properties = None + telemetry_hook.project_path = None + + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + 
"kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + # Mock because all tests are sharing the kedro_telemetry.plugin.telemetry_hook object + + bootstrap_project(dummy_project_path) + with KedroSession.create(project_path=dummy_project_path) as session: + session.run() + mocked_heap_call.assert_called_once() diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 778e85a54..59ee8ace0 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -13,7 +13,6 @@ MASK, _get_cli_structure, _mask_kedro_cli, - _recursive_items, ) REPO_NAME = "cli_tools_dummy_project" @@ -152,94 +151,48 @@ def test_get_cli_structure_help(self, mocker, fake_metadata): assert v.startswith("Usage: [OPTIONS]") @pytest.mark.parametrize( - "input_dict, expected_output_count", + "input_command_args, expected_masked_args", [ - ({}, 0), - ({"a": "foo"}, 2), - ({"a": {"b": "bar"}, "c": {"baz"}}, 5), + ([], []), ( - { - "a": {"b": "bar"}, - "c": None, - "d": {"e": "fizz"}, - "f": {"g": {"h": "buzz"}}, - }, - 12, + ["info"], + ["info"], ), - ], - ) - def test_recursive_items(self, input_dict, expected_output_count): - assert expected_output_count == len( - list(_recursive_items(dictionary=input_dict)) - ) - - def test_recursive_items_empty(self): - assert len(list(_recursive_items({}))) == 0 - - @pytest.mark.parametrize( - "input_cli_structure, input_command_args, expected_masked_args", - [ - ({}, [], []), ( - {"kedro": {"command_a": None, "command_b": None}}, - ["command_a"], - ["command_a"], + ["run", "--pipeline=data_science"], + ["run", "--pipeline", MASK], ), ( - { - "kedro": { - "command_a": {"--param1": None, "--param2": None}, - "command_b": None, - } - }, - ["command_a", "--param1=foo"], - ["command_a", "--param1", MASK], + ["catalog", "list"], + ["catalog", "list"], ), ( - { - "kedro": { - "command_a": {"--param1": None, "--param2": None}, - "command_b": None, - } - }, - ["command_a", "--param1= foo"], - ["command_a", "--param1", MASK], + ["pipeline", "create", "mypipeline"], + ["pipeline", "create", MASK], ), ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "-p", "bar"], - ["command_a", "-p", MASK], + ["run", "-p", "bar"], + ["run", "-p", MASK], ), ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "-xyz", "bar"], - ["command_a", MASK, MASK], - ), - ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "should", "be", "seen", "only"], - ["command_a", MASK, MASK, MASK, MASK], + ["run", "--params=hello=4", "--pipeline=my_pipeline"], + ["run", "--params", MASK, "--pipeline", MASK], ), ], ) def test_mask_kedro_cli( - self, input_cli_structure, input_command_args, expected_masked_args + self, input_command_args, expected_masked_args, fake_metadata, mocker ): + Module = namedtuple("Module", ["cli"]) + mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) + mocker.patch( + "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata + ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) + kedro_cli = KedroCLI(fake_metadata.project_path) assert expected_masked_args == _mask_kedro_cli( - cli_struct=input_cli_structure, command_args=input_command_args + cli=kedro_cli, command_args=input_command_args ) diff --git a/kedro-telemetry/tests/test_plugin.py 
b/kedro-telemetry/tests/test_plugin.py index c100dd9e1..048f17561 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -16,6 +16,7 @@ from kedro_telemetry.plugin import ( _SKIP_TELEMETRY_ENV_VAR_KEYS, KNOWN_CI_ENV_VAR_KEYS, + MISSING_USER_IDENTITY, KedroTelemetryHook, _check_for_telemetry_consent, _is_known_ci_env, @@ -347,7 +348,7 @@ def test_before_command_run_anonymous(self, mocker, fake_metadata): expected_calls = [ mocker.call( event_name="CLI command", - identity="", + identity=MISSING_USER_IDENTITY, properties=generic_properties, ), ]
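
Reviewer note (sketch, not part of this changeset): the two integration tests above that share the module-level telemetry_hook object repeat the same four attribute resets inline. A possible follow-up in kedro-telemetry/tests/integration/test_telemetry.py would hoist that reset into an autouse fixture; the fixture name below is hypothetical, and the attributes are exactly the ones the tests reset.

# Sketch only: autouse fixture performing the shared-state reset that the
# integration tests above currently repeat inline.
from pytest import fixture


@fixture(autouse=True)
def reset_telemetry_hook():
    from kedro_telemetry.plugin import telemetry_hook

    telemetry_hook.consent = None
    telemetry_hook._sent = False
    telemetry_hook.event_properties = None
    telemetry_hook.project_path = None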
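
Reviewer note (sketch, not part of this changeset): the reworked test_masking.py cases now call _mask_kedro_cli with a KedroCLI instance instead of the old cli_struct dictionary. A minimal standalone usage sketch follows; it assumes the helpers live in kedro_telemetry's masking module (the import line is not shown in the hunk above) and that the current directory holds a bootstrappable Kedro project, which the tests avoid by mocking _is_project, bootstrap_project and importlib.import_module.

# Illustrative usage only, mirroring one parametrised case from the reworked
# test_masking.py above. The kedro_telemetry.masking import path is assumed.
from pathlib import Path

from kedro.framework.cli.cli import KedroCLI
from kedro_telemetry.masking import MASK, _mask_kedro_cli

kedro_cli = KedroCLI(Path.cwd())  # expects a real Kedro project here
masked = _mask_kedro_cli(cli=kedro_cli, command_args=["run", "--pipeline=data_science"])
assert masked == ["run", "--pipeline", MASK]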