From 1eb2bf79d5ab4b85763976e9ae54f12c53081d74 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>
Date: Tue, 27 Aug 2024 15:35:54 +0100
Subject: [PATCH 01/15] ci(docker): Update error code on e2e tests (#813)

* Update error message docker

Signed-off-by: Ankita Katiyar

* typo

Signed-off-by: Ankita Katiyar

* Update step

Signed-off-by: Ankita Katiyar

* Change exit code

Signed-off-by: Ankita Katiyar

* Error message

Signed-off-by: Ankita Katiyar

---------

Signed-off-by: Ankita Katiyar
---
 kedro-docker/features/docker.feature     | 4 ++--
 kedro-docker/features/steps/cli_steps.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature
index 2d54de7f3..f3682fd30 100644
--- a/kedro-docker/features/docker.feature
+++ b/kedro-docker/features/docker.feature
@@ -100,7 +100,7 @@ Feature: Docker commands in new projects

   Scenario: Execute docker run target without building image
     When I execute the kedro command "docker run"
-    Then I should get a successful exit code
+    Then I should get an error exit code
     And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally."

   Scenario: Execute docker dive target
@@ -118,5 +118,5 @@ Feature: Docker commands in new projects

   Scenario: Execute docker dive without building image
     When I execute the kedro command "docker dive"
-    Then I should get a successful exit code
+    Then I should get an error exit code
     And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally."
diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py
index ca98b4d44..f504c522b 100644
--- a/kedro-docker/features/steps/cli_steps.py
+++ b/kedro-docker/features/steps/cli_steps.py
@@ -305,7 +305,7 @@ def check_status_code(context):
     print(context.result.stderr)
     assert (
         False
-    ), f"Expected exit code {OK_EXIT_CODE} but got {context.result.returncode}"
+    ), f"Expected exit code != {OK_EXIT_CODE} but got {context.result.returncode}"


 @then("I should get an error exit code")

From cf1617bb34d59a82a90fc213960cf418c21d83c9 Mon Sep 17 00:00:00 2001
From: Nok Lam Chan
Date: Wed, 28 Aug 2024 14:22:03 +0100
Subject: [PATCH 02/15] ci: Clean up of Makefile and Gitpod setup (#464)

* switch to Dockerfile, same configuration as the kedro branch.

Signed-off-by: Nok * simplify setup Signed-off-by: Nok * update makefile Signed-off-by: Nok * simplified install in background while not slowing down startup Signed-off-by: Nok * fix command Signed-off-by: Nok * fix setup Signed-off-by: Nok * combined makefile Signed-off-by: Nok --------- Signed-off-by: Nok --- .gitpod.yml | 28 ++++++++-------------- Makefile | 68 ++++++++++++++++++++--------------------------------- 2 files changed, 36 insertions(+), 60 deletions(-) diff --git a/.gitpod.yml b/.gitpod.yml index 70738f4c0..f01e52544 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,6 +1,4 @@ -# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart -image: gitpod/workspace-python-3.10:2023-04-20-16-32-37 - +image: gitpod/workspace-python-3.11 tasks: # We want packages installed during the pre-build init steps to go to /workspace @@ -12,22 +10,16 @@ tasks: echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no init: | make sign-off + pip install uv + uv venv + echo source .venv/bin/activate >> ~/.bashrc + source ~/.bashrc + make install-test-requirements plugin=kedro-datasets command: | pre-commit install --install-hooks clear - -github: - prebuilds: - # enable for the master/default branch (defaults to true) - master: true - # enable for all branches in this repo (defaults to false) - branches: true - # enable for pull requests coming from this repo (defaults to true) - pullRequests: true - # enable for pull requests coming from forks (defaults to false) - pullRequestsFromForks: true - # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) - addComment: false - # add a "Review in Gitpod" button to pull requests (defaults to false) - addBadge: true + - name: system + init: | + sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make + sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils diff --git a/Makefile b/Makefile index 22bc17816..324332972 100644 --- a/Makefile +++ b/Makefile @@ -5,13 +5,6 @@ package: rm -Rf dist;\ python -m build -pypi: - python -m pip install twine -U - python -m twine upload $(plugin)/dist/* - -install: package - cd $(plugin) && pip install -U dist/*.whl - install-pip-setuptools: python -m pip install -U pip setuptools wheel @@ -25,46 +18,14 @@ mypy: test: cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile -# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite -dataset-tests: dataset-doctests - cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow - cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov - -extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark -extra_pytest_args= -dataset-doctest%: - if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \ - echo "make: *** No rule to make target \`${@}\`. 
Stop."; \ - exit 2; \ - fi; \ - \ - # The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples. - cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \ - --ignore kedro_datasets/pandas/gbq_dataset.py \ - --ignore kedro_datasets/partitions/partitioned_dataset.py \ - --ignore kedro_datasets/redis/redis_dataset.py \ - --ignore kedro_datasets/snowflake/snowpark_dataset.py \ - --ignore kedro_datasets/spark/spark_hive_dataset.py \ - --ignore kedro_datasets/spark/spark_jdbc_dataset.py \ - $(extra_pytest_arg${*}) - -test-sequential: - cd $(plugin) && pytest tests --cov-config pyproject.toml - e2e-tests: cd $(plugin) && behave secret-scan: trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt . -clean: - cd $(plugin);\ - rm -rf build dist pip-wheel-metadata .pytest_cache;\ - find . -regex ".*/__pycache__" -exec rm -rf {} +;\ - find . -regex ".*\.egg-info" -exec rm -rf {} +;\ - install-test-requirements: - cd $(plugin) && pip install ".[test]" + cd $(plugin) && uv pip install ".[test]" install-pre-commit: pre-commit install --install-hooks @@ -79,12 +40,12 @@ sign-off: echo '--in-place "$$1"' >> .git/hooks/commit-msg chmod +x .git/hooks/commit-msg +## kedro-datasets specific + # kedro-datasets related only test-no-spark: dataset-doctests-no-spark cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile -test-no-spark-sequential: dataset-doctests-no-spark - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only: @@ -93,3 +54,26 @@ test-snowflake-only: check-datasets-docs: cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck + +# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite +dataset-tests: dataset-doctests + cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow + cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov + +extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark +extra_pytest_args= +dataset-doctest%: + if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \ + echo "make: *** No rule to make target \`${@}\`. Stop."; \ + exit 2; \ + fi; \ + \ + # The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples. 
+ cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \ + --ignore kedro_datasets/pandas/gbq_dataset.py \ + --ignore kedro_datasets/partitions/partitioned_dataset.py \ + --ignore kedro_datasets/redis/redis_dataset.py \ + --ignore kedro_datasets/snowflake/snowpark_dataset.py \ + --ignore kedro_datasets/spark/spark_hive_dataset.py \ + --ignore kedro_datasets/spark/spark_jdbc_dataset.py \ + $(extra_pytest_arg${*}) From 5e8787a91af8dc6e06e6824e398d58fe590bc6e3 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:54:55 +0100 Subject: [PATCH 03/15] chore(telemetry): Change logging level to `DEBUG` for most messages (#823) * Change logging level to debug Signed-off-by: Ankita Katiyar * Tests Signed-off-by: Ankita Katiyar * Add back message tests Signed-off-by: Ankita Katiyar * Change logging level for pytest Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro-telemetry/kedro_telemetry/__init__.py | 2 +- kedro-telemetry/kedro_telemetry/plugin.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/kedro-telemetry/kedro_telemetry/__init__.py b/kedro-telemetry/kedro_telemetry/__init__.py index 3194e2c22..6f4dcc75f 100644 --- a/kedro-telemetry/kedro_telemetry/__init__.py +++ b/kedro-telemetry/kedro_telemetry/__init__.py @@ -4,4 +4,4 @@ import logging -logging.getLogger(__name__).setLevel(logging.INFO) +logging.getLogger(__name__).setLevel(logging.DEBUG) diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index a342f6e66..fc9692db0 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -78,7 +78,7 @@ def _get_or_create_uuid() -> str: return new_uuid except Exception as e: - logging.error(f"Failed to retrieve UUID: {e}") + logging.debug(f"Failed to retrieve UUID: {e}") return "" @@ -104,7 +104,7 @@ def _get_or_create_project_id(pyproject_path: Path) -> str | None: file.write(toml_string) return project_id except KeyError: - logging.error( + logging.debug( f"Failed to retrieve project id or save project id: " f"{str(pyproject_path)} does not contain a [tool.kedro] section" ) @@ -148,7 +148,7 @@ def _generate_new_uuid(full_path: str) -> str: return new_uuid except Exception as e: - logging.error(f"Failed to create UUID: {e}") + logging.debug(f"Failed to create UUID: {e}") return "" @@ -200,13 +200,15 @@ def after_command_run(self): @hook_impl def after_context_created(self, context): - """Hook implementation to send project statistics data to Heap""" + """Hook implementation to read metadata""" self._consent = _check_for_telemetry_consent(context.project_path) self._project_path = context.project_path @hook_impl def after_catalog_created(self, catalog): + """Hook implementation to send project statistics data to Heap""" + if self._consent is False: return @@ -246,7 +248,7 @@ def _send_telemetry_heap_event(self, event_name: str): ) self._sent = True except Exception as exc: - logger.warning( + logger.debug( "Something went wrong in hook implementation to send command run data to Heap. " "Exception: %s", exc, @@ -333,13 +335,13 @@ def _send_heap_event( url=HEAP_ENDPOINT, headers=HEAP_HEADERS, data=json.dumps(data), timeout=10 ) if resp.status_code != 200: # noqa: PLR2004 - logger.warning( + logger.debug( "Failed to send data to Heap. 
Response code returned: %s, Response reason: %s", resp.status_code, resp.reason, ) except requests.exceptions.RequestException as exc: - logger.warning( + logger.debug( "Failed to send data to Heap. Exception of type '%s' was raised.", type(exc).__name__, ) From b766f45f8660a8f49d7f4eba30f5e556b89fc8c6 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:41:50 +0100 Subject: [PATCH 04/15] fix(telemetry): Avoid eager loading the whole KedroCLI for masking (#824) * First pass: only load command that was called Signed-off-by: Ankita Katiyar * Try to make it work with help and invalud commands Signed-off-by: Ankita Katiyar * Fix tests Signed-off-by: Ankita Katiyar * Fix masking tests Signed-off-by: Ankita Katiyar * Remove unused function and add argument name and type Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro-telemetry/kedro_telemetry/masking.py | 27 +++--- kedro-telemetry/kedro_telemetry/plugin.py | 7 +- kedro-telemetry/tests/test_masking.py | 99 ++++++---------------- 3 files changed, 38 insertions(+), 95 deletions(-) diff --git a/kedro-telemetry/kedro_telemetry/masking.py b/kedro-telemetry/kedro_telemetry/masking.py index ea432f455..9308dc771 100644 --- a/kedro-telemetry/kedro_telemetry/masking.py +++ b/kedro-telemetry/kedro_telemetry/masking.py @@ -1,7 +1,7 @@ """Module containing command masking functionality.""" from __future__ import annotations -from typing import Any, Iterator +from typing import Any import click @@ -81,16 +81,19 @@ def _get_cli_structure( return output -def _mask_kedro_cli( - cli_struct: dict[str | None, Any], command_args: list[str] -) -> list[str]: +def _mask_kedro_cli(cli: click.CommandCollection, command_args: list[str]) -> list[str]: """Takes a dynamic vocabulary (based on `KedroCLI`) and returns a masked CLI input""" output = [] - - # Preserve the initial part of the command until parameters sections begin arg_index = 0 - current_CLI = cli_struct.get("kedro", {}) + cmd = command_args[0] if command_args else "" + if cmd in {"--help", "--version", "-h", "-v", ""}: + return command_args + click_cmd = cli.get_command(ctx=None, cmd_name=cmd) # type: ignore + if click_cmd is None: + return [MASK] + + current_CLI = _get_cli_structure(click_cmd) while ( arg_index < len(command_args) and not command_args[arg_index].startswith("-") @@ -116,13 +119,3 @@ def _mask_kedro_cli( output.append(MASK) return output - - -def _recursive_items(dictionary: dict[Any, Any]) -> Iterator[Any]: - for key, value in dictionary.items(): - if isinstance(value, dict): - yield key - yield from _recursive_items(value) - else: - yield key - yield value diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index fc9692db0..dd8b46877 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -26,7 +26,7 @@ from kedro.pipeline import Pipeline from kedro_telemetry import __version__ as TELEMETRY_VERSION -from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli +from kedro_telemetry.masking import _mask_kedro_cli HEAP_APPID_PROD = "2388822444" HEAP_ENDPOINT = "https://heapanalytics.com/api/track" @@ -176,10 +176,7 @@ def before_command_run( # get KedroCLI and its structure from actual project root cli = KedroCLI(project_path=project_path if project_path else Path.cwd()) - cli_struct = _get_cli_structure(cli_obj=cli, get_help=False) - masked_command_args = _mask_kedro_cli( - 
cli_struct=cli_struct, command_args=command_args - ) + masked_command_args = _mask_kedro_cli(cli, command_args=command_args) self._user_uuid = _get_or_create_uuid() diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 778e85a54..59ee8ace0 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -13,7 +13,6 @@ MASK, _get_cli_structure, _mask_kedro_cli, - _recursive_items, ) REPO_NAME = "cli_tools_dummy_project" @@ -152,94 +151,48 @@ def test_get_cli_structure_help(self, mocker, fake_metadata): assert v.startswith("Usage: [OPTIONS]") @pytest.mark.parametrize( - "input_dict, expected_output_count", + "input_command_args, expected_masked_args", [ - ({}, 0), - ({"a": "foo"}, 2), - ({"a": {"b": "bar"}, "c": {"baz"}}, 5), + ([], []), ( - { - "a": {"b": "bar"}, - "c": None, - "d": {"e": "fizz"}, - "f": {"g": {"h": "buzz"}}, - }, - 12, + ["info"], + ["info"], ), - ], - ) - def test_recursive_items(self, input_dict, expected_output_count): - assert expected_output_count == len( - list(_recursive_items(dictionary=input_dict)) - ) - - def test_recursive_items_empty(self): - assert len(list(_recursive_items({}))) == 0 - - @pytest.mark.parametrize( - "input_cli_structure, input_command_args, expected_masked_args", - [ - ({}, [], []), ( - {"kedro": {"command_a": None, "command_b": None}}, - ["command_a"], - ["command_a"], + ["run", "--pipeline=data_science"], + ["run", "--pipeline", MASK], ), ( - { - "kedro": { - "command_a": {"--param1": None, "--param2": None}, - "command_b": None, - } - }, - ["command_a", "--param1=foo"], - ["command_a", "--param1", MASK], + ["catalog", "list"], + ["catalog", "list"], ), ( - { - "kedro": { - "command_a": {"--param1": None, "--param2": None}, - "command_b": None, - } - }, - ["command_a", "--param1= foo"], - ["command_a", "--param1", MASK], + ["pipeline", "create", "mypipeline"], + ["pipeline", "create", MASK], ), ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "-p", "bar"], - ["command_a", "-p", MASK], + ["run", "-p", "bar"], + ["run", "-p", MASK], ), ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "-xyz", "bar"], - ["command_a", MASK, MASK], - ), - ( - { - "kedro": { - "command_a": {"--param": None, "-p": None}, - "command_b": None, - } - }, - ["command_a", "should", "be", "seen", "only"], - ["command_a", MASK, MASK, MASK, MASK], + ["run", "--params=hello=4", "--pipeline=my_pipeline"], + ["run", "--params", MASK, "--pipeline", MASK], ), ], ) def test_mask_kedro_cli( - self, input_cli_structure, input_command_args, expected_masked_args + self, input_command_args, expected_masked_args, fake_metadata, mocker ): + Module = namedtuple("Module", ["cli"]) + mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) + mocker.patch( + "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata + ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) + kedro_cli = KedroCLI(fake_metadata.project_path) assert expected_masked_args == _mask_kedro_cli( - cli_struct=input_cli_structure, command_args=input_command_args + cli=kedro_cli, command_args=input_command_args ) From 3da39f7d0c2d1401006ca48fab6bb1a4e2a90bf8 Mon Sep 17 00:00:00 2001 From: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:54:35 +0100 Subject: [PATCH 05/15] fix(datasets): Cleanup of dependency (#822) 
* Update dependencies --------- Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> --- kedro-datasets/pyproject.toml | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index e1180f2f2..895caebfd 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -33,8 +33,9 @@ api = ["kedro-datasets[api-apidataset]"] biosequence-biosequencedataset = ["biopython~=1.73"] biosequence = ["kedro-datasets[biosequence-biosequencedataset]"] +dask-csvdataset = ["dask[dataframe]>=2021.10"] dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"] -dask = ["kedro-datasets[dask-parquetdataset]"] +dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"] databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base,hdfs-base,s3fs-base]"] databricks = ["kedro-datasets[databricks-managedtabledataset]"] @@ -92,7 +93,7 @@ pandas-featherdataset = ["kedro-datasets[pandas-base]"] pandas-gbqtabledataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"] pandas-gbqquerydataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"] pandas-genericdataset = ["kedro-datasets[pandas-base]"] -pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables~=3.6"] +pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables>=3.6"] pandas-jsondataset = ["kedro-datasets[pandas-base]"] pandas-parquetdataset = ["kedro-datasets[pandas-base]", "pyarrow>=6.0"] pandas-sqltabledataset = ["kedro-datasets[pandas-base]", "SQLAlchemy>=1.4, <3.0"] @@ -127,9 +128,12 @@ plotly = ["kedro-datasets[plotly-htmldataset,plotly-jsondataset,plotly-plotlydat polars-csvdataset = ["kedro-datasets[polars-base]"] polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"] -polars-genericdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"] polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2"] -polars = ["kedro-datasets[polars-genericdataset]"] +polars = [ + """kedro-datasets[polars-csvdataset,\ + polars-eagerpolarsdataset,\ + polars-lazypolarsdataset]""" +] redis-pickledataset = ["redis~=4.1"] redis = ["kedro-datasets[redis-pickledataset]"] @@ -140,8 +144,15 @@ snowflake = ["kedro-datasets[snowflake-snowparktabledataset]"] spark-deltatabledataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base,delta-base]"] spark-sparkdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] spark-sparkhivedataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -spark-sparkjdbcdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -spark = ["kedro-datasets[spark-deltatabledataset]"] +spark-sparkjdbcdataset = ["kedro-datasets[spark-base]"] +spark-sparkstreamingdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] +spark = [ + """kedro-datasets[spark-deltatabledataset,\ + spark-sparkdataset,\ + spark-sparkhivedataset,\ + spark-sparkjdbcdataset,\ + spark-sparkstreamingdataset]""" +] svmlight-svmlightdataset = ["scikit-learn>=1.0.2", "scipy~=1.7.3"] svmlight = ["kedro-datasets[svmlight-svmlightdataset]"] @@ -209,7 +220,7 @@ test = [ "ibis-framework[duckdb,examples]", "import-linter[toml]==1.2.6", "ipython>=7.31.1, <8.0", - "Jinja2<3.1.0", + "Jinja2<3.2.0", "joblib>=0.14", "jupyterlab>=3.0", "jupyter~=1.0", @@ -248,8 +259,7 @@ 
test = [ "scipy>=1.7.3", "packaging", "SQLAlchemy>=1.2", - "tables>=3.8.0; platform_system == 'Windows'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593 - "tables~=3.6; platform_system != 'Windows'", + "tables>=3.6", "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", "triad>=0.6.7, <1.0", From 552b973a256c0f4a9f96e36feb70f4fc15fb371b Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Tue, 10 Sep 2024 17:49:25 +0100 Subject: [PATCH 06/15] fix(telemetry): Set default identity value (#830) * Set default identity Signed-off-by: Elena Khaustova * Updated test_before_command_run_anonymous Signed-off-by: Elena Khaustova * Fix lint Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova --- kedro-telemetry/kedro_telemetry/plugin.py | 6 +++--- kedro-telemetry/tests/test_plugin.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index dd8b46877..136201c3d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -49,6 +49,7 @@ CONFIG_FILENAME = "telemetry.toml" PYPROJECT_CONFIG_NAME = "pyproject.toml" UNDEFINED_PACKAGE_NAME = "undefined_package_name" +MISSING_USER_IDENTITY = "missing_user_identity" logger = logging.getLogger(__name__) @@ -240,7 +241,7 @@ def _send_telemetry_heap_event(self, event_name: str): try: _send_heap_event( event_name=event_name, - identity=self._user_uuid, + identity=self._user_uuid if self._user_uuid else MISSING_USER_IDENTITY, properties=self._event_properties, ) self._sent = True @@ -323,9 +324,8 @@ def _send_heap_event( "event": event_name, "timestamp": datetime.now().strftime(TIMESTAMP_FORMAT), "properties": properties or {}, + "identity": identity, } - if identity: - data["identity"] = identity try: resp = requests.post( diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index c100dd9e1..048f17561 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -16,6 +16,7 @@ from kedro_telemetry.plugin import ( _SKIP_TELEMETRY_ENV_VAR_KEYS, KNOWN_CI_ENV_VAR_KEYS, + MISSING_USER_IDENTITY, KedroTelemetryHook, _check_for_telemetry_consent, _is_known_ci_env, @@ -347,7 +348,7 @@ def test_before_command_run_anonymous(self, mocker, fake_metadata): expected_calls = [ mocker.call( event_name="CLI command", - identity="", + identity=MISSING_USER_IDENTITY, properties=generic_properties, ), ] From 4b75db75c0c7b11c299500e95811f24c23c503bd Mon Sep 17 00:00:00 2001 From: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:52:09 +0100 Subject: [PATCH 07/15] feat(telemetry): add integration tests (#771) Add integration tests for telemetry ensure that only one event is sent to Heap Analytics per Kedro command --------- Signed-off-by: Dmitry Sorokin Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Signed-off-by: Nok Co-authored-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: Nok --- .../integration/dummy-project/.gitignore | 151 ++++++++++++++++++ .../integration/dummy-project/conf/README.md | 20 +++ .../dummy-project/conf/base/catalog.yml | 0 .../dummy-project/conf/base/parameters.yml 
| 0 .../dummy-project/conf/local/.gitkeep | 0 .../integration/dummy-project/pyproject.toml | 34 ++++ .../dummy-project/requirements.txt | 9 ++ .../src/dummy_project/__init__.py | 4 + .../src/dummy_project/__main__.py | 47 ++++++ .../src/dummy_project/pipeline_registry.py | 16 ++ .../src/dummy_project/pipelines/__init__.py | 0 .../pipelines/data_processing/__init__.py | 3 + .../pipelines/data_processing/pipeline.py | 9 ++ .../src/dummy_project/settings.py | 46 ++++++ .../tests/integration/test_telemetry.py | 60 +++++++ 15 files changed, 399 insertions(+) create mode 100644 kedro-telemetry/tests/integration/dummy-project/.gitignore create mode 100644 kedro-telemetry/tests/integration/dummy-project/conf/README.md create mode 100644 kedro-telemetry/tests/integration/dummy-project/conf/base/catalog.yml create mode 100644 kedro-telemetry/tests/integration/dummy-project/conf/base/parameters.yml create mode 100644 kedro-telemetry/tests/integration/dummy-project/conf/local/.gitkeep create mode 100644 kedro-telemetry/tests/integration/dummy-project/pyproject.toml create mode 100644 kedro-telemetry/tests/integration/dummy-project/requirements.txt create mode 100644 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py create mode 100644 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py create mode 100644 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py create mode 100644 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/__init__.py create mode 100755 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py create mode 100755 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py create mode 100644 kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py create mode 100644 kedro-telemetry/tests/integration/test_telemetry.py diff --git a/kedro-telemetry/tests/integration/dummy-project/.gitignore b/kedro-telemetry/tests/integration/dummy-project/.gitignore new file mode 100644 index 000000000..51a4444c6 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/.gitignore @@ -0,0 +1,151 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** + +# except their sub-folders +!data/**/ + +# also keep all .gitkeep files +!.gitkeep + +# keep also the example dataset +!data/01_raw/* + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/README.md b/kedro-telemetry/tests/integration/dummy-project/conf/README.md new file mode 100644 index 000000000..b135e80c2 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/conf/README.md @@ -0,0 +1,20 @@ +# What is this for? + +This folder should be used to store configuration files used by Kedro or by separate tools. + +This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the section titled **Instructions**. + +## Local configuration + +The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). + +> *Note:* Please do not check in any local configuration to version control. + +## Base configuration + +The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. + +WARNING: Please do not put access credentials in the base configuration folder. + +## Find out more +You can find out more about configuration from the [user guide documentation](https://docs.kedro.org/en/stable/configuration/configuration_basics.html). 
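
For reference, the base/local layering described in this README is resolved by the project's configuration loader; the dummy project's settings.py later in this patch enables `OmegaConfigLoader` with `base_env: base` and `default_run_env: local`. A minimal sketch of that resolution, assuming the standard conf/ layout shown above:

    from kedro.config import OmegaConfigLoader

    # Assumes the conf/base + conf/local layout described in the README;
    # keys defined in conf/local take precedence over the same keys in conf/base.
    loader = OmegaConfigLoader(
        conf_source="conf",
        base_env="base",
        default_run_env="local",
    )

    # Returns the merged catalog configuration: base entries first,
    # then local entries layered on top.
    catalog_config = loader["catalog"]
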
diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/base/catalog.yml b/kedro-telemetry/tests/integration/dummy-project/conf/base/catalog.yml new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/base/parameters.yml b/kedro-telemetry/tests/integration/dummy-project/conf/base/parameters.yml new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/conf/local/.gitkeep b/kedro-telemetry/tests/integration/dummy-project/conf/local/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/pyproject.toml b/kedro-telemetry/tests/integration/dummy-project/pyproject.toml new file mode 100644 index 000000000..ec07f1b99 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = [ "setuptools",] +build-backend = "setuptools.build_meta" + +[project] +name = "dummy_project" +readme = "README.md" +dynamic = [ "dependencies", "version",] + +[project.scripts] +dummy-project = "dummy_project.__main__:main" + +[tool.kedro] +package_name = "dummy_project" +project_name = "dummy_project" +kedro_init_version = "0.19.6" +tools = [ "None",] +example_pipeline = "True" +source_dir = "src" + +[project.entry-points."kedro.hooks"] + +[tool.setuptools.dynamic.dependencies] +file = "requirements.txt" + +[tool.setuptools.dynamic.version] +attr = "dummy_project.__version__" + +[tool.setuptools.packages.find] +where = [ "src",] +namespaces = false + +[tool.kedro_telemetry] +project_id = "KEDRO_TELEMETRY_TEST" diff --git a/kedro-telemetry/tests/integration/dummy-project/requirements.txt b/kedro-telemetry/tests/integration/dummy-project/requirements.txt new file mode 100644 index 000000000..1c5f8e218 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/requirements.txt @@ -0,0 +1,9 @@ +ipython>=8.10 +jupyterlab>=3.0 +kedro~=0.19.6 +kedro-datasets[pandas-csvdataset, pandas-exceldataset, pandas-parquetdataset]>=3.0; python_version >= "3.9" +kedro-datasets[pandas.CSVDataset, pandas.ExcelDataset, pandas.ParquetDataset]>=1.0; python_version < "3.9" +kedro-telemetry>=0.3.1 +kedro-viz>=6.7.0 +notebook +scikit-learn~=1.0 diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py new file mode 100644 index 000000000..11d70edcb --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__init__.py @@ -0,0 +1,4 @@ +"""dummy_project +""" + +__version__ = "0.1" diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py new file mode 100644 index 000000000..56cef4b26 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/__main__.py @@ -0,0 +1,47 @@ +"""dummy_project file for ensuring the package is executable +as `dummy-project` and `python -m dummy_project` +""" +import importlib +from pathlib import Path + +from kedro.framework.cli.utils import KedroCliError, load_entry_points +from kedro.framework.project import configure_project + + +def _find_run_command(package_name): + try: + project_cli = importlib.import_module(f"{package_name}.cli") + # fail gracefully if cli.py does not exist + except ModuleNotFoundError as exc: + if f"{package_name}.cli" not in str(exc): + raise + plugins = 
load_entry_points("project") + run = _find_run_command_in_plugins(plugins) if plugins else None + if run: + # use run command from installed plugin if it exists + return run + # use run command from the framework project + from kedro.framework.cli.project import run + + return run + # fail badly if cli.py exists, but has no `cli` in it + if not hasattr(project_cli, "cli"): + raise KedroCliError(f"Cannot load commands from {package_name}.cli") + return project_cli.run + + +def _find_run_command_in_plugins(plugins): + for group in plugins: + if "run" in group.commands: + return group.commands["run"] + + +def main(*args, **kwargs): + package_name = Path(__file__).parent.name + configure_project(package_name) + run = _find_run_command(package_name) + run(*args, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py new file mode 100644 index 000000000..2d4272e31 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipeline_registry.py @@ -0,0 +1,16 @@ +"""Project pipelines.""" +from typing import Dict + +from kedro.framework.project import find_pipelines +from kedro.pipeline import Pipeline + + +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. + """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py new file mode 100755 index 000000000..ddfdfdea5 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/__init__.py @@ -0,0 +1,3 @@ +"""Complete Data Processing pipeline for the spaceflights tutorial""" + +from .pipeline import create_pipeline # NOQA diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py new file mode 100755 index 000000000..6fddf34d9 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/pipelines/data_processing/pipeline.py @@ -0,0 +1,9 @@ +from kedro.pipeline import Pipeline, node, pipeline + + +def one(): + return "dummy" + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(one, [], "dummy_output")]) diff --git a/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py new file mode 100644 index 000000000..fc96f56e7 --- /dev/null +++ b/kedro-telemetry/tests/integration/dummy-project/src/dummy_project/settings.py @@ -0,0 +1,46 @@ +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. 
For further information, including these default values, see +https://docs.kedro.org/en/stable/kedro_project_setup/settings.html.""" + +# Instantiated project hooks. +# For example, after creating a hooks.py and defining a ProjectHooks class there, do +# from dummy_project.hooks import ProjectHooks + +# Hooks are executed in a Last-In-First-Out (LIFO) order. +# HOOKS = (ProjectHooks(),) + +# Installed plugins for which to disable hook auto-registration. +# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) + +# Class that manages storing KedroSession data. +# from kedro.framework.session.store import BaseSessionStore +# SESSION_STORE_CLASS = BaseSessionStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. +# SESSION_STORE_ARGS = { +# "path": "./sessions" +# } + +# Directory that holds configuration. +# CONF_SOURCE = "conf" + +# Class that manages how configuration is loaded. +from kedro.config import OmegaConfigLoader # noqa: E402 + +CONFIG_LOADER_CLASS = OmegaConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. +CONFIG_LOADER_ARGS = { + "base_env": "base", + "default_run_env": "local", + # "config_patterns": { + # "spark" : ["spark*/"], + # "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + # } +} + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext +# CONTEXT_CLASS = KedroContext + +# Class that manages the Data Catalog. +# from kedro.io import DataCatalog +# DATA_CATALOG_CLASS = DataCatalog diff --git a/kedro-telemetry/tests/integration/test_telemetry.py b/kedro-telemetry/tests/integration/test_telemetry.py new file mode 100644 index 000000000..33d84547a --- /dev/null +++ b/kedro-telemetry/tests/integration/test_telemetry.py @@ -0,0 +1,60 @@ +from pathlib import Path + +from click.testing import CliRunner +from kedro.framework.cli.cli import KedroCLI +from kedro.framework.session import KedroSession +from kedro.framework.startup import bootstrap_project +from pytest import fixture + + +@fixture +def dummy_project_path(): + return Path(__file__).parent / "dummy-project" + + +class TestKedroTelemetryHookIntegration: + def test_telemetry_sent_once_with_kedro_run(self, mocker, dummy_project_path): + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + "kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + kedro_cli = KedroCLI(dummy_project_path) + CliRunner().invoke(kedro_cli, ["run"]) + mocked_heap_call.assert_called_once() + + def test_telemetry_sent_once_with_other_kedro_command( + self, mocker, dummy_project_path + ): + from kedro_telemetry.plugin import telemetry_hook + + telemetry_hook.consent = None + telemetry_hook._sent = False + telemetry_hook.event_properties = None + telemetry_hook.project_path = None + + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + "kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + kedro_cli = KedroCLI(dummy_project_path) + CliRunner().invoke(kedro_cli, ["run"]) + mocked_heap_call.assert_called_once() + + def test_telemetry_sent_once_with_session_run(self, mocker, dummy_project_path): + from kedro_telemetry.plugin import telemetry_hook + + telemetry_hook.consent = None + telemetry_hook._sent = False + telemetry_hook.event_properties = None + telemetry_hook.project_path = None + + mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event") + mocker.patch( + 
"kedro_telemetry.plugin._check_for_telemetry_consent", return_value=True + ) + # Mock because all tests are sharing the kedro_telemetry.plugin.telemetry_hook object + + bootstrap_project(dummy_project_path) + with KedroSession.create(project_path=dummy_project_path) as session: + session.run() + mocked_heap_call.assert_called_once() From 0f0c59e6012a0a0b6dbc7738ddafde684f1c3873 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:01:44 +0100 Subject: [PATCH 08/15] ci(airflow): Replace type hints with CatalogProtocol (#845) * Replace type checking with CatalogProtocol Signed-off-by: Ankita Katiyar * Add try except for import Signed-off-by: Ankita Katiyar * Ignore bandit warnings Signed-off-by: Ankita Katiyar * Remove any Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro-airflow/kedro_airflow/grouping.py | 13 ++++++++++--- .../pytorch/pytorch_dataset.py | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/kedro-airflow/kedro_airflow/grouping.py b/kedro-airflow/kedro_airflow/grouping.py index 26c931f8d..3890804ae 100644 --- a/kedro-airflow/kedro_airflow/grouping.py +++ b/kedro-airflow/kedro_airflow/grouping.py @@ -4,6 +4,11 @@ from kedro.pipeline.node import Node from kedro.pipeline.pipeline import Pipeline +try: + from kedro.io import CatalogProtocol +except ImportError: # pragma: no cover + pass + def _is_memory_dataset(catalog, dataset_name: str) -> bool: if dataset_name not in catalog: @@ -11,7 +16,9 @@ def _is_memory_dataset(catalog, dataset_name: str) -> bool: return False -def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]: +def get_memory_datasets( + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline +) -> set[str]: """Gather all datasets in the pipeline that are of type MemoryDataset, excluding 'parameters'.""" return { dataset_name @@ -21,7 +28,7 @@ def get_memory_datasets(catalog: DataCatalog, pipeline: Pipeline) -> set[str]: def create_adjacency_list( - catalog: DataCatalog, pipeline: Pipeline + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline ) -> tuple[dict[str, set], dict[str, set]]: """ Builds adjacency list (adj_list) to search connected components - undirected graph, @@ -48,7 +55,7 @@ def create_adjacency_list( def group_memory_nodes( - catalog: DataCatalog, pipeline: Pipeline + catalog: CatalogProtocol | DataCatalog, pipeline: Pipeline ) -> tuple[dict[str, list[Node]], dict[str, list[str]]]: """ Nodes that are connected through MemoryDatasets cannot be distributed across diff --git a/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py b/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py index 914fdb6b7..15c10a93d 100644 --- a/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py +++ b/kedro-datasets/kedro_datasets_experimental/pytorch/pytorch_dataset.py @@ -96,11 +96,11 @@ def _describe(self) -> dict[str, Any]: def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) - return torch.load(load_path, **self._fs_open_args_load) + return torch.load(load_path, **self._fs_open_args_load) #nosec: B614 def _save(self, data: torch.nn.Module) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - torch.save(data.state_dict(), save_path, **self._fs_open_args_save) + torch.save(data.state_dict(), save_path, **self._fs_open_args_save) #nosec: B614 self._invalidate_cache() From 06cf752c6dee4f4eb96797eb5507f2ca5f8b5a2f Mon 
Sep 17 00:00:00 2001 From: Janick <128086870+janickspirig@users.noreply.github.com> Date: Thu, 26 Sep 2024 06:04:39 -0300 Subject: [PATCH 09/15] fix(datasets): Replace deprecated GBQDataset load/save funcs (#826) * fix:use_pd_gbq Signed-off-by: janick_spirig * fix:updated_readme Signed-off-by: janick_spirig * fix:updated_args Signed-off-by: janick_spirig * fix:updated_args_in_test Signed-off-by: janick_spirig * fix:linting Signed-off-by: janick_spirig --------- Signed-off-by: janick_spirig Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> --- kedro-datasets/RELEASE.md | 2 + .../kedro_datasets/pandas/gbq_dataset.py | 20 +++++---- .../tests/pandas/test_gbq_dataset.py | 43 +++++++++++++------ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 109d4e6fe..30af78fe9 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -14,12 +14,14 @@ ## Bug fixes and other changes * Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. +* Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib ## Breaking Changes ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [Brandon Meek](https://github.com/bpmeek) * [yury-fedotov](https://github.com/yury-fedotov) +* [janickspirig](https://github.com/janickspirig) # Release 4.1.0 diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index f16f828f7..e7ed3c2df 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -10,6 +10,7 @@ import fsspec import pandas as pd +import pandas_gbq as pd_gbq from google.cloud import bigquery from google.cloud.exceptions import NotFound from google.oauth2.credentials import Credentials @@ -138,16 +139,17 @@ def _describe(self) -> dict[str, Any]: def _load(self) -> pd.DataFrame: sql = f"select * from {self._dataset}.{self._table_name}" # nosec - self._load_args.setdefault("query", sql) - return pd.read_gbq( + self._load_args.setdefault("query_or_table", sql) + return pd_gbq.read_gbq( project_id=self._project_id, credentials=self._credentials, **self._load_args, ) def _save(self, data: pd.DataFrame) -> None: - data.to_gbq( - f"{self._dataset}.{self._table_name}", + pd_gbq.to_gbq( + dataframe=data, + destination_table=f"{self._dataset}.{self._table_name}", project_id=self._project_id, credentials=self._credentials, **self._save_args, @@ -176,7 +178,7 @@ def _validate_location(self): class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]): """``GBQQueryDataset`` loads data from a provided SQL query from Google - BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq`` + BigQuery. It uses ``pandas_gbq.read_gbq`` which itself uses ``pandas-gbq`` internally to read from BigQuery table. Therefore it supports all allowed pandas options on ``read_gbq``. 
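
For reference, the call pattern this patch migrates to: pandas has deprecated `pd.read_gbq` and `DataFrame.to_gbq` in favour of the `pandas-gbq` library, whose reader takes its input via the `query_or_table` argument and whose writer is a module-level `to_gbq` function. A minimal sketch, with placeholder project and table names:

    import pandas_gbq

    # Reading: the first argument accepts either a SQL query or a
    # fully-qualified "dataset.table" name, hence `query_or_table`.
    df = pandas_gbq.read_gbq(
        query_or_table="select * from my_dataset.my_table",
        project_id="my-project",
        credentials=None,  # or a google-auth credentials object
    )

    # Writing: the DataFrame is passed explicitly instead of using the
    # deprecated `DataFrame.to_gbq` method.
    pandas_gbq.to_gbq(
        dataframe=df,
        destination_table="my_dataset.my_table",
        project_id="my-project",
        if_exists="replace",  # placeholder choice for the example
    )
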
@@ -274,7 +276,7 @@ def __init__( # noqa: PLR0913 # load sql query from arg or from file if sql: - self._load_args["query"] = sql + self._load_args["query_or_table"] = sql self._filepath = None else: # filesystem for loading sql file @@ -291,7 +293,7 @@ def __init__( # noqa: PLR0913 def _describe(self) -> dict[str, Any]: load_args = copy.deepcopy(self._load_args) desc = {} - desc["sql"] = str(load_args.pop("query", None)) + desc["sql"] = str(load_args.pop("query_or_table", None)) desc["filepath"] = str(self._filepath) desc["load_args"] = str(load_args) @@ -303,9 +305,9 @@ def _load(self) -> pd.DataFrame: if self._filepath: load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol) with self._fs.open(load_path, mode="r") as fs_file: - load_args["query"] = fs_file.read() + load_args["query_or_table"] = fs_file.read() - return pd.read_gbq( + return pd_gbq.read_gbq( project_id=self._project_id, credentials=self._credentials, **load_args, diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index a797708ae..63095b74e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -95,7 +95,9 @@ def test_save_extra_params(self, gbq_dataset, save_args): def test_load_missing_file(self, gbq_dataset, mocker): """Check the error when trying to load missing table.""" pattern = r"Failed while loading data from data set GBQTableDataset\(.*\)" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.side_effect = ValueError with pytest.raises(DatasetError, match=pattern): gbq_dataset.load() @@ -133,30 +135,43 @@ def test_save_load_data(self, gbq_dataset, dummy_dataframe, mocker): """Test saving and reloading the data set.""" sql = f"select * from {DATASET}.{TABLE_NAME}" table_id = f"{DATASET}.{TABLE_NAME}" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_to_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd_gbq.to_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe mocked_df = mocker.Mock() gbq_dataset.save(mocked_df) loaded_data = gbq_dataset.load() - mocked_df.to_gbq.assert_called_once_with( - table_id, project_id=PROJECT, credentials=None, progress_bar=False + mocked_to_gbq.assert_called_once_with( + dataframe=mocked_df, + destination_table=table_id, + project_id=PROJECT, + credentials=None, + progress_bar=False, ) mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=sql + project_id=PROJECT, credentials=None, query_or_table=sql ) assert_frame_equal(dummy_dataframe, loaded_data) - @pytest.mark.parametrize("load_args", [{"query": "Select 1"}], indirect=True) + @pytest.mark.parametrize( + "load_args", [{"query_or_table": "Select 1"}], indirect=True + ) def test_read_gbq_with_query(self, gbq_dataset, dummy_dataframe, mocker, load_args): """Test loading data set with query in the argument.""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=load_args["query"] + project_id=PROJECT, + 
credentials=None, + query_or_table=load_args["query_or_table"], ) assert_frame_equal(dummy_dataframe, loaded_data) @@ -239,26 +254,30 @@ def test_credentials_propagation(self, mocker): def test_load(self, mocker, gbq_sql_dataset, dummy_dataframe): """Test `load` method invocation""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_sql_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=SQL_QUERY + project_id=PROJECT, credentials=None, query_or_table=SQL_QUERY ) assert_frame_equal(dummy_dataframe, loaded_data) def test_load_query_file(self, mocker, gbq_sql_file_dataset, dummy_dataframe): """Test `load` method invocation using a file as input query""" - mocked_read_gbq = mocker.patch("kedro_datasets.pandas.gbq_dataset.pd.read_gbq") + mocked_read_gbq = mocker.patch( + "kedro_datasets.pandas.gbq_dataset.pd_gbq.read_gbq" + ) mocked_read_gbq.return_value = dummy_dataframe loaded_data = gbq_sql_file_dataset.load() mocked_read_gbq.assert_called_once_with( - project_id=PROJECT, credentials=None, query=SQL_QUERY + project_id=PROJECT, credentials=None, query_or_table=SQL_QUERY ) assert_frame_equal(dummy_dataframe, loaded_data) From 0a3a38101cc61a967c9fbc053701a7b762e6bb65 Mon Sep 17 00:00:00 2001 From: gitgud5000 <17186026+gitgud5000@users.noreply.github.com> Date: Thu, 26 Sep 2024 11:27:32 +0200 Subject: [PATCH 10/15] fix(datasets): Use `put()` and `get()` instead of `copy` in `TensorFlowModelDataset`'s `_save` and `_load` methods. (#844) * fix(datasets): Use `put()` and `get()` instead of `copy` in `TensorFlowModelDataset`'s `_save` and `_load` methods. Signed-off-by: gitgud5000 <17186026+gitgud5000@users.noreply.github.com> * chore: Added comments of changes in `RELEASE.md` Signed-off-by: gitgud5000 <17186026+gitgud5000@users.noreply.github.com> --------- Signed-off-by: gitgud5000 <17186026+gitgud5000@users.noreply.github.com> Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> --- kedro-datasets/RELEASE.md | 2 ++ .../kedro_datasets/tensorflow/tensorflow_model_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 30af78fe9..bd2e9ac15 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -14,6 +14,7 @@ ## Bug fixes and other changes * Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. +* Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`. 
* Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib ## Breaking Changes @@ -21,6 +22,7 @@ Many thanks to the following Kedroids for contributing PRs to this release: * [Brandon Meek](https://github.com/bpmeek) * [yury-fedotov](https://github.com/yury-fedotov) +* [gitgud5000](https://github.com/gitgud5000) * [janickspirig](https://github.com/janickspirig) diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 5c5dc27a1..e4492161d 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -144,7 +144,7 @@ def _load(self) -> tf.keras.Model: # We assume .keras path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE) # noqa: PLW2901 - self._fs.copy(load_path, path) + self._fs.get(load_path, path) # Pass the local temporary directory/file path to keras.load_model device_name = self._load_args.pop("tf_device", None) @@ -169,7 +169,7 @@ def _save(self, data: tf.keras.Model) -> None: # Use fsspec to take from local tempfile directory/file and # put in ArbitraryFileSystem - self._fs.copy(path, save_path) + self._fs.put(path, save_path) def _exists(self) -> bool: try: From 970862baefd424d08432dea40981067ee82ea7e6 Mon Sep 17 00:00:00 2001 From: Galen Seilis Date: Fri, 27 Sep 2024 07:11:26 -0700 Subject: [PATCH 11/15] feat: ProphetModelDataset (#720) * Added dataset and tests for Facebook's Prophet model * Removed in-path example. * Added prophet as optional dependency in toml * Added prophet as testing dependency * Update docstring to have a doctest example with example data Signed-off-by: galenseilis * Try without assert * Move ProphetModelDataset to experimental datasets Signed-off-by: Merel Theisen * Move test fixtures + fix imports Signed-off-by: Merel Theisen * Fix prophet docstring and tests Signed-off-by: Merel Theisen * Fix lint Signed-off-by: Merel Theisen * Fix docs Signed-off-by: Merel Theisen * Fix docs Signed-off-by: Merel Theisen * Bandit Signed-off-by: Ankita Katiyar * Add nosec instead Signed-off-by: Ankita Katiyar * Add to release notes Signed-off-by: Ankita Katiyar --------- Signed-off-by: galenseilis Signed-off-by: Merel Theisen Signed-off-by: Ankita Katiyar Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Co-authored-by: Merel Theisen Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Co-authored-by: Ankita Katiyar --- kedro-datasets/RELEASE.md | 3 + .../api/kedro_datasets_experimental.rst | 1 + kedro-datasets/docs/source/conf.py | 2 + .../prophet/__init__.py | 11 + .../prophet/prophet_dataset.py | 121 ++++++++++ .../tests/conftest.py | 34 +++ .../tests/prophet/__init__.py | 0 .../tests/prophet/test_prophet_dataset.py | 209 ++++++++++++++++++ kedro-datasets/pyproject.toml | 5 +- 9 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 kedro-datasets/kedro_datasets_experimental/prophet/__init__.py create mode 100644 kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py create mode 100644 kedro-datasets/kedro_datasets_experimental/tests/conftest.py create mode 100644 kedro-datasets/kedro_datasets_experimental/tests/prophet/__init__.py create mode 100644 kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 
bd2e9ac15..52ba9fe51 100755
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -5,6 +5,8 @@
 | Type | Description | Location |
 |-------------------------------------|-----------------------------------------------------------|-----------------------------------------|
 | `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` |
+| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` |
+

 * Added the following new core datasets:

@@ -24,6 +26,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
 * [yury-fedotov](https://github.com/yury-fedotov)
 * [gitgud5000](https://github.com/gitgud5000)
 * [janickspirig](https://github.com/janickspirig)
+* [Galen Seilis](https://github.com/galenseilis)

 # Release 4.1.0
diff --git a/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst b/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst
index 0eb76c739..219510954 100644
--- a/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst
+++ b/kedro-datasets/docs/source/api/kedro_datasets_experimental.rst
@@ -16,5 +16,6 @@ kedro_datasets_experimental
     langchain.ChatOpenAIDataset
     langchain.OpenAIEmbeddingsDataset
     netcdf.NetCDFDataset
+    prophet.ProphetModelDataset
     pytorch.PyTorchDataset
     rioxarray.GeoTIFFDataset
diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py
index 70c6be3ae..09524612a 100644
--- a/kedro-datasets/docs/source/conf.py
+++ b/kedro-datasets/docs/source/conf.py
@@ -140,6 +140,8 @@
         "xarray.core.dataset.Dataset",
         "xarray.core.dataarray.DataArray",
         "torch.nn.modules.module.Module",
+        "prophet.forecaster.Prophet",
+        "Prophet",
     ),
     "py:data": (
         "typing.Any",
diff --git a/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py b/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py
new file mode 100644
index 000000000..93cd66d99
--- /dev/null
+++ b/kedro-datasets/kedro_datasets_experimental/prophet/__init__.py
@@ -0,0 +1,11 @@
+"""``ProphetModelDataset`` implementation to load/save a Prophet model from/to a JSON file."""
+
+from typing import Any
+
+import lazy_loader as lazy
+
+ProphetModelDataset: Any
+
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__, submod_attrs={"prophet_dataset": ["ProphetModelDataset"]}
+)
diff --git a/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py b/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py
new file mode 100644
index 000000000..ca2cd1e75
--- /dev/null
+++ b/kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from typing import Any
+
+from kedro.io.core import Version, get_filepath_str
+from prophet import Prophet
+from prophet.serialize import model_from_json, model_to_json
+
+from kedro_datasets.json import JSONDataset
+
+
+class ProphetModelDataset(JSONDataset):
+    """``ProphetModelDataset`` loads/saves Facebook Prophet models to a JSON file using an
+    underlying filesystem (e.g., local, S3, GCS). It uses Prophet's built-in
+    serialization to handle the JSON file.
+
+    Example usage for the
+    `YAML API `_:
+
+    .. code-block:: yaml
+
+        model:
+          type: kedro_datasets_experimental.prophet.ProphetModelDataset
+          filepath: gcs://your_bucket/model.json
+          fs_args:
+            project: my-project
+          credentials: my_gcp_credentials
+
+    Example usage for the
+    `Python API `_:
+
+    ..
code-block:: pycon
+
+        >>> from kedro_datasets_experimental.prophet import ProphetModelDataset
+        >>> from prophet import Prophet
+        >>> import pandas as pd
+        >>>
+        >>> df = pd.DataFrame({
+        ...     "ds": ["2024-01-01", "2024-01-02", "2024-01-03"],
+        ...     "y": [100, 200, 300]
+        ... })
+        >>>
+        >>> model = Prophet()
+        >>> model.fit(df)
+        >>> dataset = ProphetModelDataset(filepath="path/to/model.json")
+        >>> dataset.save(model)
+        >>> reloaded_model = dataset.load()
+
+    """
+
+    def __init__(  # noqa: PLR0913
+        self,
+        *,
+        filepath: str,
+        save_args: dict[str, Any] | None = None,
+        version: Version | None = None,
+        credentials: dict[str, Any] | None = None,
+        fs_args: dict[str, Any] | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Creates a new instance of ``ProphetModelDataset`` pointing to a concrete JSON file
+        on a specific filesystem.
+
+        Args:
+            filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.
+                If prefix is not provided, `file` protocol (local filesystem) will be used.
+                The prefix should be any protocol supported by ``fsspec``.
+                Note: `http(s)` doesn't support versioning.
+            save_args: JSON options for saving JSON files (arguments passed
+                into ``json.dump``). Here you can find all available arguments:
+                https://docs.python.org/3/library/json.html
+                All defaults are preserved, but "indent", which is set to 2.
+            version: If specified, should be an instance of
+                ``kedro.io.core.Version``. If its ``load`` attribute is
+                None, the latest version will be loaded. If its ``save``
+                attribute is None, save version will be autogenerated.
+            credentials: Credentials required to get access to the underlying filesystem.
+                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
+            fs_args: Extra arguments to pass into underlying filesystem class constructor
+                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
+                to pass to the filesystem's `open` method through nested keys
+                `open_args_load` and `open_args_save`.
+                Here you can find all available arguments for `open`:
+                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
+            metadata: Any arbitrary metadata.
+                This is ignored by Kedro, but may be consumed by users or external plugins.
+        """
+        super().__init__(
+            filepath=filepath,
+            save_args=save_args,
+            version=version,
+            credentials=credentials,
+            fs_args=fs_args,
+            metadata=metadata,
+        )
+
+    def _load(self) -> Prophet:
+        """Loads a Prophet model from a JSON file.
+
+        Returns:
+            Prophet: A deserialized Prophet model.
+        """
+        load_path = get_filepath_str(self._get_load_path(), self._protocol)
+
+        with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
+            return model_from_json(fs_file.read())
+
+    def _save(self, data: Prophet) -> None:
+        """Saves a Prophet model to a JSON file.
+
+        Args:
+            data: The Prophet model instance to be serialized and saved.
+        """
+        save_path = get_filepath_str(self._get_save_path(), self._protocol)
+
+        with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
+            fs_file.write(model_to_json(data))
+
+        self._invalidate_cache()
diff --git a/kedro-datasets/kedro_datasets_experimental/tests/conftest.py b/kedro-datasets/kedro_datasets_experimental/tests/conftest.py
new file mode 100644
index 000000000..91d19f646
--- /dev/null
+++ b/kedro-datasets/kedro_datasets_experimental/tests/conftest.py
@@ -0,0 +1,34 @@
+"""
+This file contains the fixtures that are reusable by any tests within
+this directory.
You don't need to import the fixtures as pytest will +discover them automatically. More info here: +https://docs.pytest.org/en/latest/fixture.html +""" + +from kedro.io.core import generate_timestamp +from pytest import fixture + + +@fixture(params=[None]) +def load_version(request): + return request.param + + +@fixture(params=[None]) +def save_version(request): + return request.param or generate_timestamp() + + +@fixture(params=[None]) +def load_args(request): + return request.param + + +@fixture(params=[None]) +def save_args(request): + return request.param + + +@fixture(params=[None]) +def fs_args(request): + return request.param diff --git a/kedro-datasets/kedro_datasets_experimental/tests/prophet/__init__.py b/kedro-datasets/kedro_datasets_experimental/tests/prophet/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py b/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py new file mode 100644 index 000000000..88510a99b --- /dev/null +++ b/kedro-datasets/kedro_datasets_experimental/tests/prophet/test_prophet_dataset.py @@ -0,0 +1,209 @@ +from pathlib import Path, PurePosixPath + +import pandas as pd +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version +from prophet import Prophet +from s3fs.core import S3FileSystem + +from kedro_datasets_experimental.prophet import ProphetModelDataset + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "test_model.json").as_posix() + + +@pytest.fixture +def prophet_model_dataset(filepath_json, save_args, fs_args): + return ProphetModelDataset( + filepath=filepath_json, save_args=save_args, fs_args=fs_args + ) + + +@pytest.fixture +def versioned_prophet_model_dataset(filepath_json, load_version, save_version): + return ProphetModelDataset( + filepath=filepath_json, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_model(): + df = pd.DataFrame({"ds": ["2024-01-01", "2024-01-02", "2024-01-03"], "y": [100, 200, 300]}) + model = Prophet() + # Fit the model with dummy data + model.fit(df) + return model + + +class TestProphetModelDataset: + def test_save_and_load(self, prophet_model_dataset, dummy_model): + """Test saving and reloading the Prophet model.""" + prophet_model_dataset.save(dummy_model) + reloaded = prophet_model_dataset.load() + assert isinstance(reloaded, Prophet) + assert prophet_model_dataset._fs_open_args_load == {} + assert prophet_model_dataset._fs_open_args_save == {"mode": "w"} + + def test_exists(self, prophet_model_dataset, dummy_model): + """Test `exists` method invocation for both existing and + nonexistent dataset.""" + assert not prophet_model_dataset.exists() + prophet_model_dataset.save(dummy_model) + assert prophet_model_dataset.exists() + + @pytest.mark.parametrize("save_args", [{"k1": "v1", "indent": 4}], indirect=True) + def test_save_extra_params(self, prophet_model_dataset, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert prophet_model_dataset._save_args[key] == value + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, prophet_model_dataset, fs_args): + assert prophet_model_dataset._fs_open_args_load == 
fs_args["open_args_load"] + assert prophet_model_dataset._fs_open_args_save == { + "mode": "w" + } # default unchanged + + def test_load_missing_file(self, prophet_model_dataset): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set ProphetModelDataset\(.*\)" + with pytest.raises(DatasetError, match=pattern): + prophet_model_dataset.load() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/model.json", S3FileSystem), + ("file:///tmp/test_model.json", LocalFileSystem), + ("/tmp/test_model.json", LocalFileSystem), #nosec: B108 + ("gcs://bucket/model.json", GCSFileSystem), + ("https://example.com/model.json", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + dataset = ProphetModelDataset(filepath=filepath) + assert isinstance(dataset._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(dataset._filepath) == path + assert isinstance(dataset._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test_model.json" + dataset = ProphetModelDataset(filepath=filepath) + dataset.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestProphetModelDatasetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test_model.json" + ds = ProphetModelDataset(filepath=filepath) + ds_versioned = ProphetModelDataset( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "ProphetModelDataset" in str(ds_versioned) + assert "ProphetModelDataset" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + # Default save_args + assert "save_args={'indent': 2}" in str(ds) + assert "save_args={'indent': 2}" in str(ds_versioned) + + def test_save_and_load(self, versioned_prophet_model_dataset, dummy_model): + """Test that saved and reloaded data matches the original one for + the versioned dataset.""" + versioned_prophet_model_dataset.save(dummy_model) + reloaded = versioned_prophet_model_dataset.load() + assert isinstance(reloaded, Prophet) + + def test_no_versions(self, versioned_prophet_model_dataset): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for ProphetModelDataset\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_prophet_model_dataset.load() + + def test_exists(self, versioned_prophet_model_dataset, dummy_model): + """Test `exists` method invocation for versioned dataset.""" + assert not versioned_prophet_model_dataset.exists() + versioned_prophet_model_dataset.save(dummy_model) + assert versioned_prophet_model_dataset.exists() + + def test_prevent_overwrite(self, versioned_prophet_model_dataset, dummy_model): + """Check the error when attempting to override the dataset if the + corresponding json file for a given save version already exists.""" + versioned_prophet_model_dataset.save(dummy_model) + pattern = ( + r"Save path \'.+\' for ProphetModelDataset\(.+\) must " + r"not exist if versioning is enabled\." 
+        )
+        with pytest.raises(DatasetError, match=pattern):
+            versioned_prophet_model_dataset.save(dummy_model)
+
+    @pytest.mark.parametrize(
+        "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True
+    )
+    @pytest.mark.parametrize(
+        "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True
+    )
+    def test_save_version_warning(
+        self, versioned_prophet_model_dataset, load_version, save_version, dummy_model
+    ):
+        """Check the warning when saving to the path that differs from
+        the subsequent load path."""
+        pattern = (
+            f"Save version '{save_version}' did not match "
+            f"load version '{load_version}' for "
+            r"ProphetModelDataset\(.+\)"
+        )
+        with pytest.warns(UserWarning, match=pattern):
+            versioned_prophet_model_dataset.save(dummy_model)
+
+    def test_http_filesystem_no_versioning(self):
+        pattern = "Versioning is not supported for HTTP protocols."
+
+        with pytest.raises(DatasetError, match=pattern):
+            ProphetModelDataset(
+                filepath="https://example.com/model.json", version=Version(None, None)
+            )
+
+    def test_versioning_existing_dataset(
+        self, prophet_model_dataset, versioned_prophet_model_dataset, dummy_model
+    ):
+        """Check the error when attempting to save a versioned dataset on top of an
+        already existing (non-versioned) dataset."""
+        prophet_model_dataset.save(dummy_model)
+        assert prophet_model_dataset.exists()
+        assert (
+            prophet_model_dataset._filepath == versioned_prophet_model_dataset._filepath
+        )
+        pattern = (
+            f"(?=.*file with the same name already exists in the directory)"
+            f"(?=.*{versioned_prophet_model_dataset._filepath.parent.as_posix()})"
+        )
+        with pytest.raises(DatasetError, match=pattern):
+            versioned_prophet_model_dataset.save(dummy_model)
+
+        # Remove non-versioned dataset and try again
+        Path(prophet_model_dataset._filepath.as_posix()).unlink()
+        versioned_prophet_model_dataset.save(dummy_model)
+        assert versioned_prophet_model_dataset.exists()
diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml
index 895caebfd..da2c15b18 100644
--- a/kedro-datasets/pyproject.toml
+++ b/kedro-datasets/pyproject.toml
@@ -183,6 +183,8 @@ langchain = ["kedro-datasets[langchain-chatopenaidataset,langchain-openaiembeddi
 netcdf-netcdfdataset = ["h5netcdf>=1.2.0","netcdf4>=1.6.4","xarray>=2023.1.0"]
 netcdf = ["kedro-datasets[netcdf-netcdfdataset]"]

+prophet-dataset = ["prophet>=1.1.5"]
+prophet = ["kedro-datasets[prophet-dataset]"]

 pytorch-dataset = ["torch"]
 pytorch = ["kedro-datasets[pytorch-dataset]"]
@@ -290,7 +292,8 @@ experimental = [
     "netcdf4>=1.6.4",
     "xarray>=2023.1.0",
     "rioxarray",
-    "torch"
+    "torch",
+    "prophet>=1.1.5",
 ]

 # All requirements

From 07b877b74a718433fef1b370365cadb221346217 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>
Date: Mon, 30 Sep 2024 11:36:13 +0100
Subject: [PATCH 12/15] chore(datasets): Move experimental `pytorch` tests
 (#858)

Move experimental tests

Signed-off-by: Ankita Katiyar
---
 .../tests/pytorch}/__init__.py | 0
 .../tests}/pytorch/test_pytorch_dataset.py | 0
 .../tests/kedro_datasets_experimental/pytorch/__init__.py | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename kedro-datasets/{tests/kedro_datasets_experimental => kedro_datasets_experimental/tests/pytorch}/__init__.py (100%)
 rename kedro-datasets/{tests/kedro_datasets_experimental => kedro_datasets_experimental/tests}/pytorch/test_pytorch_dataset.py (100%)
 delete mode 100644 kedro-datasets/tests/kedro_datasets_experimental/pytorch/__init__.py

diff --git a/kedro-datasets/tests/kedro_datasets_experimental/__init__.py
b/kedro-datasets/kedro_datasets_experimental/tests/pytorch/__init__.py similarity index 100% rename from kedro-datasets/tests/kedro_datasets_experimental/__init__.py rename to kedro-datasets/kedro_datasets_experimental/tests/pytorch/__init__.py diff --git a/kedro-datasets/tests/kedro_datasets_experimental/pytorch/test_pytorch_dataset.py b/kedro-datasets/kedro_datasets_experimental/tests/pytorch/test_pytorch_dataset.py similarity index 100% rename from kedro-datasets/tests/kedro_datasets_experimental/pytorch/test_pytorch_dataset.py rename to kedro-datasets/kedro_datasets_experimental/tests/pytorch/test_pytorch_dataset.py diff --git a/kedro-datasets/tests/kedro_datasets_experimental/pytorch/__init__.py b/kedro-datasets/tests/kedro_datasets_experimental/pytorch/__init__.py deleted file mode 100644 index e69de29bb..000000000 From 3fd8e3369dd639117255bb8b9247024a7b64b759 Mon Sep 17 00:00:00 2001 From: "L. R. Couto" <57910428+lrcouto@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:53:31 -0300 Subject: [PATCH 13/15] feat: Release kedro-docker 0.6.1 (#859) * Release 0.6.1 Signed-off-by: Laura Couto * Add release notes Signed-off-by: Laura Couto --------- Signed-off-by: Laura Couto --- kedro-docker/RELEASE.md | 3 +++ kedro-docker/kedro_docker/__init__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index 9d0868b91..725ee0f96 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,5 +1,8 @@ # Upcoming Release +# Release 0.6.1 +* Unpinned pip version requirement + # Release 0.6.0 ## Major features and improvements * Added support for Python 3.12 diff --git a/kedro-docker/kedro_docker/__init__.py b/kedro-docker/kedro_docker/__init__.py index 3cb3da72d..95b77ddf9 100644 --- a/kedro-docker/kedro_docker/__init__.py +++ b/kedro-docker/kedro_docker/__init__.py @@ -1,3 +1,3 @@ """Kedro plugin for packaging a project with Docker.""" -__version__ = "0.6.0" +__version__ = "0.6.1" From ca881f1ded0cf08528321a3740e7ebbf57f57c0e Mon Sep 17 00:00:00 2001 From: "L. R. Couto" <57910428+lrcouto@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:17:23 -0300 Subject: [PATCH 14/15] feat: Release kedro-telemetry 0.6.1 (#860) * Release kedro-telemetry 0.6.1 Signed-off-by: Laura Couto * Add release notes Signed-off-by: Laura Couto * Add additional change on release notes. Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> --------- Signed-off-by: Laura Couto Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> --- kedro-telemetry/RELEASE.md | 5 +++++ kedro-telemetry/kedro_telemetry/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index db3fb8053..27b620e64 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,5 +1,10 @@ # Upcoming release +# Release 0.6.1 +* Changed Kedro CLI loading method to improve loading times. +* Changed logging level from error to debug for most logging messages. +* Set default value for the `identity` parameter, to prevent errors due to it being empty. + # Release 0.6.0 * Moved to an opt-out model for telemetry, enabling it by default without requiring prior consent. * Added `DO_NOT_TRACK` and `KEDRO_DISABLE_TELEMETRY` environment variables to skip telemetry. 
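A side note on the `DO_NOT_TRACK` and `KEDRO_DISABLE_TELEMETRY` variables mentioned in the release notes above: the opt-out convention boils down to checking the environment before sending anything. A hedged sketch of that kind of check, illustrative only and not kedro-telemetry's actual code, whose exact truthiness rules may differ:

    import os

    def telemetry_disabled() -> bool:
        # Treat any non-empty value of either variable as an opt-out request.
        return any(
            os.environ.get(var)
            for var in ("DO_NOT_TRACK", "KEDRO_DISABLE_TELEMETRY")
        )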
diff --git a/kedro-telemetry/kedro_telemetry/__init__.py b/kedro-telemetry/kedro_telemetry/__init__.py index 6f4dcc75f..39a11503d 100644 --- a/kedro-telemetry/kedro_telemetry/__init__.py +++ b/kedro-telemetry/kedro_telemetry/__init__.py @@ -1,6 +1,6 @@ """Kedro plugin for collecting Kedro usage data.""" -__version__ = "0.6.0" +__version__ = "0.6.1" import logging From 312bd9940aca3f2f492269bb43922c4e0671a4aa Mon Sep 17 00:00:00 2001 From: Mariusz Wojakowski Date: Wed, 2 Oct 2024 12:37:21 +0200 Subject: [PATCH 15/15] fix(datasets): fix incorrect `pandas` optional dependency (#864) Signed-off-by: Mariusz Wojakowski Signed-off-by: mariusz.wojakowski --- kedro-datasets/RELEASE.md | 2 ++ kedro-datasets/pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 52ba9fe51..95b7a9dca 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -18,6 +18,7 @@ * Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. * Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`. * Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib +* Fixed incorrect `pandas` optional dependency ## Breaking Changes ## Community contributions @@ -27,6 +28,7 @@ Many thanks to the following Kedroids for contributing PRs to this release: * [gitgud5000](https://github.com/gitgud5000) * [janickspirig](https://github.com/janickspirig) * [Galen Seilis](https://github.com/galenseilis) +* [Mariusz Wojakowski](https://github.com/mariusz89016) # Release 4.1.0 diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index da2c15b18..56e40fd4d 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -105,7 +105,7 @@ pandas = [ pandas-exceldataset,\ pandas-featherdataset,\ pandas-gbqquerydataset,\ - pandas-gbqtabledataset.\ + pandas-gbqtabledataset,\ pandas-genericdataset,\ pandas-hdfdataset,\ pandas-jsondataset,\
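The one-character fix above (a `.` where a `,` belonged) had fused two extras names inside the `pandas` meta-extra into a single invalid name, so `pip install "kedro-datasets[pandas]"` would not pull in those datasets' dependencies. A hedged sketch of a smoke test for this class of typo, assuming `kedro-datasets` is installed in the current environment (this is not part of the repo's test suite):

    import re
    from importlib.metadata import metadata, requires

    # Cross-check the extras the distribution declares against the extras its
    # own self-referential requirements (e.g. "kedro-datasets[...]; extra ==
    # 'pandas'") refer to.
    declared = set(metadata("kedro-datasets").get_all("Provides-Extra") or [])
    for req in requires("kedro-datasets") or []:
        match = re.match(r"kedro-datasets\[([^\]]+)\]", req)
        if match:
            for extra in match.group(1).split(","):
                assert extra.strip() in declared, f"undeclared extra: {extra!r}"

A fused name like `pandas-gbqtabledataset.pandas-genericdataset` fails the assertion, since it is never declared as an extra.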