Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(datasets): Remove deprecation warning from Table Dataset #956

Merged
merged 19 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Improved the dependency management for Spark-based datasets by refactoring the Spark and Databricks utility functions used across the datasets.
- Added deprecation warning for `tracking.MetricsDataset` and `tracking.JSONDataset`.
- Moved `kedro-catalog` JSON schemas from Kedro core to `kedro-datasets`.
- Removed file handling using Ibis's backends from `ibis.TableDataset`. `ibis.FileDataset` will handle loading and saving files using Ibis's backends.
ravi-kumar-pilla marked this conversation as resolved.
Show resolved Hide resolved
ravi-kumar-pilla marked this conversation as resolved.
Show resolved Hide resolved

## Breaking Changes

Expand Down
44 changes: 6 additions & 38 deletions kedro-datasets/kedro_datasets/ibis/table_dataset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
"""Provide data loading and saving functionality for Ibis's backends."""
from __future__ import annotations

import warnings
from copy import deepcopy
from typing import TYPE_CHECKING, Any, ClassVar

import ibis.expr.types as ir
from kedro.io import AbstractDataset, DatasetError
from kedro.io import AbstractDataset

from kedro_datasets import KedroDeprecationWarning
from kedro_datasets._utils import ConnectionMixin

if TYPE_CHECKING:
Expand Down Expand Up @@ -76,16 +74,14 @@ class TableDataset(ConnectionMixin, AbstractDataset[ir.Table, ir.Table]):
def __init__( # noqa: PLR0913
self,
*,
filepath: str | None = None,
file_format: str | None = None,
table_name: str | None = None,
table_name: str,
database: str | None = None,
connection: dict[str, Any] | None = None,
load_args: dict[str, Any] | None = None,
save_args: dict[str, Any] | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
"""Creates a new ``TableDataset`` pointing to a table (or file).
"""Creates a new ``TableDataset`` pointing to a table.

``TableDataset`` connects to the Ibis backend object constructed
from the connection configuration. The `backend` key provided in
Expand All @@ -95,8 +91,7 @@ def __init__( # noqa: PLR0913
`ibis.duckdb.connect() <https://ibis-project.org/backends/duckdb\
#ibis.duckdb.connect>`_).

If ``table_name`` is given (and ``filepath`` isn't), the dataset
establishes a connection to the relevant table for the execution
The dataset establishes a connection to the relevant table for the execution
backend. Therefore, Ibis doesn't fetch data on load; all compute
is deferred until materialization, when the expression is saved.
In practice, this happens when another ``TableDataset`` instance
Expand All @@ -122,22 +117,7 @@ def __init__( # noqa: PLR0913
metadata: Any arbitrary metadata. This is ignored by Kedro,
but may be consumed by users or external plugins.
"""
if filepath is None and table_name is None:
raise DatasetError(
"Must provide at least one of `filepath` or `table_name`."
)

if filepath is not None or file_format is not None:
warnings.warn(
"Use 'FileDataset' to load and save files with an Ibis "
"backend; the functionality will be removed from 'Table"
"Dataset' in Kedro-Datasets 6.0.0",
KedroDeprecationWarning,
stacklevel=2,
)

self._filepath = filepath
self._file_format = file_format

self._table_name = table_name
self._database = database
self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
Expand Down Expand Up @@ -171,19 +151,9 @@ def connection(self) -> BaseBackend:
return self._connection

def load(self) -> ir.Table:
if self._filepath is not None:
if self._file_format is None:
raise NotImplementedError

reader = getattr(self.connection, f"read_{self._file_format}")
return reader(self._filepath, self._table_name, **self._load_args)
else:
return self.connection.table(self._table_name, **self._load_args)
return self.connection.table(self._table_name, **self._load_args)

def save(self, data: ir.Table) -> None:
if self._table_name is None:
raise DatasetError("Must provide `table_name` for materialization.")

writer = getattr(self.connection, f"create_{self._materialized}")
writer(self._table_name, data, **self._save_args)

Expand All @@ -193,8 +163,6 @@ def _describe(self) -> dict[str, Any]:
load_args.pop("database", None)
save_args.pop("database", None)
return {
"filepath": self._filepath,
"file_format": self._file_format,
"table_name": self._table_name,
"database": self._database,
"backend": self._connection_config["backend"],
Expand Down
34 changes: 6 additions & 28 deletions kedro-datasets/tests/ibis/test_table_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import duckdb
import ibis
import pytest
from kedro.io import DatasetError
from packaging.version import Version
from pandas.testing import assert_frame_equal

Expand Down Expand Up @@ -48,19 +47,8 @@ def table_dataset(database_name, connection_config, load_args, save_args):


@pytest.fixture
def table_dataset_from_csv(filepath_csv, connection_config, load_args, save_args):
return TableDataset(
filepath=filepath_csv,
file_format="csv",
connection=connection_config,
load_args=load_args,
save_args=save_args,
)


@pytest.fixture
def dummy_table(table_dataset_from_csv):
return table_dataset_from_csv.load()
def dummy_table():
return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


@pytest.fixture
Expand Down Expand Up @@ -112,10 +100,10 @@ def test_exists(self, table_dataset, dummy_table):
table_dataset.save(dummy_table)
assert table_dataset.exists()

@pytest.mark.parametrize("load_args", [{"filename": True}], indirect=True)
deepyaman marked this conversation as resolved.
Show resolved Hide resolved
def test_load_extra_params(self, table_dataset_from_csv, load_args):
"""Test overriding the default load arguments."""
assert "filename" in table_dataset_from_csv.load()
def test_load_extra_params(self):
"""Test overriding the load arguments."""
table_dataset = TableDataset(table_name="test", load_args={"database": "test"})
assert "database" in table_dataset._load_args
ravi-kumar-pilla marked this conversation as resolved.
Show resolved Hide resolved

@pytest.mark.parametrize("save_args", [{"materialized": "table"}], indirect=True)
def test_save_extra_params(self, table_dataset, save_args, dummy_table, database):
Expand Down Expand Up @@ -172,16 +160,6 @@ def test_database(
"test" in con.sql("SELECT * FROM duckdb_views").fetchnumpy()["schema_name"]
)

def test_no_filepath_or_table_name(connection_config):
pattern = r"Must provide at least one of `filepath` or `table_name`\."
with pytest.raises(DatasetError, match=pattern):
TableDataset(connection=connection_config)

def test_save_no_table_name(self, table_dataset_from_csv, dummy_table):
pattern = r"Must provide `table_name` for materialization\."
with pytest.raises(DatasetError, match=pattern):
table_dataset_from_csv.save(dummy_table)

@pytest.mark.parametrize(
("connection_config", "key"),
[
Expand Down
Loading