Skip to content

Commit

Permalink
fix(datasets): default to DuckDB in in-memory mode (#897)
Browse files Browse the repository at this point in the history
* fix(datasets): default to DuckDB in in-memory mode

Signed-off-by: Deepyaman Datta <[email protected]>

* test(datasets): use `object()` sentinel as default

Signed-off-by: Deepyaman Datta <[email protected]>

* docs(datasets): add default database to RELEASE.md

Signed-off-by: Deepyaman Datta <[email protected]>

---------

Signed-off-by: Deepyaman Datta <[email protected]>
  • Loading branch information
deepyaman authored Oct 18, 2024
1 parent 44d8245 commit 677d261
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 21 deletions.
10 changes: 7 additions & 3 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@

- Added the following new core datasets:

| Type | Description | Location |
| ------------------- | ------------------------------------------------------------- | --------------------- |
| `ibis.TableDataset` | A dataset for loading and saving files using Ibis's backends. | `kedro_datasets.ibis` |
| Type | Description | Location |
| ------------------ | ------------------------------------------------------------- | --------------------- |
| `ibis.FileDataset` | A dataset for loading and saving files using Ibis's backends. | `kedro_datasets.ibis` |

## Bug fixes and other changes

- Changed Ibis datasets to connect to an in-memory DuckDB database if connection configuration is not provided.

# Release 5.0.0

Expand Down
14 changes: 8 additions & 6 deletions kedro-datasets/kedro_datasets/ibis/file_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ class FileDataset(AbstractVersionedDataset[ir.Table, ir.Table]):
"""

DEFAULT_CONNECTION_CONFIG: ClassVar[dict[str, Any]] = {
"backend": "duckdb",
"database": ":memory:",
}
DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {}
DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {}

Expand Down Expand Up @@ -107,6 +111,7 @@ def __init__( # noqa: PLR0913
Defaults to writing execution results to a Parquet file.
table_name: The name to use for the created table (on load).
connection: Configuration for connecting to an Ibis backend.
If not provided (or empty), connects to an in-memory DuckDB database.
load_args: Additional arguments passed to the Ibis backend's
`read_{file_format}` method.
save_args: Additional arguments passed to the Ibis backend's
Expand All @@ -120,7 +125,7 @@ def __init__( # noqa: PLR0913
"""
self._file_format = file_format
self._table_name = table_name
self._connection_config = connection
self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
self.metadata = metadata

super().__init__(
Expand Down Expand Up @@ -156,8 +161,7 @@ def hashable(value):
import ibis

config = deepcopy(self._connection_config)
backend_attr = config.pop("backend") if config else None
backend = getattr(ibis, str(backend_attr))
backend = getattr(ibis, config.pop("backend"))
cls._connections[key] = backend.connect(**config)

return cls._connections[key]
Expand All @@ -178,9 +182,7 @@ def _describe(self) -> dict[str, Any]:
"filepath": self._filepath,
"file_format": self._file_format,
"table_name": self._table_name,
"backend": self._connection_config.get("backend")
if self._connection_config
else None,
"backend": self._connection_config["backend"],
"load_args": self._load_args,
"save_args": self._save_args,
"version": self._version,
Expand Down
14 changes: 8 additions & 6 deletions kedro-datasets/kedro_datasets/ibis/table_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ class TableDataset(AbstractDataset[ir.Table, ir.Table]):
"""

DEFAULT_CONNECTION_CONFIG: ClassVar[dict[str, Any]] = {
"backend": "duckdb",
"database": ":memory:",
}
DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {}
DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {
"materialized": "view",
Expand Down Expand Up @@ -99,6 +103,7 @@ def __init__( # noqa: PLR0913
Args:
table_name: The name of the table or view to read or create.
connection: Configuration for connecting to an Ibis backend.
If not provided (or empty), connects to an in-memory DuckDB database.
load_args: Additional arguments passed to the Ibis backend's
`read_{file_format}` method.
save_args: Additional arguments passed to the Ibis backend's
Expand Down Expand Up @@ -126,7 +131,7 @@ def __init__( # noqa: PLR0913
self._filepath = filepath
self._file_format = file_format
self._table_name = table_name
self._connection_config = connection
self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
self.metadata = metadata

# Set load and save arguments, overwriting defaults if provided.
Expand Down Expand Up @@ -158,8 +163,7 @@ def hashable(value):
import ibis

config = deepcopy(self._connection_config)
backend_attr = config.pop("backend") if config else None
backend = getattr(ibis, str(backend_attr))
backend = getattr(ibis, config.pop("backend"))
cls._connections[key] = backend.connect(**config)

return cls._connections[key]
Expand All @@ -186,9 +190,7 @@ def _describe(self) -> dict[str, Any]:
"filepath": self._filepath,
"file_format": self._file_format,
"table_name": self._table_name,
"backend": self._connection_config.get("backend")
if self._connection_config
else None,
"backend": self._connection_config["backend"],
"load_args": self._load_args,
"save_args": self._save_args,
"materialized": self._materialized,
Expand Down
23 changes: 20 additions & 3 deletions kedro-datasets/tests/ibis/test_file_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from kedro_datasets.ibis import FileDataset

_SENTINEL = object()


@pytest.fixture
def filepath_csv(tmp_path):
Expand All @@ -21,9 +23,13 @@ def database(tmp_path):
return (tmp_path / "file.db").as_posix()


@pytest.fixture(params=[None])
@pytest.fixture(params=[_SENTINEL])
def connection_config(request, database):
return request.param or {"backend": "duckdb", "database": database}
return (
{"backend": "duckdb", "database": database}
if request.param is _SENTINEL # `None` is a valid value to test
else request.param
)


@pytest.fixture
Expand Down Expand Up @@ -103,12 +109,23 @@ def test_save_extra_params(
("query", (("driver", "ODBC Driver 17 for SQL Server"),)),
),
),
# https://github.com/kedro-org/kedro-plugins/pull/893#discussion_r1804632435
(
None,
(
("backend", "duckdb"),
("database", ":memory:"),
),
),
],
indirect=["connection_config"],
)
def test_connection_config(self, mocker, file_dataset, connection_config, key):
"""Test hashing of connection configurations, including the default used when none is given."""
mocker.patch(f"ibis.{connection_config['backend']}")
backend = (
connection_config["backend"] if connection_config is not None else "duckdb"
)
mocker.patch(f"ibis.{backend}")
file_dataset.load()
assert key in file_dataset._connections

Expand Down
23 changes: 20 additions & 3 deletions kedro-datasets/tests/ibis/test_table_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from kedro_datasets.ibis import TableDataset

_SENTINEL = object()


@pytest.fixture(scope="session")
def filepath_csv(tmp_path_factory):
Expand All @@ -19,9 +21,13 @@ def database(tmp_path):
return (tmp_path / "file.db").as_posix()


@pytest.fixture(params=[None])
@pytest.fixture(params=[_SENTINEL])
def connection_config(request, database):
return request.param or {"backend": "duckdb", "database": database}
return (
{"backend": "duckdb", "database": database}
if request.param is _SENTINEL # `None` is a valid value to test
else request.param
)


@pytest.fixture
Expand Down Expand Up @@ -122,11 +128,22 @@ def test_save_no_table_name(self, table_dataset_from_csv, dummy_table):
("query", (("driver", "ODBC Driver 17 for SQL Server"),)),
),
),
# https://github.com/kedro-org/kedro-plugins/pull/893#discussion_r1804632435
(
None,
(
("backend", "duckdb"),
("database", ":memory:"),
),
),
],
indirect=["connection_config"],
)
def test_connection_config(self, mocker, table_dataset, connection_config, key):
"""Test hashing of connection configurations, including the default used when none is given."""
mocker.patch(f"ibis.{connection_config['backend']}")
backend = (
connection_config["backend"] if connection_config is not None else "duckdb"
)
mocker.patch(f"ibis.{backend}")
table_dataset.load()
assert key in table_dataset._connections

0 comments on commit 677d261

Please sign in to comment.