Skip to content

Commit

Permalink
feat: Adding preview support to yaml datasets (#718)
Browse files Browse the repository at this point in the history
* Adding preview support to yaml datasets

Signed-off-by: Lukas Innig <[email protected]>

* added test

Signed-off-by: Lukas Innig <[email protected]>

* fix the test

Signed-off-by: Lukas Innig <[email protected]>

* formatting

Signed-off-by: Lukas Innig <[email protected]>

* Update pyspark

Signed-off-by: Ankita Katiyar <[email protected]>

* Add release notes

Signed-off-by: Ankita Katiyar <[email protected]>

---------

Signed-off-by: Lukas Innig <[email protected]>
Signed-off-by: Ankita Katiyar <[email protected]>
Co-authored-by: Ankita Katiyar <[email protected]>
Co-authored-by: Ankita Katiyar <[email protected]>
  • Loading branch information
3 people authored Jun 11, 2024
1 parent 6b213c2 commit 8c15f03
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 1 deletion.
6 changes: 6 additions & 0 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
| `langchain.ChatCohereDataset` | A dataset for loading a ChatCohere langchain model. | `kedro_datasets_experimental.langchain` |
| `langchain.OpenAIEmbeddingsDataset` | A dataset for loading an OpenAIEmbeddings langchain model. | `kedro_datasets_experimental.langchain` |
| `langchain.ChatOpenAIDataset` | A dataset for loading a ChatOpenAI langchain model. | `kedro_datasets_experimental.langchain` |
* Extended preview feature to `yaml.YAMLDataset`.

## Community contributions

Many thanks to the following Kedroids for contributing PRs to this release:
* [Lukas Innig](https://github.com/derluke)


# Release 3.0.1
Expand Down
15 changes: 15 additions & 0 deletions kedro-datasets/kedro_datasets/yaml/yaml_dataset.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""``YAMLDataset`` loads/saves data from/to a YAML file using an underlying
filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file.
"""

from __future__ import annotations

import json
from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any
Expand All @@ -17,6 +19,8 @@
get_protocol_and_path,
)

from kedro_datasets._typing import JSONPreview


class YAMLDataset(AbstractVersionedDataset[dict, dict]):
"""``YAMLDataset`` loads/saves data from/to a YAML file using an underlying
Expand Down Expand Up @@ -157,3 +161,14 @@ def _invalidate_cache(self) -> None:
"""Invalidate underlying filesystem caches."""
filepath = get_filepath_str(self._filepath, self._protocol)
self._fs.invalidate_cache(filepath)

def preview(self) -> JSONPreview:
    """Generate a JSON preview of the full contents of the YAML dataset.

    The data is obtained with ``self._load()`` and serialised with
    ``json.dumps``. No truncation is applied: the whole document is
    included in the preview.

    Values that the JSON encoder does not support natively are converted
    by a fallback instead of aborting the preview with a ``TypeError``:
    date/time objects become ISO 8601 strings, sets become lists and
    anything else is rendered with ``str``. Data made only of
    JSON-compatible types is serialised exactly as by a plain
    ``json.dumps(data)``.

    Returns:
        A JSON-formatted string of the loaded data, wrapped in
        ``JSONPreview``.

    Raises:
        TypeError: If a mapping key is not a ``str``, ``int``, ``float``,
            ``bool`` or ``None``; the fallback only applies to values.
    """
    import datetime

    def _json_fallback(value: Any) -> Any:
        # NOTE(review): presumably ``_load`` parses with a PyYAML loader,
        # which can yield dates/datetimes for timestamp scalars and sets
        # for ``!!set`` nodes -- confirm against ``_load``.
        if isinstance(value, (datetime.date, datetime.time)):
            return value.isoformat()
        if isinstance(value, (set, frozenset)):
            # Sets are unordered; sort by repr so the preview is stable
            # even when the members are not mutually comparable.
            return sorted(value, key=repr)
        return str(value)

    data = self._load()

    return JSONPreview(json.dumps(data, default=_json_fallback))
2 changes: 1 addition & 1 deletion kedro-datasets/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ test = [
"pyarrow>=7.0; python_version >= '3.11'", # Adding to avoid numpy build errors
"pyodbc~=5.0",
"pyproj~=3.0",
"pyspark>=3.0, <3.4; python_version < '3.11'",
"pyspark>=3.0; python_version < '3.11'",
"pyspark>=3.4; python_version >= '3.11'",
"pytest-cov~=3.0",
"pytest-mock>=1.7.1, <2.0",
Expand Down
21 changes: 21 additions & 0 deletions kedro-datasets/tests/yaml/test_yaml_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import inspect
import json
from pathlib import Path, PurePosixPath

import pandas as pd
import pytest
import yaml
from fsspec.implementations.http import HTTPFileSystem
from fsspec.implementations.local import LocalFileSystem
from gcsfs import GCSFileSystem
Expand Down Expand Up @@ -207,3 +210,21 @@ def test_versioning_existing_dataset(
Path(yaml_dataset._filepath.as_posix()).unlink()
versioned_yaml_dataset.save(dummy_data)
assert versioned_yaml_dataset.exists()

def test_preview(self, yaml_dataset, dummy_data):
    """Check that ``preview`` returns the saved data as a JSON string and
    that its return annotation is ``JSONPreview``."""
    yaml_dataset.save(dummy_data)
    actual_preview = yaml_dataset.preview()

    # Read the file back directly, bypassing the dataset's own load path,
    # to build the reference JSON string.
    load_path = yaml_dataset._get_load_path()
    with yaml_dataset._fs.open(load_path, mode="r") as handle:
        reference_json = json.dumps(yaml.safe_load(handle))

    assert (
        actual_preview == reference_json
    ), "The preview data does not match the expected data."

    # The annotation is compared as a string; this relies on the dataset
    # module using postponed evaluation of annotations.
    return_annotation = inspect.signature(yaml_dataset.preview).return_annotation
    assert return_annotation == "JSONPreview"

0 comments on commit 8c15f03

Please sign in to comment.