From 8c6b8b6836c1bd38758b3df8b7411af6de63188d Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 13:18:02 +0100 Subject: [PATCH 1/6] refactor (cli): Rewrite split_split CLI with Click. Rewrite the split_split command to use Click and the new API. --- src/stimulus/cli/main.py | 34 ++++++++++++++++ src/stimulus/cli/split_split.py | 69 ++++++++------------------------- 2 files changed, 50 insertions(+), 53 deletions(-) create mode 100644 src/stimulus/cli/main.py diff --git a/src/stimulus/cli/main.py b/src/stimulus/cli/main.py new file mode 100644 index 00000000..e9ea2fb6 --- /dev/null +++ b/src/stimulus/cli/main.py @@ -0,0 +1,34 @@ +"""Main entry point for stimulus-py cli.""" + +import click +from importlib_metadata import version + +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) +@click.version_option(version("stimulus-py"), "-v", "--version") +def cli() -> None: + """Stimulus is an open-science framework for data processing and model training.""" + +@cli.command() +@click.option( + "-y", + "--yaml", + type=click.Path(exists=True), + required=True, + help="The YAML config file that hold all transform - split - parameter info", +) +@click.option( + "-d", + "--out_dir", + type=click.Path(), + required=False, + default="./", + help="The output dir where all the YAMLs are written to. Output YAML will be called split-#[number].yaml transform-#[number].yaml. Default -> ./", +) +def split_split( + config_yaml: str, + out_dir_path: str, +) -> None: + """Split a YAML configuration file into multiple YAML files, each containing a unique split.""" + from stimulus.cli.split_split import split_split as split_split_func + + split_split_func(config_yaml, out_dir_path) diff --git a/src/stimulus/cli/split_split.py b/src/stimulus/cli/split_split.py index 63e265d9..29d03774 100755 --- a/src/stimulus/cli/split_split.py +++ b/src/stimulus/cli/split_split.py @@ -6,48 +6,18 @@ The resulting YAML files can be used as input configurations for the stimulus package. """ -import argparse +import logging from typing import Any import yaml -from stimulus.utils.yaml_data import ( - YamlConfigDict, - YamlSplitConfigDict, - check_yaml_schema, - dump_yaml_list_into_files, - generate_split_configs, -) - - -def get_args() -> argparse.Namespace: - """Get the arguments when using from the command line.""" - parser = argparse.ArgumentParser(description="") - parser.add_argument( - "-y", - "--yaml", - type=str, - required=True, - metavar="FILE", - help="The YAML config file that hold all transform - split - parameter info", - ) - parser.add_argument( - "-d", - "--out_dir", - type=str, - required=False, - nargs="?", - const="./", - default="./", - metavar="DIR", - # TODO: Change the output name - help="The output dir where all the YAMLs are written to. Output YAML will be called split-#[number].yaml transform-#[number].yaml. Default -> ./", - ) - - return parser.parse_args() - - -def main(config_yaml: str, out_dir_path: str) -> None: +from stimulus.data.interface import data_config_parser + + +logger = logging.getLogger(__name__) + + +def split_split(config_yaml: str, out_dir_path: str) -> None: """Reads a YAML config file and generates a file per unique split. This script reads a YAML with a defined structure and creates all the YAML files ready to be passed to @@ -64,23 +34,16 @@ def main(config_yaml: str, out_dir_path: str) -> None: with open(config_yaml) as conf_file: yaml_config = yaml.safe_load(conf_file) - yaml_config_dict: YamlConfigDict = YamlConfigDict(**yaml_config) - # check if the yaml schema is correct - # FIXME: isn't it redundant to check and already class with pydantic ? - check_yaml_schema(yaml_config_dict) + yaml_config_dict = data_config_parser.YamlConfigDict(**yaml_config) - # generate the yaml files per split - split_configs: list[YamlSplitConfigDict] = generate_split_configs(yaml_config_dict) - - # dump all the YAML configs into files - dump_yaml_list_into_files(split_configs, out_dir_path, "test_split") + logger.info("YAML config loaded successfully.") + # generate the yaml files per split + split_configs = data_config_parser.generate_split_configs(yaml_config_dict) -def run() -> None: - """Run the split_yaml CLI.""" - args = get_args() - main(args.yaml, args.out_dir) + logger.info("Splits generated successfully.") + # dump all the YAML configs into files + data_config_parser.dump_yaml_list_into_files(split_configs, out_dir_path, "test_split") -if __name__ == "__main__": - run() + logger.info("YAML files saved successfully.") From 12a828d7ec65dfa2ada320d084a2e88ffd129df2 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 14:51:37 +0100 Subject: [PATCH 2/6] test(cli): Change split_split tests to the new format --- src/stimulus/cli/main.py | 8 +-- src/stimulus/cli/split_split.py | 2 +- .../cli/__snapshots__/test_split_splits.ambr | 4 +- tests/cli/test_split_splits.py | 58 +++++++++++++++---- 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/stimulus/cli/main.py b/src/stimulus/cli/main.py index e9ea2fb6..1513cee4 100644 --- a/src/stimulus/cli/main.py +++ b/src/stimulus/cli/main.py @@ -11,14 +11,14 @@ def cli() -> None: @cli.command() @click.option( "-y", - "--yaml", + "--config-yaml", type=click.Path(exists=True), required=True, help="The YAML config file that hold all transform - split - parameter info", ) @click.option( "-d", - "--out_dir", + "--out-dir", type=click.Path(), required=False, default="./", @@ -26,9 +26,9 @@ def cli() -> None: ) def split_split( config_yaml: str, - out_dir_path: str, + out_dir: str, ) -> None: """Split a YAML configuration file into multiple YAML files, each containing a unique split.""" from stimulus.cli.split_split import split_split as split_split_func - split_split_func(config_yaml, out_dir_path) + split_split_func(config_yaml=config_yaml, out_dir_path=out_dir) diff --git a/src/stimulus/cli/split_split.py b/src/stimulus/cli/split_split.py index 29d03774..a45b6b76 100755 --- a/src/stimulus/cli/split_split.py +++ b/src/stimulus/cli/split_split.py @@ -34,7 +34,7 @@ def split_split(config_yaml: str, out_dir_path: str) -> None: with open(config_yaml) as conf_file: yaml_config = yaml.safe_load(conf_file) - yaml_config_dict = data_config_parser.YamlConfigDict(**yaml_config) + yaml_config_dict = data_config_parser.ConfigDict(**yaml_config) logger.info("YAML config loaded successfully.") diff --git a/tests/cli/__snapshots__/test_split_splits.ambr b/tests/cli/__snapshots__/test_split_splits.ambr index 7ab227de..2f3a68a0 100644 --- a/tests/cli/__snapshots__/test_split_splits.ambr +++ b/tests/cli/__snapshots__/test_split_splits.ambr @@ -1,6 +1,6 @@ # serializer version: 1 -# name: test_split_split[correct_yaml_path-None] +# name: test_split_split_main[correct_yaml_path-None] list([ - '42139ca7745259e09d1e56e24570d2c7', + '8bca0bebb576d5ce5bb9de5641f627e4', ]) # --- diff --git a/tests/cli/test_split_splits.py b/tests/cli/test_split_splits.py index 3dea16ee..fc9b2f05 100644 --- a/tests/cli/test_split_splits.py +++ b/tests/cli/test_split_splits.py @@ -3,24 +3,31 @@ import hashlib import os from pathlib import Path -from typing import Any, Callable +import tempfile +from typing import Any, Optional import pytest +from click.testing import CliRunner -from src.stimulus.cli import split_split +from stimulus.cli.main import cli +from stimulus.cli import split_split # Fixtures @pytest.fixture def correct_yaml_path() -> str: """Fixture that returns the path to a correct YAML file.""" - return "tests/test_data/titanic/titanic.yaml" + return str( + Path(__file__).parent.parent / "test_data" / "titanic" / "titanic.yaml", + ) @pytest.fixture def wrong_yaml_path() -> str: """Fixture that returns the path to a wrong YAML file.""" - return "tests/test_data/yaml_files/wrong_field_type.yaml" + return str( + Path(__file__).parent.parent / "test_data" / "yaml_files" / "wrong_field_type.yaml", + ) # Test cases @@ -32,25 +39,52 @@ def wrong_yaml_path() -> str: # Tests @pytest.mark.parametrize(("yaml_type", "error"), test_cases) -def test_split_split( - request: pytest.FixtureRequest, - snapshot: Callable[[], Any], +def test_split_split_main( yaml_type: str, - error: Exception | None, - tmp_path: Path, # Pytest tmp file system + error: Optional[Exception], + request: Any, + snapshot: Any ) -> None: """Tests the CLI command with correct and wrong YAML files.""" yaml_path = request.getfixturevalue(yaml_type) - tmpdir = str(tmp_path) + tmpdir = tempfile.gettempdir() if error: with pytest.raises(error): # type: ignore[call-overload] - split_split.main(yaml_path, tmpdir) + split_split.split_split(yaml_path, tmpdir) else: - split_split.main(yaml_path, tmpdir) # main() returns None, no need to assert + split_split.split_split(yaml_path, tmpdir) # split_split() returns None, no need to assert files = os.listdir(tmpdir) test_out = [f for f in files if f.startswith("test_")] + assert len(test_out) > 0, "No output files were generated" hashes = [] for f in test_out: with open(os.path.join(tmpdir, f)) as file: hashes.append(hashlib.md5(file.read().encode()).hexdigest()) # noqa: S324 assert sorted(hashes) == snapshot # sorted ensures that the order of the hashes does not matter + + +def test_cli_invocation( + correct_yaml_path: str +) -> None: + """Test the CLI invocation of split-split command. + Args: + config_yaml: Path to the YAML config file. + out_dir: Path to the output directory. + """ + runner = CliRunner() + with runner.isolated_filesystem(): + output_path = tempfile.gettempdir() + result = runner.invoke( + cli, + [ + "split-split", + "-y", + correct_yaml_path, + "-d", + output_path, + ], + ) + files = os.listdir(output_path) + test_out = [f for f in files if f.startswith("test_")] + assert result.exit_code == 0 + assert len(test_out) > 0, "No output files were generated" From ec3784b0e0cc7f58432674ec6ce3cf89947c2d9c Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 15:21:48 +0100 Subject: [PATCH 3/6] refactor(cli): Change split_transforms to the new format --- src/stimulus/cli/main.py | 32 +++++++++++++++++-- src/stimulus/cli/split_transforms.py | 47 +++------------------------- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/src/stimulus/cli/main.py b/src/stimulus/cli/main.py index 1513cee4..23b2ba62 100644 --- a/src/stimulus/cli/main.py +++ b/src/stimulus/cli/main.py @@ -11,7 +11,7 @@ def cli() -> None: @cli.command() @click.option( "-y", - "--config-yaml", + "--yaml", type=click.Path(exists=True), required=True, help="The YAML config file that hold all transform - split - parameter info", @@ -25,10 +25,36 @@ def cli() -> None: help="The output dir where all the YAMLs are written to. Output YAML will be called split-#[number].yaml transform-#[number].yaml. Default -> ./", ) def split_split( - config_yaml: str, + yaml: str, out_dir: str, ) -> None: """Split a YAML configuration file into multiple YAML files, each containing a unique split.""" from stimulus.cli.split_split import split_split as split_split_func - split_split_func(config_yaml=config_yaml, out_dir_path=out_dir) + split_split_func(config_yaml=yaml, out_dir_path=out_dir) + + +@cli.command() +@click.option( + "-j", + "--yaml", + type=click.Path(exists=True), + required=True, + help="The YAML config file that hold all the transform per split parameter info", +) +@click.option( + "-d", + "--out-dir", + type=click.Path(), + required=False, + default="./", + help="The output dir where all the YAMLs are written to. Output YAML will be called split_transform-#[number].yaml. Default -> ./", +) +def split_transforms( + yaml: str, + out_dir: str, +) -> None: + """Split a YAML configuration file into multiple YAML files, each containing a unique transform.""" + from stimulus.cli.split_transforms import split_transforms as split_transforms_func + + split_transforms_func(config_yaml=yaml, out_dir_path=out_dir) diff --git a/src/stimulus/cli/split_transforms.py b/src/stimulus/cli/split_transforms.py index 6d977460..9518b059 100755 --- a/src/stimulus/cli/split_transforms.py +++ b/src/stimulus/cli/split_transforms.py @@ -6,46 +6,14 @@ The resulting YAML files can be used as input configurations for the stimulus package. """ -import argparse from typing import Any import yaml -from stimulus.utils.yaml_data import ( - YamlSplitConfigDict, - YamlSplitTransformDict, - dump_yaml_list_into_files, - generate_split_transform_configs, -) +from stimulus.data.interface import data_config_parser -def get_args() -> argparse.Namespace: - """Get the arguments when using the command line.""" - parser = argparse.ArgumentParser(description="") - parser.add_argument( - "-j", - "--yaml", - type=str, - required=True, - metavar="FILE", - help="The YAML config file that hold all the transform per split parameter info", - ) - parser.add_argument( - "-d", - "--out-dir", - type=str, - required=False, - nargs="?", - const="./", - default="./", - metavar="DIR", - help="The output dir where all the YAMLs are written to. Output YAML will be called split_transform-#[number].yaml. Default -> ./", - ) - - return parser.parse_args() - - -def main(config_yaml: str, out_dir_path: str) -> None: +def split_transforms(config_yaml: str, out_dir_path: str) -> None: """Reads a YAML config and generates files for all split - transform possible combinations. This script reads a YAML with a defined structure and creates all the YAML files ready to be passed to the stimulus package. @@ -60,15 +28,10 @@ def main(config_yaml: str, out_dir_path: str) -> None: with open(config_yaml) as conf_file: yaml_config = yaml.safe_load(conf_file) - yaml_config_dict: YamlSplitConfigDict = YamlSplitConfigDict(**yaml_config) + yaml_config_dict = data_config_parser.SplitConfigDict(**yaml_config) # Generate the yaml files for each transform - split_transform_configs: list[YamlSplitTransformDict] = generate_split_transform_configs(yaml_config_dict) + split_transform_configs = data_config_parser.generate_split_transform_configs(yaml_config_dict) # Dump all the YAML configs into files - dump_yaml_list_into_files(split_transform_configs, out_dir_path, "test_transforms") - - -if __name__ == "__main__": - args = get_args() - main(args.yaml, args.out_dir) + data_config_parser.dump_yaml_list_into_files(split_transform_configs, out_dir_path, "test_transforms") From 201133b2eab42f01dad0fb8e14f90a8bcf63fea3 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 15:22:06 +0100 Subject: [PATCH 4/6] test(cli): Change split_transforms tests to the new format --- .../__snapshots__/test_split_transforms.ambr | 5 +- tests/cli/test_split_transforms.py | 46 ++++++++++++++++--- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/tests/cli/__snapshots__/test_split_transforms.ambr b/tests/cli/__snapshots__/test_split_transforms.ambr index 4280f3e6..2f4e42e8 100644 --- a/tests/cli/__snapshots__/test_split_transforms.ambr +++ b/tests/cli/__snapshots__/test_split_transforms.ambr @@ -1,7 +1,8 @@ # serializer version: 1 # name: test_split_transforms[correct_yaml_path-None] list([ - '26d921a1580fb16ef597e91f4defec5607c11a7e822590f39e9ca456cc29819b', - 'db0c550bfe6027c91981423afdc6115c9dcfba61409f961b98831db9e664fd57', + '3a4b035c71d6ef1a46cea25c318634f2331df8993876b71849de80f8c8d0db70', + '7b03cb09d862e4fb884dc0cf4da93ac7efe8068b72a4b086f532a93befb925f1', + 'd9013844e136dfeb3969eefb6fb9787e6bf6886c796c3bca5ed1e0dd3400dd29', ]) # --- diff --git a/tests/cli/test_split_transforms.py b/tests/cli/test_split_transforms.py index d554f879..cd9e19db 100644 --- a/tests/cli/test_split_transforms.py +++ b/tests/cli/test_split_transforms.py @@ -3,24 +3,31 @@ import hashlib import os from pathlib import Path +import tempfile from typing import Any, Callable import pytest +from click.testing import CliRunner -from src.stimulus.cli.split_transforms import main +from stimulus.cli.main import cli +from src.stimulus.cli.split_transforms import split_transforms # Fixtures @pytest.fixture def correct_yaml_path() -> str: - """Fixture that returns the path to a correct YAML file with one split only.""" - return "tests/test_data/titanic/titanic_unique_split.yaml" + """Fixture that returns the path to a correct YAML file.""" + return str( + Path(__file__).parent.parent / "test_data" / "titanic" / "titanic_unique_split.yaml", + ) @pytest.fixture def wrong_yaml_path() -> str: """Fixture that returns the path to a wrong YAML file.""" - return "tests/test_data/yaml_files/wrong_field_type.yaml" + return str( + Path(__file__).parent.parent / "test_data" / "yaml_files" / "wrong_field_type.yaml", + ) # Test cases @@ -41,10 +48,10 @@ def test_split_transforms( tmpdir = str(tmp_path) if error is not None: with pytest.raises(error): # type: ignore[call-overload] - main(yaml_path, tmpdir) + split_transforms(yaml_path, tmpdir) else: # convert tmpdir to str - main(yaml_path, tmpdir) + split_transforms(yaml_path, tmpdir) files = os.listdir(tmpdir) test_out = [f for f in files if f.startswith("test_")] hashes = [] @@ -52,3 +59,30 @@ def test_split_transforms( with open(os.path.join(tmpdir, f)) as file: hashes.append(hashlib.sha256(file.read().encode()).hexdigest()) assert sorted(hashes) == snapshot # Sorted ensures that the order of the hashes does not matter + + +def test_cli_invocation( + correct_yaml_path: str +) -> None: + """Test the CLI invocation of split-split command. + Args: + config_yaml: Path to the YAML config file. + out_dir: Path to the output directory. + """ + runner = CliRunner() + with runner.isolated_filesystem(): + output_path = tempfile.gettempdir() + result = runner.invoke( + cli, + [ + "split-transforms", + "-j", + correct_yaml_path, + "-d", + output_path, + ], + ) + files = os.listdir(output_path) + test_out = [f for f in files if f.startswith("test_")] + assert result.exit_code == 0 + assert len(test_out) > 0, "No output files were generated" From a149b0b93409bae65f4a0cca7e34942fbfa5bec6 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 15:22:31 +0100 Subject: [PATCH 5/6] test(cli): Use tmp_path in split_split test --- tests/cli/test_split_splits.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_split_splits.py b/tests/cli/test_split_splits.py index fc9b2f05..02bbb665 100644 --- a/tests/cli/test_split_splits.py +++ b/tests/cli/test_split_splits.py @@ -43,11 +43,12 @@ def test_split_split_main( yaml_type: str, error: Optional[Exception], request: Any, - snapshot: Any + snapshot: Any, + tmp_path: Path, ) -> None: """Tests the CLI command with correct and wrong YAML files.""" yaml_path = request.getfixturevalue(yaml_type) - tmpdir = tempfile.gettempdir() + tmpdir = str(tmp_path) if error: with pytest.raises(error): # type: ignore[call-overload] split_split.split_split(yaml_path, tmpdir) @@ -64,7 +65,7 @@ def test_split_split_main( def test_cli_invocation( - correct_yaml_path: str + correct_yaml_path: str, ) -> None: """Test the CLI invocation of split-split command. Args: From baaba1b326a377769e61113560d16a5c4479a962 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 25 Feb 2025 15:27:46 +0100 Subject: [PATCH 6/6] Make format --- src/stimulus/cli/main.py | 2 ++ src/stimulus/cli/split_split.py | 1 - tests/cli/test_split_splits.py | 5 +++-- tests/cli/test_split_transforms.py | 7 ++++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/stimulus/cli/main.py b/src/stimulus/cli/main.py index 23b2ba62..d9661809 100644 --- a/src/stimulus/cli/main.py +++ b/src/stimulus/cli/main.py @@ -3,11 +3,13 @@ import click from importlib_metadata import version + @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.version_option(version("stimulus-py"), "-v", "--version") def cli() -> None: """Stimulus is an open-science framework for data processing and model training.""" + @cli.command() @click.option( "-y", diff --git a/src/stimulus/cli/split_split.py b/src/stimulus/cli/split_split.py index a45b6b76..b1fee5fe 100755 --- a/src/stimulus/cli/split_split.py +++ b/src/stimulus/cli/split_split.py @@ -13,7 +13,6 @@ from stimulus.data.interface import data_config_parser - logger = logging.getLogger(__name__) diff --git a/tests/cli/test_split_splits.py b/tests/cli/test_split_splits.py index 02bbb665..05c8427b 100644 --- a/tests/cli/test_split_splits.py +++ b/tests/cli/test_split_splits.py @@ -2,15 +2,15 @@ import hashlib import os -from pathlib import Path import tempfile +from pathlib import Path from typing import Any, Optional import pytest from click.testing import CliRunner -from stimulus.cli.main import cli from stimulus.cli import split_split +from stimulus.cli.main import cli # Fixtures @@ -68,6 +68,7 @@ def test_cli_invocation( correct_yaml_path: str, ) -> None: """Test the CLI invocation of split-split command. + Args: config_yaml: Path to the YAML config file. out_dir: Path to the output directory. diff --git a/tests/cli/test_split_transforms.py b/tests/cli/test_split_transforms.py index cd9e19db..7fc3ad96 100644 --- a/tests/cli/test_split_transforms.py +++ b/tests/cli/test_split_transforms.py @@ -2,15 +2,15 @@ import hashlib import os -from pathlib import Path import tempfile +from pathlib import Path from typing import Any, Callable import pytest from click.testing import CliRunner -from stimulus.cli.main import cli from src.stimulus.cli.split_transforms import split_transforms +from stimulus.cli.main import cli # Fixtures @@ -62,9 +62,10 @@ def test_split_transforms( def test_cli_invocation( - correct_yaml_path: str + correct_yaml_path: str, ) -> None: """Test the CLI invocation of split-split command. + Args: config_yaml: Path to the YAML config file. out_dir: Path to the output directory.