From 2cccdaf538bec9da1d7a7737e4522babbd86fb0f Mon Sep 17 00:00:00 2001 From: Martin Hickey Date: Tue, 23 Jul 2024 15:36:27 +0100 Subject: [PATCH 1/2] Add additional unit tests Signed-off-by: Martin Hickey --- src/instructlab/sdg/utils/models.py | 8 +-- tests/conftest.py | 17 +++-- tests/taxonomy.py | 100 +++++++++++++++++++++++++++ tests/test_models.py | 48 +++++++++++++ tests/test_sample_populator_block.py | 2 +- tests/test_taxonomy.py | 35 ++++++++++ 6 files changed, 201 insertions(+), 9 deletions(-) create mode 100644 tests/taxonomy.py create mode 100644 tests/test_models.py create mode 100644 tests/test_taxonomy.py diff --git a/src/instructlab/sdg/utils/models.py b/src/instructlab/sdg/utils/models.py index 788b16e9..b6b03b00 100644 --- a/src/instructlab/sdg/utils/models.py +++ b/src/instructlab/sdg/utils/models.py @@ -19,10 +19,10 @@ } -def get_model_family(forced, model_path): - forced = MODEL_FAMILY_MAPPINGS.get(forced, forced) - if forced and forced.lower() not in MODEL_FAMILIES: - raise GenerateException("Unknown model family: %s" % forced) +def get_model_family(model_family, model_path): + model_family_retrieved = MODEL_FAMILY_MAPPINGS.get(model_family, model_family) + if model_family_retrieved and model_family_retrieved.lower() not in MODEL_FAMILIES: + raise GenerateException("Unknown model family: %s" % model_family_retrieved) # Try to guess the model family based on the model's filename guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower() diff --git a/tests/conftest.py b/tests/conftest.py index cb0c308a..d77fce68 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,9 @@ # First Party from instructlab.sdg.pipeline import PipelineContext +# Local +from .taxonomy import MockTaxonomy + def get_ctx(**kwargs) -> PipelineContext: kwargs.setdefault("client", mock.MagicMock()) @@ -39,10 +42,16 @@ def single_threaded_ctx() -> PipelineContext: @pytest.fixture -def threaded_ctx() -> PipelineContext: - return get_threaded_ctx() +def sample_dataset(): + return Dataset.from_list([{"foo": i} for i in range(10)]) @pytest.fixture -def sample_dataset(): - return Dataset.from_list([{"foo": i} for i in range(10)]) +def taxonomy_dir(tmp_path): + with MockTaxonomy(tmp_path) as taxonomy: + yield taxonomy + + +@pytest.fixture +def threaded_ctx() -> PipelineContext: + return get_threaded_ctx() diff --git a/tests/taxonomy.py b/tests/taxonomy.py new file mode 100644 index 00000000..cb00af71 --- /dev/null +++ b/tests/taxonomy.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +from pathlib import Path +from typing import List, Optional +import shutil + +# Third Party +import git + +TEST_VALID_COMPOSITIONAL_SKILL_YAML = """created_by: rafael-vasquez +version: 1 +seed_examples: +- answer: "Sure thing!" + context: "This is a valid YAML." + question: "Can you help me debug this failing unit test?" +- answer: "answer2" + context: "context2" + question: "question2" +- answer: "answer3" + context: "context3" + question: "question3" +- answer: "answer4" + context: "context4" + question: "question4" +- answer: "answer5" + context: "context5" + question: "question5" +task_description: 'This is a task' +""" + + +class MockTaxonomy: + INIT_COMMIT_FILE = "README.md" + + def __init__(self, path: Path) -> None: + self.root = path + self._repo = git.Repo.init(path, initial_branch="main") + with open(path / self.INIT_COMMIT_FILE, "wb"): + pass + self._repo.index.add([self.INIT_COMMIT_FILE]) + self._repo.index.commit("Initial commit") + + @property + def untracked_files(self) -> List[str]: + """List untracked files in the repository""" + return self._repo.untracked_files + + def create_untracked(self, rel_path: str, contents: Optional[bytes] = None) -> Path: + """Create a new untracked file in the repository. + + Args: + rel_path (str): Relative path (from repository root) to the file. + contents (bytes): (optional) Byte string to be written to the file. + Returns: + file_path: The path to the created file. + """ + taxonomy_path = Path(rel_path) + assert not taxonomy_path.is_absolute() + file_path = self.root.joinpath(taxonomy_path) + file_path.parent.mkdir(exist_ok=True, parents=True) + if not contents: + assert taxonomy_path.parts[0] == "compositional_skills" + file_path.write_text(TEST_VALID_COMPOSITIONAL_SKILL_YAML, encoding="utf-8") + else: + file_path.write_bytes(contents) + return file_path + + def add_tracked(self, rel_path: str) -> Path: + """Add a new tracked file to the repository (and commits it). + + Args: + rel_path (str): Relative path (from repository root) to the file. + Returns: + file_path: The path to the added file. + """ + file_path = self.create_untracked(rel_path) + self._repo.index.add([rel_path]) + self._repo.index.commit("new commit") + return file_path + + def remove_file(self, rel_path: str) -> None: + """Remove a file in the repository (tracked or not) + + Args: + rel_path (str): Relative path (from repository root) to the file. + """ + self.root.joinpath(rel_path).unlink(missing_ok=True) + + def teardown(self) -> None: + """Recursively remove the temporary repository and all of its + subdirectories and files. + """ + shutil.rmtree(self.root) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.teardown() diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 00000000..27cf4d68 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Third Party +import pytest + +# First Party +from instructlab.sdg.utils import GenerateException, models + + +class TestModels: + """Test model family in instructlab.sdg.utils.models.""" + + def test_granite_model_family(self): + assert ( + models.get_model_family("granite", "./models/granite-7b-lab-Q4_K_M.gguf") + == "merlinite" + ) + + def test_merlinite_model_family(self): + assert ( + models.get_model_family( + "merlinite", "./models/merlinite-7b-lab-Q4_K_M.gguf" + ) + == "merlinite" + ) + + def test_mixtral_model_family(self): + assert ( + models.get_model_family( + "mixtral", "./models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" + ) + == "mixtral" + ) + + def test_default_model_family(self): + assert ( + models.get_model_family( + "mixtral", "./models/foo-8x7b-instruct-v0.1.Q4_K_M.gguf" + ) + == "merlinite" + ) + + def test_unknown_model_family(self): + with pytest.raises(GenerateException) as exc: + models.get_model_family( + "foobar", "./models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" + ) + assert "Unknown model family: foobar" in str(exc.value) diff --git a/tests/test_sample_populator_block.py b/tests/test_sample_populator_block.py index c78d995d..e2764de4 100644 --- a/tests/test_sample_populator_block.py +++ b/tests/test_sample_populator_block.py @@ -9,7 +9,7 @@ from instructlab.sdg.utilblocks import SamplePopulatorBlock -class TestFilterByValueBlock(unittest.TestCase): +class TestSamplePopulatorBlock(unittest.TestCase): def setUp(self): self.ctx = MagicMock() self.ctx.dataset_num_procs = 1 diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py new file mode 100644 index 00000000..ffa6171b --- /dev/null +++ b/tests/test_taxonomy.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Third Party +import pytest + +# First Party +from instructlab.sdg.utils import taxonomy + +# Local +from .taxonomy import TEST_VALID_COMPOSITIONAL_SKILL_YAML + +TAXONOMY_BASE = "main" + +TEST_CUSTOM_YAML_RULES = b"""extends: relaxed + +rules: + line-length: + max: 180 +""" + + +class TestTaxonomy: + """Test taxonomy in instructlab.sdg.utils.taxonomy.""" + + @pytest.fixture(autouse=True) + def _init_taxonomy(self, taxonomy_dir): + self.taxonomy = taxonomy_dir + + def test_read_taxonomy_leaf_nodes(self): + tracked_file = "compositional_skills/tracked/qna.yaml" + self.taxonomy.add_tracked(tracked_file) + leaf_node = taxonomy.read_taxonomy_leaf_nodes( + self.taxonomy.root, TAXONOMY_BASE, TEST_CUSTOM_YAML_RULES + ) + assert TEST_VALID_COMPOSITIONAL_SKILL_YAML not in leaf_node From 84dc117695b3231eac56f8f3326bb126cf796278 Mon Sep 17 00:00:00 2001 From: Martin Hickey Date: Fri, 26 Jul 2024 11:21:40 +0100 Subject: [PATCH 2/2] Update the taxonomy tests for improved scope - Extended mock taxonomy to accept contents passed only - Fixed the testing of the leaf node created for leaf node name and check for seed example contained Signed-off-by: Martin Hickey --- tests/taxonomy.py | 46 ++++++------------------------------- tests/test_taxonomy.py | 52 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/tests/taxonomy.py b/tests/taxonomy.py index cb00af71..f2b6cb47 100644 --- a/tests/taxonomy.py +++ b/tests/taxonomy.py @@ -2,33 +2,12 @@ # Standard from pathlib import Path -from typing import List, Optional +from typing import List import shutil # Third Party import git -TEST_VALID_COMPOSITIONAL_SKILL_YAML = """created_by: rafael-vasquez -version: 1 -seed_examples: -- answer: "Sure thing!" - context: "This is a valid YAML." - question: "Can you help me debug this failing unit test?" -- answer: "answer2" - context: "context2" - question: "question2" -- answer: "answer3" - context: "context3" - question: "question3" -- answer: "answer4" - context: "context4" - question: "question4" -- answer: "answer5" - context: "context5" - question: "question5" -task_description: 'This is a task' -""" - class MockTaxonomy: INIT_COMMIT_FILE = "README.md" @@ -46,12 +25,12 @@ def untracked_files(self) -> List[str]: """List untracked files in the repository""" return self._repo.untracked_files - def create_untracked(self, rel_path: str, contents: Optional[bytes] = None) -> Path: + def create_untracked(self, rel_path: str, contents: str) -> Path: """Create a new untracked file in the repository. Args: rel_path (str): Relative path (from repository root) to the file. - contents (bytes): (optional) Byte string to be written to the file. + contents (str): String to be written to the file. Returns: file_path: The path to the created file. """ @@ -59,34 +38,23 @@ def create_untracked(self, rel_path: str, contents: Optional[bytes] = None) -> P assert not taxonomy_path.is_absolute() file_path = self.root.joinpath(taxonomy_path) file_path.parent.mkdir(exist_ok=True, parents=True) - if not contents: - assert taxonomy_path.parts[0] == "compositional_skills" - file_path.write_text(TEST_VALID_COMPOSITIONAL_SKILL_YAML, encoding="utf-8") - else: - file_path.write_bytes(contents) + file_path.write_text(contents, encoding="utf-8") return file_path - def add_tracked(self, rel_path: str) -> Path: + def add_tracked(self, rel_path, contents: str) -> Path: """Add a new tracked file to the repository (and commits it). Args: rel_path (str): Relative path (from repository root) to the file. + contents (str): String to be written to the file. Returns: file_path: The path to the added file. """ - file_path = self.create_untracked(rel_path) + file_path = self.create_untracked(rel_path, contents) self._repo.index.add([rel_path]) self._repo.index.commit("new commit") return file_path - def remove_file(self, rel_path: str) -> None: - """Remove a file in the repository (tracked or not) - - Args: - rel_path (str): Relative path (from repository root) to the file. - """ - self.root.joinpath(rel_path).unlink(missing_ok=True) - def teardown(self) -> None: """Recursively remove the temporary repository and all of its subdirectories and files. diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index ffa6171b..67b6422a 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -1,15 +1,39 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import os +import pathlib + # Third Party import pytest # First Party from instructlab.sdg.utils import taxonomy -# Local -from .taxonomy import TEST_VALID_COMPOSITIONAL_SKILL_YAML +TEST_VALID_COMPOSITIONAL_SKILL_YAML = """created_by: rafael-vasquez +version: 1 +seed_examples: +- answer: "Sure thing!" + context: "This is a valid YAML." + question: "Can you help me debug this failing unit test?" +- answer: "answer2" + context: "context2" + question: "question2" +- answer: "answer3" + context: "context3" + question: "question3" +- answer: "answer4" + context: "context4" + question: "question4" +- answer: "answer5" + context: "context5" + question: "question5" +task_description: 'This is a task' +""" + +TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" -TAXONOMY_BASE = "main" +TEST_TAXONOMY_BASE = "main" TEST_CUSTOM_YAML_RULES = b"""extends: relaxed @@ -28,8 +52,24 @@ def _init_taxonomy(self, taxonomy_dir): def test_read_taxonomy_leaf_nodes(self): tracked_file = "compositional_skills/tracked/qna.yaml" - self.taxonomy.add_tracked(tracked_file) + untracked_file = "compositional_skills/new/qna.yaml" + self.taxonomy.add_tracked(tracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML) + self.taxonomy.create_untracked( + untracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML + ) + leaf_node = taxonomy.read_taxonomy_leaf_nodes( - self.taxonomy.root, TAXONOMY_BASE, TEST_CUSTOM_YAML_RULES + self.taxonomy.root, TEST_TAXONOMY_BASE, TEST_CUSTOM_YAML_RULES + ) + leaf_node_key = str(pathlib.Path(untracked_file).parent).replace( + os.path.sep, "->" ) - assert TEST_VALID_COMPOSITIONAL_SKILL_YAML not in leaf_node + assert leaf_node_key in leaf_node + + leaf_node_entries = leaf_node.get(leaf_node_key) + seed_example_exists = False + if any( + entry["instruction"] == TEST_SEED_EXAMPLE for entry in leaf_node_entries + ): + seed_example_exists = True + assert seed_example_exists is True