-
Notifications
You must be signed in to change notification settings - Fork 55
Add a YAML based file format for pipelines #86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit
Hold shift + click to select a range
0435abb
Replace LLMBlock model_prompt param with model_family
markmc 49c87d5
Add a PipelineContext class
markmc 7cfbaa9
Fix multiprocessing issues in FilterByValueBlock
markmc 9d92548
Fix multiprocessing issues in utilblocks
markmc 23dd08e
Allow block_config.config_path to be relative
markmc 9fc272c
Fix block_name handling
markmc 8cb673b
Move FilterByValue multiprocessing config to PipelineContext
markmc b956643
Add `add_num_samples` to LLMBlock config
markmc 18f1513
Fix LLMBlock batch_kwargs constructor param
markmc 82aadd9
Remove batch_kwargs
markmc 07c1c6d
Add a YAML based file format for pipelines
markmc 003c8e8
Merge mmlu_bench block into synth_knowledge pipeline
markmc ab46552
Rename Pipeline.from_flows() to Pipeline.from_file()
markmc beabbf3
Move pipeline configs into a new directory structure
markmc ec94159
YAML format improvement - move block_name up
markmc 2d92cf6
YAML format improvement - remove block_ prefix
markmc a0c9b80
Make "full" and "simple" aliases to a directory of pipeline configs
markmc eb2719f
YAML format improvement - change block_configs to blocks
markmc 46f16c6
Add ImportBlock to allow extending existing pipelines
markmc 82adb4a
generate_data: Allow pipeline arg to be a path to a directory
russellb 5a0b7a6
llm: Set `n` by default in gen_kwargs
russellb 7c5c1c3
pipelines: Add missing drop_duplicates for context in grounded skills
russellb 04f7baa
filterblock: Document block behavior in more detail
russellb b8768ac
Undo changes to how `n` parameter is handled
russellb 88f5003
Re-instate batch_kwargs.num_samples
markmc 804ee3a
Interpret llmblock.config_path relative to the pipeline config path
markmc d1c5d5b
Ensure num_proc is passed as a keyword arg to Dataset.map()
markmc 2c52770
fix: use string instead of boolean in YAML for "YES"
xukai92 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # Third Party | ||
| from datasets import Dataset | ||
|
|
||
| # Local | ||
| from . import pipeline | ||
| from .block import Block | ||
| from .logger_config import setup_logger | ||
|
|
||
| logger = setup_logger(__name__) | ||
|
|
||
|
|
||
class ImportBlock(Block):
    """Block that delegates generation to a pipeline loaded from another config file."""

    def __init__(
        self,
        ctx,
        block_name,
        path,
    ) -> None:
        """
        ImportBlock imports a chain of blocks from another pipeline config file.

        Parameters:
        - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
        - block_name (str): An identifier for this block.
        - path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file.
        """
        super().__init__(ctx, block_name)
        self.path = path
        # The imported pipeline is built eagerly, at construction time.
        # NOTE(review): self.ctx is presumably set by Block.__init__ - confirm.
        self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path)

    def generate(self, samples) -> Dataset:
        """
        Run the imported pipeline on `samples` and return whatever it produces.

        Parameters:
        - samples: The input handed unchanged to the imported pipeline's generate().
        """
        # f-string so the actual path is logged rather than the literal "{self.path}".
        logger.info(f"ImportBlock chaining to blocks from {self.path}")
        return self.pipeline.generate(samples)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| # Standard | ||
| from unittest.mock import MagicMock, patch | ||
| import os | ||
| import tempfile | ||
| import unittest | ||
|
|
||
| # Third Party | ||
| from datasets import Dataset, Features, Value | ||
|
|
||
| # First Party | ||
| from instructlab.sdg.importblock import ImportBlock | ||
| from instructlab.sdg.pipeline import Pipeline | ||
|
|
||
|
|
||
class TestImportBlockWithMockPipeline(unittest.TestCase):
    """Unit tests for ImportBlock with the Pipeline class replaced by a mock."""

    @patch("instructlab.sdg.pipeline.Pipeline")
    def setUp(self, mock_pipeline):
        # The patch is only active while setUp runs, so ImportBlock is built
        # here: its constructor then calls the mocked Pipeline.from_file and
        # keeps the mock's return value as its pipeline.
        self.ctx = MagicMock()
        self.block_name = "test_block"
        self.path = "/path/to/config"
        self.mock_pipeline = mock_pipeline
        self.import_block = ImportBlock(self.ctx, self.block_name, self.path)
        self.dataset = Dataset.from_dict({})

    def test_initialization(self):
        """The constructor stores its arguments and loads the pipeline once."""
        self.assertEqual(self.import_block.block_name, self.block_name)
        self.assertEqual(self.import_block.path, self.path)
        self.mock_pipeline.from_file.assert_called_once_with(self.ctx, self.path)

    def test_generate(self):
        """generate() forwards its input to the imported pipeline and returns its output."""
        inner_generate = self.mock_pipeline.from_file.return_value.generate
        inner_generate.return_value = self.dataset
        samples = self.import_block.generate(self.dataset)
        # Check the argument that was passed in, not the value that came back.
        inner_generate.assert_called_once_with(self.dataset)
        self.assertEqual(samples, self.dataset)
|
|
||
|
|
||
| _CHILD_YAML = """\ | ||
| version: "1.0" | ||
| blocks: | ||
| - name: greater_than_thirty | ||
| type: FilterByValueBlock | ||
| config: | ||
| filter_column: age | ||
| filter_value: 30 | ||
| operation: gt | ||
| convert_dtype: int | ||
| """ | ||
|
|
||
|
|
||
| _PARENT_YAML_FMT = """\ | ||
| version: "1.0" | ||
| blocks: | ||
| - name: forty_or_under | ||
| type: FilterByValueBlock | ||
| config: | ||
| filter_column: age | ||
| filter_value: 40 | ||
| operation: le | ||
| convert_dtype: int | ||
| - name: import_child | ||
| type: ImportBlock | ||
| config: | ||
| path: %s | ||
| - name: big_bdays | ||
| type: FilterByValueBlock | ||
| config: | ||
| filter_column: age | ||
| filter_value: | ||
| - 30 | ||
| - 40 | ||
| operation: eq | ||
| convert_dtype: int | ||
| """ | ||
|
|
||
|
|
||
class TestImportBlockWithFilterByValue(unittest.TestCase):
    """End-to-end test: a parent pipeline imports a child pipeline via ImportBlock."""

    def setUp(self):
        self.ctx = MagicMock()
        self.ctx.num_procs = 1
        # The child config is written first because the parent config embeds its path.
        self.child_yaml = self._write_tmp_yaml(_CHILD_YAML)
        self.parent_yaml = self._write_tmp_yaml(_PARENT_YAML_FMT % self.child_yaml)
        self.dataset = Dataset.from_dict(
            {"age": ["25", "30", "35", "40", "45"]},
            features=Features({"age": Value("string")}),
        )

    def tearDown(self):
        # The temp files are created with delete=False, so remove them explicitly.
        os.remove(self.parent_yaml)
        os.remove(self.child_yaml)

    def _write_tmp_yaml(self, content):
        """Write `content` to a new temporary .yaml file and return its path."""
        # The context manager closes the handle even if write() raises;
        # delete=False keeps the file on disk after it is closed.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".yaml"
        ) as tmp_file:
            tmp_file.write(content)
        return tmp_file.name

    def test_generate(self):
        """Only the row with age 40 is expected to survive the three filters."""
        pipeline = Pipeline.from_file(self.ctx, self.parent_yaml)
        filtered_dataset = pipeline.generate(self.dataset)
        self.assertEqual(len(filtered_dataset), 1)
        self.assertEqual(filtered_dataset["age"], [40])
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.