diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index a8612c09..a54d9453 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -3,9 +3,10 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +# pylint: disable=import-error, no-name-in-module +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import SynthSkillsFlow +from instructlab.sdg.pipeline import Pipeline # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 338edb6c..33c6660f 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -3,9 +3,10 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +# pylint: disable=import-error, no-name-in-module +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import SynthGroundedSkillsFlow +from instructlab.sdg.pipeline import Pipeline # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aeedcf59..d39c45cf 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -1,16 +1,14 @@ -# Standard -import operator - # Third Party from datasets import Dataset from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow -from src.instructlab.sdg.pipeline import Pipeline +# pylint: disable=import-error, no-name-in-module +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow +from instructlab.sdg.pipeline import Pipeline -# Please don't add you vLLM endpoint key here +# for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" openai_api_base = "Add model endpoint here" diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 31edd3d6..ca0c5d9e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -194,6 +194,7 @@ def get_flow(self) -> list: "block_name": "filter_faithfulness", "filter_column": "judgment", "filter_value": "YES", + "valid_values": ["YES", "NO"], "operation": operator.eq, "batch_kwargs": { "num_procs": 8, @@ -226,9 +227,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_relevancy", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [0, 1, 2], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -260,7 +262,8 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_verify_question", "filter_column": "rating", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, "convert_dtype": float, "batch_kwargs": { @@ -312,9 +315,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_questions", "filter_column": "score", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -356,9 +360,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_qa_pair", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [1, 2, 3], "operation": operator.ge, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -406,6 +411,7 @@ def get_flow(self) -> list: "output_cols": ["question"], "batch_kwargs": { "num_procs": 8, + "num_samples": 3, "batched": self.batched, }, }, @@ -432,9 +438,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_questions", "filter_column": "score", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -476,9 +483,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_qa_pair", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [1, 2, 3], "operation": operator.ge, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index e6d4eb24..738a4d00 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -11,25 +11,48 @@ class FilterByValueBlock(Block): def __init__( - self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs + self, + filter_column, + filter_value, + operation, + valid_values, + convert_dtype=None, + **batch_kwargs, ) -> None: super().__init__(block_name=self.__class__.__name__) self.value = filter_value self.column_name = filter_column self.operation = operation + self.valid_values = valid_values self.convert_dtype = convert_dtype self.num_procs = batch_kwargs.get("num_procs", 1) + def _fiter_invalid_values(self, samples): + samples = samples.filter( + lambda x: x[self.column_name] in self.valid_values, + num_proc=self.num_procs, + ) + return samples + + def _convert_dtype(self, sample): + try: + sample[self.column_name] = self.convert_dtype(sample[self.column_name]) + except ValueError as e: + logger.error( + "Error converting dtype: %s, filling with None to be filtered later", e + ) + sample[self.column_name] = None + return sample + def generate(self, samples) -> Dataset: if self.convert_dtype: samples = samples.map( - lambda x: { - **x, - self.column_name: self.convert_dtype(x[self.column_name]), - }, + self._convert_dtype, num_proc=self.num_procs, ) + samples = self._fiter_invalid_values(samples) + return samples.filter( lambda x: self.operation(x[self.column_name], self.value), num_proc=self.num_procs, diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index fc93f78d..705ce64e 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -46,7 +46,7 @@ def generate(self, dataset) -> Dataset: block = block_type(**block_config) logger.info("Running block: %s", block_config["block_name"]) - logger.info(dataset) + logger.info("Input: %s", dataset) dataset = block.generate(dataset, **gen_kwargs) @@ -57,4 +57,6 @@ def generate(self, dataset) -> Dataset: if drop_duplicates_cols: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) + logger.info("Output: %s\n\n", dataset) + return dataset