From 6406d5ea592f09a1ae4fdefb288f89858ff800b0 Mon Sep 17 00:00:00 2001 From: shiv Date: Tue, 2 Jul 2024 23:00:58 +0000 Subject: [PATCH 1/8] :ambulance: Making filterblock more robust by introducing valid values Signed-off-by: shiv --- src/instructlab/sdg/default_flows.py | 29 +++++++++++++++++----------- src/instructlab/sdg/filterblock.py | 24 ++++++++++++++++++----- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 31edd3d6..f66c4d9d 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -194,6 +194,7 @@ def get_flow(self) -> list: "block_name": "filter_faithfulness", "filter_column": "judgment", "filter_value": "YES", + "valid_values": ["YES", "NO"], "operation": operator.eq, "batch_kwargs": { "num_procs": 8, @@ -226,9 +227,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_relevancy", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [0, 1, 2], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -260,7 +262,8 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_verify_question", "filter_column": "rating", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, "convert_dtype": float, "batch_kwargs": { @@ -312,9 +315,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_questions", "filter_column": "score", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -356,9 +360,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_qa_pair", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [1, 2, 3], "operation": operator.ge, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -432,9 +437,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_questions", "filter_column": "score", - "filter_value": 1.0, + "filter_value": 1, + "valid_values": [0, 1], "operation": operator.eq, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, @@ -476,9 +482,10 @@ def get_flow(self) -> list: "block_config": { "block_name": "filter_grounded_qa_pair", "filter_column": "score", - "filter_value": 2.0, + "filter_value": 2, + "valid_values": [1, 2, 3], "operation": operator.ge, - "convert_dtype": float, + "convert_dtype": int, "batch_kwargs": { "num_procs": 8, }, diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index e6d4eb24..71327e8a 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -11,25 +11,39 @@ class FilterByValueBlock(Block): def __init__( - self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs + self, filter_column, filter_value, operation, valid_values, convert_dtype=None, **batch_kwargs ) -> None: super().__init__(block_name=self.__class__.__name__) self.value = filter_value self.column_name = filter_column self.operation = operation + self.valid_values = valid_values self.convert_dtype = convert_dtype self.num_procs = batch_kwargs.get("num_procs", 1) + + def _fiter_invalid_values(self, samples): + samples = samples.filter( + lambda x: x[self.column_name] in self.valid_values, + num_proc=self.num_procs, + ) + return samples + + def _convert_dtype(self, sample): + try: + sample[self.column_name] = self.convert_dtype(sample[self.column_name]) + except ValueError: + sample[self.column_name] = None + return sample def generate(self, samples) -> Dataset: if self.convert_dtype: samples = samples.map( - lambda x: { - **x, - self.column_name: self.convert_dtype(x[self.column_name]), - }, + self._convert_dtype, num_proc=self.num_procs, ) + samples = self._fiter_invalid_values(samples) + return samples.filter( lambda x: self.operation(x[self.column_name], self.value), num_proc=self.num_procs, From 507e3256feb79ad0ef79b38b4bbd545f05202ec1 Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 03:43:47 +0000 Subject: [PATCH 2/8] :loud_sound: slightly better logging Signed-off-by: shiv --- src/instructlab/sdg/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index fc93f78d..d32a0903 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -46,7 +46,7 @@ def generate(self, dataset) -> Dataset: block = block_type(**block_config) logger.info("Running block: %s", block_config["block_name"]) - logger.info(dataset) + logger.info("Input: %s", dataset) dataset = block.generate(dataset, **gen_kwargs) @@ -56,5 +56,7 @@ def generate(self, dataset) -> Dataset: if drop_duplicates_cols: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) + + logger.info("Output: %s\n\n", dataset) return dataset From bada90e5e98f72b7d7fe4a8ea7086aab12683e63 Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 03:46:55 +0000 Subject: [PATCH 3/8] :wrench: updating grounded synth skills default flow Signed-off-by: shiv --- src/instructlab/sdg/default_flows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index f66c4d9d..ca0c5d9e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -411,6 +411,7 @@ def get_flow(self) -> list: "output_cols": ["question"], "batch_kwargs": { "num_procs": 8, + "num_samples": 3, "batched": self.batched, }, }, From d5fc59da742b3c671ae34952e94bb96286a0c8e7 Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 03:49:52 +0000 Subject: [PATCH 4/8] :white_check_mark: updating test script imports Signed-off-by: shiv --- scripts/test_freeform_skills.py | 7 ++++--- scripts/test_grounded_skills.py | 7 ++++--- scripts/test_knowledge.py | 9 +++++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index a8612c09..13ae235a 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -3,9 +3,10 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import SynthSkillsFlow +from instructlab.sdg.pipeline import Pipeline + # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 338edb6c..f0e3f1a9 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -3,9 +3,10 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import SynthGroundedSkillsFlow +from instructlab.sdg.pipeline import Pipeline + # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aeedcf59..05d01c70 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -6,11 +6,12 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow -from src.instructlab.sdg.pipeline import Pipeline +from instructlab.sdg import SDG +from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow +from instructlab.sdg.pipeline import Pipeline -# Please don't add you vLLM endpoint key here + +# for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" openai_api_base = "Add model endpoint here" From 1af5b74005347c84195a430ddc80954a619c669a Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 04:16:16 +0000 Subject: [PATCH 5/8] :rotating_light: fixing linter issues Signed-off-by: shiv --- src/instructlab/sdg/filterblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 71327e8a..15b7e106 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -20,14 +20,14 @@ def __init__( self.valid_values = valid_values self.convert_dtype = convert_dtype self.num_procs = batch_kwargs.get("num_procs", 1) - + def _fiter_invalid_values(self, samples): samples = samples.filter( lambda x: x[self.column_name] in self.valid_values, num_proc=self.num_procs, ) return samples - + def _convert_dtype(self, sample): try: sample[self.column_name] = self.convert_dtype(sample[self.column_name]) From face918fbdc4041a0a191b104d3335376dc5f044 Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 17:24:55 +0000 Subject: [PATCH 6/8] :loud_sound: chore: Improve robustness of FilterByValueBlock by handling ValueError + minor linting The FilterByValueBlock class now handles ValueError exceptions when converting data types. If a ValueError occurs, the block logs an error message and fills the column with None to be filtered later. Signed-off-by: shiv --- src/instructlab/sdg/filterblock.py | 3 ++- src/instructlab/sdg/pipeline.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 15b7e106..38c5d1a9 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -31,7 +31,8 @@ def _fiter_invalid_values(self, samples): def _convert_dtype(self, sample): try: sample[self.column_name] = self.convert_dtype(sample[self.column_name]) - except ValueError: + except ValueError as e: + logger.error("Error converting dtype: %s, filling with None to be filtered later", e) sample[self.column_name] = None return sample diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index d32a0903..705ce64e 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -56,7 +56,7 @@ def generate(self, dataset) -> Dataset: if drop_duplicates_cols: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) - + logger.info("Output: %s\n\n", dataset) return dataset From 7d33095d09b5f322139d15409788d0190b45b346 Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 17:48:15 +0000 Subject: [PATCH 7/8] :rotating_light: fixing linter issues + removing unnecessary import in test scripts Signed-off-by: shiv --- scripts/test_freeform_skills.py | 1 + scripts/test_grounded_skills.py | 1 + scripts/test_knowledge.py | 4 +--- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 13ae235a..b26a8f7e 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -3,6 +3,7 @@ from openai import OpenAI # First Party +# pylint: disable=import-error, no-name-in-module from instructlab.sdg import SDG from instructlab.sdg.default_flows import SynthSkillsFlow from instructlab.sdg.pipeline import Pipeline diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index f0e3f1a9..74a43c4b 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -3,6 +3,7 @@ from openai import OpenAI # First Party +# pylint: disable=import-error, no-name-in-module from instructlab.sdg import SDG from instructlab.sdg.default_flows import SynthGroundedSkillsFlow from instructlab.sdg.pipeline import Pipeline diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 05d01c70..6b4ee413 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -1,11 +1,9 @@ -# Standard -import operator - # Third Party from datasets import Dataset from openai import OpenAI # First Party +# pylint: disable=import-error, no-name-in-module from instructlab.sdg import SDG from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow from instructlab.sdg.pipeline import Pipeline From ed7d74bf318cabe8b9994f9aec76201f1639736e Mon Sep 17 00:00:00 2001 From: shiv Date: Wed, 3 Jul 2024 18:26:19 +0000 Subject: [PATCH 8/8] :rotating_light: fix linter warnings Signed-off-by: shiv --- scripts/test_freeform_skills.py | 1 - scripts/test_grounded_skills.py | 1 - scripts/test_knowledge.py | 1 - src/instructlab/sdg/filterblock.py | 12 ++++++++++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index b26a8f7e..a54d9453 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -8,7 +8,6 @@ from instructlab.sdg.default_flows import SynthSkillsFlow from instructlab.sdg.pipeline import Pipeline - # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" openai_api_base = "Add model endpoint here" diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 74a43c4b..33c6660f 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -8,7 +8,6 @@ from instructlab.sdg.default_flows import SynthGroundedSkillsFlow from instructlab.sdg.pipeline import Pipeline - # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" openai_api_base = "Add model endpoint here" diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 6b4ee413..d39c45cf 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -8,7 +8,6 @@ from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow from instructlab.sdg.pipeline import Pipeline - # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" openai_api_base = "Add model endpoint here" diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 38c5d1a9..738a4d00 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -11,7 +11,13 @@ class FilterByValueBlock(Block): def __init__( - self, filter_column, filter_value, operation, valid_values, convert_dtype=None, **batch_kwargs + self, + filter_column, + filter_value, + operation, + valid_values, + convert_dtype=None, + **batch_kwargs, ) -> None: super().__init__(block_name=self.__class__.__name__) self.value = filter_value @@ -32,7 +38,9 @@ def _convert_dtype(self, sample): try: sample[self.column_name] = self.convert_dtype(sample[self.column_name]) except ValueError as e: - logger.error("Error converting dtype: %s, filling with None to be filtered later", e) + logger.error( + "Error converting dtype: %s, filling with None to be filtered later", e + ) sample[self.column_name] = None return sample