From 6406d5ea592f09a1ae4fdefb288f89858ff800b0 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Tue, 2 Jul 2024 23:00:58 +0000
Subject: [PATCH 1/8] :ambulance: Making filterblock more robust by introducing
 valid values

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/default_flows.py | 29 +++++++++++++++++-----------
 src/instructlab/sdg/filterblock.py   | 24 ++++++++++++++++++-----
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 31edd3d6..f66c4d9d 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -194,6 +194,7 @@ def get_flow(self) -> list:
                     "block_name": "filter_faithfulness",
                     "filter_column": "judgment",
                     "filter_value": "YES",
+                    "valid_values": ["YES", "NO"],
                     "operation": operator.eq,
                     "batch_kwargs": {
                         "num_procs": 8,
@@ -226,9 +227,10 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_relevancy",
                     "filter_column": "score",
-                    "filter_value": 2.0,
+                    "filter_value": 2,
+                    "valid_values": [0, 1, 2],
                     "operation": operator.eq,
-                    "convert_dtype": float,
+                    "convert_dtype": int,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -260,7 +262,8 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_verify_question",
                     "filter_column": "rating",
-                    "filter_value": 1.0,
+                    "filter_value": 1,
+                    "valid_values": [0, 1],
                     "operation": operator.eq,
                     "convert_dtype": float,
                     "batch_kwargs": {
@@ -312,9 +315,10 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_questions",
                     "filter_column": "score",
-                    "filter_value": 1.0,
+                    "filter_value": 1,
+                    "valid_values": [0, 1],
                     "operation": operator.eq,
-                    "convert_dtype": float,
+                    "convert_dtype": int,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -356,9 +360,10 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_qa_pair",
                     "filter_column": "score",
-                    "filter_value": 2.0,
+                    "filter_value": 2,
+                    "valid_values": [1, 2, 3],
                     "operation": operator.ge,
-                    "convert_dtype": float,
+                    "convert_dtype": int,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -432,9 +437,10 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_grounded_questions",
                     "filter_column": "score",
-                    "filter_value": 1.0,
+                    "filter_value": 1,
+                    "valid_values": [0, 1],
                     "operation": operator.eq,
-                    "convert_dtype": float,
+                    "convert_dtype": int,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
@@ -476,9 +482,10 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "filter_grounded_qa_pair",
                     "filter_column": "score",
-                    "filter_value": 2.0,
+                    "filter_value": 2,
+                    "valid_values": [1, 2, 3],
                     "operation": operator.ge,
-                    "convert_dtype": float,
+                    "convert_dtype": int,
                     "batch_kwargs": {
                         "num_procs": 8,
                     },
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index e6d4eb24..71327e8a 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -11,25 +11,39 @@
 
 class FilterByValueBlock(Block):
     def __init__(
-        self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
+        self, filter_column, filter_value, operation, valid_values, convert_dtype=None, **batch_kwargs
     ) -> None:
         super().__init__(block_name=self.__class__.__name__)
         self.value = filter_value
         self.column_name = filter_column
         self.operation = operation
+        self.valid_values = valid_values
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
+    
+    def _fiter_invalid_values(self, samples):
+        samples = samples.filter(
+            lambda x: x[self.column_name] in self.valid_values,
+            num_proc=self.num_procs,
+        )
+        return samples
+    
+    def _convert_dtype(self, sample):
+        try:
+            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
+        except ValueError:
+            sample[self.column_name] = None
+        return sample
 
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
             samples = samples.map(
-                lambda x: {
-                    **x,
-                    self.column_name: self.convert_dtype(x[self.column_name]),
-                },
+                self._convert_dtype,
                 num_proc=self.num_procs,
             )
 
+        samples = self._fiter_invalid_values(samples)
+
         return samples.filter(
             lambda x: self.operation(x[self.column_name], self.value),
             num_proc=self.num_procs,

From 507e3256feb79ad0ef79b38b4bbd545f05202ec1 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 03:43:47 +0000
Subject: [PATCH 2/8] :loud_sound: slightly better logging

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/pipeline.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index fc93f78d..d32a0903 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -46,7 +46,7 @@ def generate(self, dataset) -> Dataset:
                 block = block_type(**block_config)
 
             logger.info("Running block: %s", block_config["block_name"])
-            logger.info(dataset)
+            logger.info("Input: %s", dataset)
 
             dataset = block.generate(dataset, **gen_kwargs)
 
@@ -56,5 +56,7 @@ def generate(self, dataset) -> Dataset:
 
             if drop_duplicates_cols:
                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+            
+            logger.info("Output: %s\n\n", dataset)
 
         return dataset

From bada90e5e98f72b7d7fe4a8ea7086aab12683e63 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 03:46:55 +0000
Subject: [PATCH 3/8] :wrench: updating grounded synth skills default flow

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/default_flows.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index f66c4d9d..ca0c5d9e 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -411,6 +411,7 @@ def get_flow(self) -> list:
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_procs": 8,
+                        "num_samples": 3,
                         "batched": self.batched,
                     },
                 },

From d5fc59da742b3c671ae34952e94bb96286a0c8e7 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 03:49:52 +0000
Subject: [PATCH 4/8] :white_check_mark: updating test script imports

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 scripts/test_freeform_skills.py | 7 ++++---
 scripts/test_grounded_skills.py | 7 ++++---
 scripts/test_knowledge.py       | 9 +++++----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index a8612c09..13ae235a 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -3,9 +3,10 @@
 from openai import OpenAI
 
 # First Party
-from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import SynthSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from instructlab.sdg import SDG
+from instructlab.sdg.default_flows import SynthSkillsFlow
+from instructlab.sdg.pipeline import Pipeline
+
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 338edb6c..f0e3f1a9 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -3,9 +3,10 @@
 from openai import OpenAI
 
 # First Party
-from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from instructlab.sdg import SDG
+from instructlab.sdg.default_flows import SynthGroundedSkillsFlow
+from instructlab.sdg.pipeline import Pipeline
+
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index aeedcf59..05d01c70 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -6,11 +6,12 @@
 from openai import OpenAI
 
 # First Party
-from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from instructlab.sdg import SDG
+from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
+from instructlab.sdg.pipeline import Pipeline
 
-# Please don't add you vLLM endpoint key here
+
+# for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
 openai_api_base = "Add model endpoint here"
 

From 1af5b74005347c84195a430ddc80954a619c669a Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 04:16:16 +0000
Subject: [PATCH 5/8] :rotating_light: fixing linter issues

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/filterblock.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 71327e8a..15b7e106 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -20,14 +20,14 @@ def __init__(
         self.valid_values = valid_values
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
-    
+
     def _fiter_invalid_values(self, samples):
         samples = samples.filter(
             lambda x: x[self.column_name] in self.valid_values,
             num_proc=self.num_procs,
         )
         return samples
-    
+
     def _convert_dtype(self, sample):
         try:
             sample[self.column_name] = self.convert_dtype(sample[self.column_name])

From face918fbdc4041a0a191b104d3335376dc5f044 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 17:24:55 +0000
Subject: [PATCH 6/8] :loud_sound: chore: Improve robustness of
 FilterByValueBlock by handling ValueError + minor linting

The FilterByValueBlock class now handles ValueError exceptions when converting data types. If a ValueError occurs, the block logs an error message and fills the column with None to be filtered later.

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/filterblock.py | 3 ++-
 src/instructlab/sdg/pipeline.py    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 15b7e106..38c5d1a9 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -31,7 +31,8 @@ def _fiter_invalid_values(self, samples):
     def _convert_dtype(self, sample):
         try:
             sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-        except ValueError:
+        except ValueError as e:
+            logger.error("Error converting dtype: %s, filling with None to be filtered later", e)
             sample[self.column_name] = None
         return sample
 
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index d32a0903..705ce64e 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -56,7 +56,7 @@ def generate(self, dataset) -> Dataset:
 
             if drop_duplicates_cols:
                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
-            
+
             logger.info("Output: %s\n\n", dataset)
 
         return dataset

From 7d33095d09b5f322139d15409788d0190b45b346 Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 17:48:15 +0000
Subject: [PATCH 7/8] :rotating_light: fixing linter issues + removing
 unnecessary import in test scripts

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 scripts/test_freeform_skills.py | 1 +
 scripts/test_grounded_skills.py | 1 +
 scripts/test_knowledge.py       | 4 +---
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 13ae235a..b26a8f7e 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -3,6 +3,7 @@
 from openai import OpenAI
 
 # First Party
+# pylint: disable=import-error, no-name-in-module
 from instructlab.sdg import SDG
 from instructlab.sdg.default_flows import SynthSkillsFlow
 from instructlab.sdg.pipeline import Pipeline
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index f0e3f1a9..74a43c4b 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -3,6 +3,7 @@
 from openai import OpenAI
 
 # First Party
+# pylint: disable=import-error, no-name-in-module
 from instructlab.sdg import SDG
 from instructlab.sdg.default_flows import SynthGroundedSkillsFlow
 from instructlab.sdg.pipeline import Pipeline
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 05d01c70..6b4ee413 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -1,11 +1,9 @@
-# Standard
-import operator
-
 # Third Party
 from datasets import Dataset
 from openai import OpenAI
 
 # First Party
+# pylint: disable=import-error, no-name-in-module
 from instructlab.sdg import SDG
 from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
 from instructlab.sdg.pipeline import Pipeline

From ed7d74bf318cabe8b9994f9aec76201f1639736e Mon Sep 17 00:00:00 2001
From: shiv <shivchander.s30@gmail.com>
Date: Wed, 3 Jul 2024 18:26:19 +0000
Subject: [PATCH 8/8] :rotating_light: fix linter warnings

Signed-off-by: shiv <shivchander.s30@gmail.com>
---
 scripts/test_freeform_skills.py    |  1 -
 scripts/test_grounded_skills.py    |  1 -
 scripts/test_knowledge.py          |  1 -
 src/instructlab/sdg/filterblock.py | 12 ++++++++++--
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index b26a8f7e..a54d9453 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -8,7 +8,6 @@
 from instructlab.sdg.default_flows import SynthSkillsFlow
 from instructlab.sdg.pipeline import Pipeline
 
-
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
 openai_api_base = "Add model endpoint here"
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 74a43c4b..33c6660f 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -8,7 +8,6 @@
 from instructlab.sdg.default_flows import SynthGroundedSkillsFlow
 from instructlab.sdg.pipeline import Pipeline
 
-
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
 openai_api_base = "Add model endpoint here"
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 6b4ee413..d39c45cf 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -8,7 +8,6 @@
 from instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
 from instructlab.sdg.pipeline import Pipeline
 
-
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
 openai_api_base = "Add model endpoint here"
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 38c5d1a9..738a4d00 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -11,7 +11,13 @@
 
 class FilterByValueBlock(Block):
     def __init__(
-        self, filter_column, filter_value, operation, valid_values, convert_dtype=None, **batch_kwargs
+        self,
+        filter_column,
+        filter_value,
+        operation,
+        valid_values,
+        convert_dtype=None,
+        **batch_kwargs,
     ) -> None:
         super().__init__(block_name=self.__class__.__name__)
         self.value = filter_value
@@ -32,7 +38,9 @@ def _convert_dtype(self, sample):
         try:
             sample[self.column_name] = self.convert_dtype(sample[self.column_name])
         except ValueError as e:
-            logger.error("Error converting dtype: %s, filling with None to be filtered later", e)
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
             sample[self.column_name] = None
         return sample