diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index 686ab783..c072908a 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -2,15 +2,13 @@ # Standard from abc import ABC from typing import Any, Dict, Union +import logging import os.path # Third Party import yaml -# Local -from .logger_config import setup_logger - -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 2b6e6d30..9d5c10f0 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -1,6 +1,7 @@ # Standard from typing import Optional import json +import logging import os.path import random import uuid @@ -10,11 +11,10 @@ import yaml # First Party -from instructlab.sdg.logger_config import setup_logger from instructlab.sdg.utils import GenerateException, pandas ALLOWED_COLS = ["id", "messages", "metadata"] -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) def _adjust_train_sample_size(ds: Dataset, num_samples: int): diff --git a/src/instructlab/sdg/eval_data.py b/src/instructlab/sdg/eval_data.py index a95bd597..509f3a32 100644 --- a/src/instructlab/sdg/eval_data.py +++ b/src/instructlab/sdg/eval_data.py @@ -1,6 +1,7 @@ # Standard from importlib import resources from typing import Any +import logging import re # Third Party @@ -10,10 +11,7 @@ # First Party from instructlab.sdg.pipeline import EVAL_PIPELINES_PKG, Pipeline -# Local -from .logger_config import setup_logger - -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) def _extract_options(text: str) -> list[Any]: diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 1ae3e5b9..a82c8c7d 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +import logging import operator # Third Party @@ -7,9 +8,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 730c8b39..4fa606b6 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional import json +import logging import os import time @@ -35,6 +36,8 @@ read_taxonomy_leaf_nodes, ) +logger = logging.getLogger(__name__) + _SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." @@ -73,9 +76,7 @@ def _convert_to_messages(sample): return sample -def _gen_train_data( - logger, machine_instruction_data, output_file_train, output_file_messages -): +def _gen_train_data(machine_instruction_data, output_file_train, output_file_messages): """ Generate training data in the legacy system/user/assistant format used in train_*.jsonl as well as the legacy messages format used @@ -257,9 +258,9 @@ def _mixer_init(ctx, output_dir, date_suffix): # This is part of the public API, and used by instructlab. # TODO - parameter removal needs to be done in sync with a CLI change. -# pylint: disable=unused-argument +# to be removed: logger, prompt_file_path, rouge_threshold, tls_* def generate_data( - logger, + logger: logging.Logger = logger, # pylint: disable=redefined-outer-name api_base: Optional[str] = None, api_key: Optional[str] = None, model_family: Optional[str] = None, @@ -270,9 +271,9 @@ def generate_data( taxonomy_base: Optional[str] = None, output_dir: Optional[str] = None, # TODO - not used and should be removed from the CLI - prompt_file_path: Optional[str] = None, + prompt_file_path: Optional[str] = None, # pylint: disable=unused-argument # TODO - probably should be removed - rouge_threshold: Optional[float] = None, + rouge_threshold: Optional[float] = None, # pylint: disable=unused-argument console_output=True, yaml_rules: Optional[str] = None, chunk_word_count=None, @@ -382,9 +383,9 @@ def generate_data( else: sdg = sdg_freeform_skill - logger.debug("Samples: %s" % samples) + logger.debug("Samples: %s", samples) ds = Dataset.from_list(samples) - logger.debug("Dataset: %s" % ds) + logger.debug("Dataset: %s", ds) new_generated_data = sdg.generate(ds) if len(new_generated_data) == 0: raise EmptyDatasetError( @@ -395,8 +396,8 @@ def generate_data( if generated_data is None else generated_data + [new_generated_data] ) - logger.info("Generated %d samples" % len(generated_data)) - logger.debug("Generated data: %s" % generated_data) + logger.info("Generated %d samples", len(generated_data)) + logger.debug("Generated data: %s", generated_data) if is_knowledge: # generate mmlubench data for the current leaf node @@ -414,7 +415,6 @@ def generate_data( generated_data = [] _gen_train_data( - logger, generated_data, os.path.join(output_dir, output_file_train), os.path.join(output_dir, output_file_messages), diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py index c2e98b31..e6aeab7c 100644 --- a/src/instructlab/sdg/importblock.py +++ b/src/instructlab/sdg/importblock.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import logging + # Third Party from datasets import Dataset # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 3133f932..69c7708a 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -2,6 +2,7 @@ # Standard from collections import ChainMap from typing import Any, Dict +import logging import re # Third Party @@ -10,9 +11,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) MODEL_FAMILY_MIXTRAL = "mixtral" MODEL_FAMILY_MERLINITE = "merlinite" diff --git a/src/instructlab/sdg/logger_config.py b/src/instructlab/sdg/logger_config.py deleted file mode 100644 index 37c958ab..00000000 --- a/src/instructlab/sdg/logger_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Standard -import logging - -# Third Party -from rich.logging import RichHandler - - -def setup_logger(name): - # Set up the logger - logging.basicConfig( - level=logging.INFO, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler()], - ) - logger = logging.getLogger(name) - return logger diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 1263c974..fa78a601 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from importlib import resources from typing import Iterable, Optional +import logging import math import os.path @@ -18,9 +19,8 @@ # Local from . import filterblock, importblock, llmblock, utilblocks from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 2eec3a38..c16acb7d 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import logging + # Third Party from datasets import Dataset @@ -7,9 +10,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API.