diff --git a/src/autogluon_assistant/assistant.py b/src/autogluon_assistant/assistant.py
index 10d475e..115ed38 100644
--- a/src/autogluon_assistant/assistant.py
+++ b/src/autogluon_assistant/assistant.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import signal
 from typing import Any, Dict, Union
 
@@ -102,15 +101,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
         task = self.inference_task(task)
         if self.feature_transformers_config:
             logger.info("Automatic feature generation starts...")
-            if "OPENAI_API_KEY" not in os.environ:
-                logger.info("No OpenAI API keys found, therefore, skip CAAFE")
-                fe_transformers = [
-                    instantiate(ft_config)
-                    for ft_config in self.feature_transformers_config
-                    if ft_config["_target_"] != "autogluon_assistant.transformer.CAAFETransformer"
-                ]
-            else:
-                fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
+            fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
             for fe_transformer in fe_transformers:
                 try:
                     with timeout(
@@ -122,7 +113,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
                     self.handle_exception(f"Task preprocessing: {fe_transformer.name}", e)
             logger.info("Automatic feature generation complete!")
         else:
-            logger.info("Automatic feature generation is disabled.")
+            logger.info("Automatic feature generation is disabled. ")
         return task
 
     def fit_predictor(self, task: TabularPredictionTask):
diff --git a/src/autogluon_assistant/constants.py b/src/autogluon_assistant/constants.py
index 5c3d921..bdf65eb 100644
--- a/src/autogluon_assistant/constants.py
+++ b/src/autogluon_assistant/constants.py
@@ -71,3 +71,12 @@
     MULTICLASS: ROC_AUC,
     REGRESSION: ROOT_MEAN_SQUARED_ERROR,
 }
+
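+# Models vetted for autogluon-assistant; LLMFactory.get_chat_model() warns on
+# (but still allows) any model not listed here.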
Our white list models include {WHITE_LIST_LLM}") + if config.provider == "openai": return LLMFactory._get_openai_chat_model(config) elif config.provider == "bedrock": diff --git a/src/autogluon_assistant/transformer/feature_transformers/scentenceFT.py b/src/autogluon_assistant/transformer/feature_transformers/scentenceFT.py index 6876b92..0225c29 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/scentenceFT.py +++ b/src/autogluon_assistant/transformer/feature_transformers/scentenceFT.py @@ -29,48 +29,46 @@ def get_device_info(): return DeviceInfo(cpu_count, gpu_devices) -def huggingface_run(model, data): - if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data): - data = np.where(pd.isna(data), "", data) - return model.encode(data).astype("float32") - else: - return np.zeros(len(data)) - - -def glove_run_one_proc(model, data): - embeddings = [] - if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data): - for text in data: - token_list = list(tokenize(text)) - embed = model.get_mean_vector(token_list) - embeddings.append(embed) - else: - return np.zeros(len(data)) - return np.stack(embeddings).astype("float32") - - class PretrainedEmbeddingTransformer(BaseFeatureTransformer): def __init__(self, model_name, **kwargs) -> None: - self.model_name = model_name if torch.cuda.is_available(): - try: - self.model = SentenceTransformer(self.model_name) - except: - logger.warning(f"No model {self.model_name} is found.") + self.model_name = model_name - else: + if not torch.cuda.is_available(): logger.warning("CUDA is not found. For an optimized user experience, we switched to the glove embeddings") self.model_name = "glove-wiki-gigaword" self.dim = 300 self.max_num_procs = 16 + self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count())) + + def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None: + pass + + def glove_run_one_proc(self, data): + embeddings = [] + if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data): try: self.model = api.load(f"{self.model_name}-{self.dim}") except: logger.warning(f"No model {self.model_name}-{self.dim} is found.") - self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count())) + for text in data: + token_list = list(tokenize(text)) + embed = self.model.get_mean_vector(token_list) + embeddings.append(embed) + else: + return np.zeros(len(data)) + return np.stack(embeddings).astype("float32") - def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None: - pass + def huggingface_run(self, data): + if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data): + try: + self.model = SentenceTransformer(self.model_name) + except: + logger.warning(f"No model {self.model_name} is found.") + data = np.where(pd.isna(data), "", data) + return self.model.encode(data).astype("float32") + else: + return np.zeros(len(data)) def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: assert ( @@ -79,15 +77,11 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> for series_name in train_X.columns.values.tolist(): if torch.cuda.is_available(): - transformed_train_column = huggingface_run(self.model, np.transpose(train_X[series_name].to_numpy()).T) - transformed_test_column = huggingface_run(self.model, np.transpose(test_X[series_name].to_numpy()).T) + transformed_train_column = 
+    def glove_run_one_proc(self, data):
+        embeddings = []
+        if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
             try:
                 self.model = api.load(f"{self.model_name}-{self.dim}")
             except:
                 logger.warning(f"No model {self.model_name}-{self.dim} is found.")
-        self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))
+            for text in data:
+                token_list = list(tokenize(text))
+                embed = self.model.get_mean_vector(token_list)
+                embeddings.append(embed)
+        else:
+            return np.zeros(len(data))
+        return np.stack(embeddings).astype("float32")
 
-    def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
-        pass
+    def huggingface_run(self, data):
+        if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
+            try:
+                self.model = SentenceTransformer(self.model_name)
+            except:
+                logger.warning(f"No model {self.model_name} is found.")
+            data = np.where(pd.isna(data), "", data)
+            return self.model.encode(data).astype("float32")
+        else:
+            return np.zeros(len(data))
 
     def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
         assert (
@@ -79,15 +78,11 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) ->
 
         for series_name in train_X.columns.values.tolist():
             if torch.cuda.is_available():
-                transformed_train_column = huggingface_run(self.model, np.transpose(train_X[series_name].to_numpy()).T)
-                transformed_test_column = huggingface_run(self.model, np.transpose(test_X[series_name].to_numpy()).T)
+                transformed_train_column = self.huggingface_run(np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = self.huggingface_run(np.transpose(test_X[series_name].to_numpy()).T)
             else:
-                transformed_train_column = glove_run_one_proc(
-                    self.model, np.transpose(train_X[series_name].to_numpy()).T
-                )
-                transformed_test_column = glove_run_one_proc(
-                    self.model, np.transpose(test_X[series_name].to_numpy()).T
-                )
+                transformed_train_column = self.glove_run_one_proc(np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = self.glove_run_one_proc(np.transpose(test_X[series_name].to_numpy()).T)
 
             if transformed_train_column.any() and transformed_test_column.any():
                 transformed_train_column = pd.DataFrame(transformed_train_column)
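Note on the WHITE_LIST_LLM literal in constants.py above: each model id must carry its own trailing comma, because Python silently concatenates adjacent string literals into a single element. A minimal, standalone illustration of the pitfall (plain Python, nothing from the repo required):

# Adjacent string literals fuse into one list element (implicit concatenation).
broken = ["gpt-4o-2024-08-06" "anthropic.claude-3-5-sonnet-20240620-v1:0"]
fixed = ["gpt-4o-2024-08-06", "anthropic.claude-3-5-sonnet-20240620-v1:0"]

assert len(broken) == 1  # one fused, unusable model id
assert len(fixed) == 2  # two distinct model ids, as intended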