fix various issues #99

Merged
merged 9 commits on Nov 11, 2024
13 changes: 2 additions & 11 deletions src/autogluon_assistant/assistant.py
@@ -1,5 +1,4 @@
import logging
import os
import signal
from typing import Any, Dict, Union

@@ -102,15 +101,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
task = self.inference_task(task)
if self.feature_transformers_config:
logger.info("Automatic feature generation starts...")
if "OPENAI_API_KEY" not in os.environ:
logger.info("No OpenAI API keys found, therefore, skip CAAFE")
fe_transformers = [
instantiate(ft_config)
for ft_config in self.feature_transformers_config
if ft_config["_target_"] != "autogluon_assistant.transformer.CAAFETransformer"
]
else:
fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
for fe_transformer in fe_transformers:
try:
with timeout(
@@ -122,7 +113,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
self.handle_exception(f"Task preprocessing: {fe_transformer.name}", e)
logger.info("Automatic feature generation complete!")
else:
logger.info("Automatic feature generation is disabled.")
logger.info("Automatic feature generation is disabled. ")
return task

def fit_predictor(self, task: TabularPredictionTask):
7 changes: 7 additions & 0 deletions src/autogluon_assistant/constants.py
@@ -71,3 +71,10 @@
MULTICLASS: ROC_AUC,
REGRESSION: ROOT_MEAN_SQUARED_ERROR,
}

WHITE_LIST_LLM = [
Collaborator: @boranhan why should we limit users to GPT-4o only? What if the user wants to run GPT-3.5 Turbo, since it is cheaper? Same with Claude?

Collaborator (Author): The user can use GPT-3.5, but we didn't test it, so it will produce a warning.

"anthropic.claude-3-5-sonnet-20241022-v2:0",
"meta.llama3-1-405b-instruct-v1:0",
"anthropic.claude-3-5-haiku-20241022-v1:0",
"gpt-4o-2024-08-06" "anthropic.claude-3-5-sonnet-20240620-v1:0",
]
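
For context on the reviewer exchange above: a minimal sketch of the intended behavior, assuming the WHITE_LIST_LLM constant added in this PR. The warn_if_unlisted helper is hypothetical and only mirrors the check added to LLMFactory.get_chat_model further down; untested models such as GPT-3.5 are not blocked, they only log a warning.

# Hypothetical sketch, not part of this PR: unlisted models still run, they only warn.
import logging

from autogluon_assistant.constants import WHITE_LIST_LLM

logger = logging.getLogger(__name__)


def warn_if_unlisted(model: str) -> None:
    # Mirrors the whitelist check in LLMFactory.get_chat_model: warn, do not raise.
    if model not in WHITE_LIST_LLM:
        logger.warning(f"{model} is not on the white list. Our white list models include {WHITE_LIST_LLM}")


warn_if_unlisted("gpt-3.5-turbo")      # untested model: warning is logged, the run continues
warn_if_unlisted("gpt-4o-2024-08-06")  # whitelisted model: no warning
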
11 changes: 6 additions & 5 deletions src/autogluon_assistant/llm/llm.py
@@ -13,6 +13,8 @@
from pydantic import BaseModel, Field
from tenacity import retry, stop_after_attempt, wait_exponential

from ..constants import WHITE_LIST_LLM

logger = logging.getLogger(__name__)


@@ -115,11 +117,7 @@ def get_bedrock_models() -> List[str]:
try:
bedrock = boto3.client("bedrock", region_name="us-west-2")
response = bedrock.list_foundation_models()
return [
model["modelId"]
for model in response["modelSummaries"]
if model["modelId"].startswith("anthropic.claude")
]
return [model["modelId"] for model in response["modelSummaries"]]
except Exception as e:
print(f"Error fetching Bedrock models: {e}")
return []
@@ -181,6 +179,9 @@ def get_chat_model(cls, config: DictConfig) -> Union[AssistantChatOpenAI, Assist
config.model in valid_models
), f"{config.model} is not a valid model in: {valid_models} for provider {config.provider}"

if config.model not in WHITE_LIST_LLM:
logger.warning(f"{config.model} is not on the white list. Our white list models include {WHITE_LIST_LLM}")

if config.provider == "openai":
return LLMFactory._get_openai_chat_model(config)
elif config.provider == "bedrock":
@@ -29,48 +29,46 @@ def get_device_info():
return DeviceInfo(cpu_count, gpu_devices)


def huggingface_run(model, data):
if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
data = np.where(pd.isna(data), "", data)
return model.encode(data).astype("float32")
else:
return np.zeros(len(data))


def glove_run_one_proc(model, data):
embeddings = []
if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
for text in data:
token_list = list(tokenize(text))
embed = model.get_mean_vector(token_list)
embeddings.append(embed)
else:
return np.zeros(len(data))
return np.stack(embeddings).astype("float32")


class PretrainedEmbeddingTransformer(BaseFeatureTransformer):
def __init__(self, model_name, **kwargs) -> None:
self.model_name = model_name
if torch.cuda.is_available():
try:
self.model = SentenceTransformer(self.model_name)
except:
logger.warning(f"No model {self.model_name} is found.")
self.model_name = model_name

else:
if not torch.cuda.is_available():
logger.warning("CUDA is not found. For an optimized user experience, we switched to the glove embeddings")
self.model_name = "glove-wiki-gigaword"
self.dim = 300
self.max_num_procs = 16
self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))

def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
pass

def glove_run_one_proc(self, data):
embeddings = []
if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
try:
self.model = api.load(f"{self.model_name}-{self.dim}")
except:
logger.warning(f"No model {self.model_name}-{self.dim} is found.")
self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))
for text in data:
token_list = list(tokenize(text))
embed = self.model.get_mean_vector(token_list)
embeddings.append(embed)
else:
return np.zeros(len(data))
return np.stack(embeddings).astype("float32")

def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
pass
def huggingface_run(self, data):
if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
try:
self.model = SentenceTransformer(self.model_name)
except:
logger.warning(f"No model {self.model_name} is found.")
data = np.where(pd.isna(data), "", data)
return self.model.encode(data).astype("float32")
else:
return np.zeros(len(data))

def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
assert (
@@ -79,15 +77,11 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) ->

for series_name in train_X.columns.values.tolist():
if torch.cuda.is_available():
transformed_train_column = huggingface_run(self.model, np.transpose(train_X[series_name].to_numpy()).T)
transformed_test_column = huggingface_run(self.model, np.transpose(test_X[series_name].to_numpy()).T)
transformed_train_column = self.huggingface_run(np.transpose(train_X[series_name].to_numpy()).T)
transformed_test_column = self.huggingface_run(np.transpose(test_X[series_name].to_numpy()).T)
else:
transformed_train_column = glove_run_one_proc(
self.model, np.transpose(train_X[series_name].to_numpy()).T
)
transformed_test_column = glove_run_one_proc(
self.model, np.transpose(test_X[series_name].to_numpy()).T
)
transformed_train_column = self.glove_run_one_proc(np.transpose(train_X[series_name].to_numpy()).T)
transformed_test_column = self.glove_run_one_proc(np.transpose(test_X[series_name].to_numpy()).T)

if transformed_train_column.any() and transformed_test_column.any():
transformed_train_column = pd.DataFrame(transformed_train_column)