Commit: fix various issues (#99)

boranhan authored Nov 11, 2024
1 parent d554a91, commit b0e469a
Showing 4 changed files with 46 additions and 53 deletions.
13 changes: 2 additions & 11 deletions src/autogluon_assistant/assistant.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import signal
 from typing import Any, Dict, Union

@@ -102,15 +101,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
         task = self.inference_task(task)
         if self.feature_transformers_config:
             logger.info("Automatic feature generation starts...")
-            if "OPENAI_API_KEY" not in os.environ:
-                logger.info("No OpenAI API keys found, therefore, skip CAAFE")
-                fe_transformers = [
-                    instantiate(ft_config)
-                    for ft_config in self.feature_transformers_config
-                    if ft_config["_target_"] != "autogluon_assistant.transformer.CAAFETransformer"
-                ]
-            else:
-                fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
+            fe_transformers = [instantiate(ft_config) for ft_config in self.feature_transformers_config]
             for fe_transformer in fe_transformers:
                 try:
                     with timeout(
@@ -122,7 +113,7 @@ def preprocess_task(self, task: TabularPredictionTask) -> TabularPredictionTask:
                     self.handle_exception(f"Task preprocessing: {fe_transformer.name}", e)
             logger.info("Automatic feature generation complete!")
         else:
-            logger.info("Automatic feature generation is disabled.")
+            logger.info("Automatic feature generation is disabled. ")
         return task
 
     def fit_predictor(self, task: TabularPredictionTask):
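For context, `instantiate(ft_config)` above is Hydra-style instantiation: each entry in feature_transformers_config carries a `_target_` dotted path naming the class to build, and the remaining keys become constructor arguments. A minimal sketch of the mechanism, assuming hydra-core is installed and using an illustrative scikit-learn target rather than a transformer from this repository:

# Hydra resolves _target_ to a class and passes the remaining keys as kwargs.
from hydra.utils import instantiate
from omegaconf import OmegaConf

ft_config = OmegaConf.create(
    {"_target_": "sklearn.preprocessing.StandardScaler", "with_mean": False}
)
scaler = instantiate(ft_config)  # equivalent to StandardScaler(with_mean=False)
print(type(scaler).__name__)  # StandardScaler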
7 changes: 7 additions & 0 deletions src/autogluon_assistant/constants.py
@@ -71,3 +71,10 @@
     MULTICLASS: ROC_AUC,
     REGRESSION: ROOT_MEAN_SQUARED_ERROR,
 }
+
+WHITE_LIST_LLM = [
+    "anthropic.claude-3-5-sonnet-20241022-v2:0",
+    "meta.llama3-1-405b-instruct-v1:0",
+    "anthropic.claude-3-5-haiku-20241022-v1:0",
+    "gpt-4o-2024-08-06" "anthropic.claude-3-5-sonnet-20240620-v1:0",
+]
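One caveat worth flagging in the hunk above: Python silently concatenates adjacent string literals, so the missing comma after "gpt-4o-2024-08-06" merges the last two IDs into a single list entry, and neither model would then pass an `in WHITE_LIST_LLM` membership check. A quick demonstration:

# Adjacent string literals concatenate; a missing comma merges two entries.
models = [
    "gpt-4o-2024-08-06" "anthropic.claude-3-5-sonnet-20240620-v1:0",
]
print(models)
# ['gpt-4o-2024-08-06anthropic.claude-3-5-sonnet-20240620-v1:0']
print("gpt-4o-2024-08-06" in models)  # False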
11 changes: 6 additions & 5 deletions src/autogluon_assistant/llm/llm.py
@@ -13,6 +13,8 @@
 from pydantic import BaseModel, Field
 from tenacity import retry, stop_after_attempt, wait_exponential
 
+from ..constants import WHITE_LIST_LLM
+
 logger = logging.getLogger(__name__)
 
 
@@ -115,11 +117,7 @@ def get_bedrock_models() -> List[str]:
    try:
        bedrock = boto3.client("bedrock", region_name="us-west-2")
        response = bedrock.list_foundation_models()
-        return [
-            model["modelId"]
-            for model in response["modelSummaries"]
-            if model["modelId"].startswith("anthropic.claude")
-        ]
+        return [model["modelId"] for model in response["modelSummaries"]]
    except Exception as e:
        print(f"Error fetching Bedrock models: {e}")
        return []
@@ -181,6 +179,9 @@ def get_chat_model(cls, config: DictConfig) -> Union[AssistantChatOpenAI, AssistantChatBedrock]:
             config.model in valid_models
         ), f"{config.model} is not a valid model in: {valid_models} for provider {config.provider}"
 
+        if config.model not in WHITE_LIST_LLM:
+            logger.warning(f"{config.model} is not on the white list. Our white list models include {WHITE_LIST_LLM}")
+
         if config.provider == "openai":
             return LLMFactory._get_openai_chat_model(config)
         elif config.provider == "bedrock":
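Note that the new guard warns but does not block: a model missing from WHITE_LIST_LLM is still constructed as long as the provider's own model list accepts it. A minimal sketch of that behavior, assuming omegaconf is available and using an illustrative model name:

import logging

from omegaconf import OmegaConf

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

WHITE_LIST_LLM = ["anthropic.claude-3-5-sonnet-20241022-v2:0", "gpt-4o-2024-08-06"]

config = OmegaConf.create({"provider": "openai", "model": "gpt-3.5-turbo"})
if config.model not in WHITE_LIST_LLM:
    # A warning is emitted, but execution continues to provider dispatch.
    logger.warning(f"{config.model} is not on the white list. Our white list models include {WHITE_LIST_LLM}")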
@@ -29,48 +29,46 @@ def get_device_info():
     return DeviceInfo(cpu_count, gpu_devices)
 
 
-def huggingface_run(model, data):
-    if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
-        data = np.where(pd.isna(data), "", data)
-        return model.encode(data).astype("float32")
-    else:
-        return np.zeros(len(data))
-
-
-def glove_run_one_proc(model, data):
-    embeddings = []
-    if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
-        for text in data:
-            token_list = list(tokenize(text))
-            embed = model.get_mean_vector(token_list)
-            embeddings.append(embed)
-    else:
-        return np.zeros(len(data))
-    return np.stack(embeddings).astype("float32")
-
-
 class PretrainedEmbeddingTransformer(BaseFeatureTransformer):
     def __init__(self, model_name, **kwargs) -> None:
-        self.model_name = model_name
-        if torch.cuda.is_available():
-            try:
-                self.model = SentenceTransformer(self.model_name)
-            except:
-                logger.warning(f"No model {self.model_name} is found.")
+        self.model_name = model_name
 
-        else:
+        if not torch.cuda.is_available():
             logger.warning("CUDA is not found. For an optimized user experience, we switched to the glove embeddings")
             self.model_name = "glove-wiki-gigaword"
             self.dim = 300
             self.max_num_procs = 16
+            self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))
+
+    def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
+        pass
+
+    def glove_run_one_proc(self, data):
+        embeddings = []
+        if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
             try:
                 self.model = api.load(f"{self.model_name}-{self.dim}")
             except:
                 logger.warning(f"No model {self.model_name}-{self.dim} is found.")
-        self.cpu_count = int(os.environ.get("NUM_VISIBLE_CPUS", os.cpu_count()))
+            for text in data:
+                token_list = list(tokenize(text))
+                embed = self.model.get_mean_vector(token_list)
+                embeddings.append(embed)
+        else:
+            return np.zeros(len(data))
+        return np.stack(embeddings).astype("float32")
 
-    def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) -> None:
-        pass
+    def huggingface_run(self, data):
+        if all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data):
+            try:
+                self.model = SentenceTransformer(self.model_name)
+            except:
+                logger.warning(f"No model {self.model_name} is found.")
+            data = np.where(pd.isna(data), "", data)
+            return self.model.encode(data).astype("float32")
+        else:
+            return np.zeros(len(data))
 
     def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
         assert (
@@ -79,15 +77,11 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
 
         for series_name in train_X.columns.values.tolist():
             if torch.cuda.is_available():
-                transformed_train_column = huggingface_run(self.model, np.transpose(train_X[series_name].to_numpy()).T)
-                transformed_test_column = huggingface_run(self.model, np.transpose(test_X[series_name].to_numpy()).T)
+                transformed_train_column = self.huggingface_run(np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = self.huggingface_run(np.transpose(test_X[series_name].to_numpy()).T)
             else:
-                transformed_train_column = glove_run_one_proc(
-                    self.model, np.transpose(train_X[series_name].to_numpy()).T
-                )
-                transformed_test_column = glove_run_one_proc(
-                    self.model, np.transpose(test_X[series_name].to_numpy()).T
-                )
+                transformed_train_column = self.glove_run_one_proc(np.transpose(train_X[series_name].to_numpy()).T)
+                transformed_test_column = self.glove_run_one_proc(np.transpose(test_X[series_name].to_numpy()).T)
 
             if transformed_train_column.any() and transformed_test_column.any():
                 transformed_train_column = pd.DataFrame(transformed_train_column)
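Both helper methods share the same text gate: a column is embedded only when every entry is a string and at least one entry has more than 10 space-separated tokens; anything else comes back as zeros, which the .any() check above then discards. A standalone sketch of that gate (looks_like_text is a hypothetical name, not from this file):

def looks_like_text(data) -> bool:
    # Mirrors the condition in glove_run_one_proc / huggingface_run: all
    # entries are strings and at least one exceeds 10 space-separated tokens.
    return all(isinstance(x, str) for x in data) and any(len(x.split(" ")) > 10 for x in data)

print(looks_like_text(["red", "green", "blue"]))  # False -> zeros, column dropped
print(looks_like_text(["one two three four five six seven eight nine ten eleven"]))  # True -> embedded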
