From ff3de2b5c9c063b16a65ef88cb783f4d75889529 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 14:18:31 +0100 Subject: [PATCH 1/8] improve pvc name error message by failing early and clear message with correct name example Signed-off-by: mahdikhashan --- .../kubeflow/katib/api/katib_client.py | 371 +++++++++--------- .../v1beta1/kubeflow/katib/utils/utils.py | 6 + 2 files changed, 197 insertions(+), 180 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index b641800290f..dffede0d0be 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -33,11 +33,11 @@ class KatibClient(object): def __init__( - self, - config_file: Optional[str] = None, - context: Optional[str] = None, - client_configuration: Optional[client.Configuration] = None, - namespace: str = utils.get_default_target_namespace(), + self, + config_file: Optional[str] = None, + context: Optional[str] = None, + client_configuration: Optional[client.Configuration] = None, + namespace: str = utils.get_default_target_namespace(), ): """KatibClient constructor. Configure logging in your application as follows to see detailed information from the KatibClient APIs: @@ -89,9 +89,9 @@ def _is_ipython(self): return True def create_experiment( - self, - experiment: models.V1beta1Experiment, - namespace: Optional[str] = None, + self, + experiment: models.V1beta1Experiment, + namespace: Optional[str] = None, ): """Create the Katib Experiment. @@ -164,46 +164,46 @@ def create_experiment( ) def tune( - self, - # TODO (andreyvelich): How to be consistent with other APIs (name) ? - name: str, - model_provider_parameters: Optional[ - "HuggingFaceModelParams" # noqa: F821 - ] = None, - dataset_provider_parameters: Optional[ - Union["HuggingFaceDatasetParams", "S3DatasetParams"] # noqa: F821 - ] = None, - trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 - storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { - "size": constants.PVC_DEFAULT_SIZE, - "storage_class": None, - "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, - }, - objective: Optional[Callable] = None, - base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - parameters: Optional[Dict[str, Any]] = None, - namespace: Optional[str] = None, - env_per_trial: Optional[ - Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] - ] = None, - algorithm_name: str = "random", - algorithm_settings: Union[ - dict, List[models.V1beta1AlgorithmSetting], None - ] = None, - objective_metric_name: str = None, - additional_metric_names: List[str] = [], - objective_type: str = "maximize", - objective_goal: float = None, - max_trial_count: int = None, - parallel_trial_count: int = None, - max_failed_trial_count: int = None, - resources_per_trial: Optional[ - Union[dict, client.V1ResourceRequirements, TrainerResources] - ] = None, - retain_trials: bool = False, - packages_to_install: List[str] = None, - pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + self, + # TODO (andreyvelich): How to be consistent with other APIs (name) ? + name: str, + model_provider_parameters: Optional[ + "HuggingFaceModelParams" # noqa: F821 + ] = None, + dataset_provider_parameters: Optional[ + Union["HuggingFaceDatasetParams", "S3DatasetParams"] # noqa: F821 + ] = None, + trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + "size": constants.PVC_DEFAULT_SIZE, + "storage_class": None, + "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, + }, + objective: Optional[Callable] = None, + base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, + parameters: Optional[Dict[str, Any]] = None, + namespace: Optional[str] = None, + env_per_trial: Optional[ + Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] + ] = None, + algorithm_name: str = "random", + algorithm_settings: Union[ + dict, List[models.V1beta1AlgorithmSetting], None + ] = None, + objective_metric_name: str = None, + additional_metric_names: List[str] = [], + objective_type: str = "maximize", + objective_goal: float = None, + max_trial_count: int = None, + parallel_trial_count: int = None, + max_failed_trial_count: int = None, + resources_per_trial: Optional[ + Union[dict, client.V1ResourceRequirements, TrainerResources] + ] = None, + retain_trials: bool = False, + packages_to_install: List[str] = None, + pip_index_url: str = "https://pypi.org/simple", + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """ Create HyperParameter Tuning Katib Experiment using one of the following @@ -349,19 +349,19 @@ class name in this argument. """ if ( - ( - model_provider_parameters is not None - or dataset_provider_parameters is not None - or trainer_parameters is not None - ) - and (objective is not None or parameters is not None) + ( + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) + and (objective is not None or parameters is not None) ) or ( - ( - model_provider_parameters is None - and dataset_provider_parameters is None - and trainer_parameters is None - ) - and (objective is None and parameters is None) + ( + model_provider_parameters is None + and dataset_provider_parameters is None + and trainer_parameters is None + ) + and (objective is None and parameters is None) ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " @@ -506,9 +506,9 @@ class name in this argument. # If users choose to use external models and datasets. else: if ( - not model_provider_parameters - or not dataset_provider_parameters - or not trainer_parameters + not model_provider_parameters + or not dataset_provider_parameters + or not trainer_parameters ): raise ValueError("One of the required parameters is None") @@ -560,6 +560,17 @@ class name in this argument. # Create PVC for the Storage Initializer. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: + if not utils.is_valid_pvc_name(name): + raise ValueError(f""" + Invalid PVC name '{name}'. It must comply with RFC 1123. + + A lowercase RFC 1123 subdomain must consist of lowercase alphanumeric characters, '-' or '.', + and must start and end with an alphanumeric character. + For example, 'example.com' is valid. + The regex used for validation is: + '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' + """) + self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, body=training_utils.get_pvc_spec( @@ -732,10 +743,10 @@ class name in this argument. self.create_experiment(experiment, namespace) def get_experiment( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Experiment. @@ -774,9 +785,9 @@ def get_experiment( raise RuntimeError(f"Failed to get Katib Experiment: {namespace}/{name}") def list_experiments( - self, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Katib Experiments in namespace. @@ -823,11 +834,11 @@ def list_experiments( return result def get_experiment_conditions( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Experiment conditions. Experiment is in the condition when `status` is True for the appropriate condition `type`. @@ -856,20 +867,20 @@ def get_experiment_conditions( experiment = self.get_experiment(name, namespace, timeout) if ( - experiment.status - and experiment.status.conditions - and len(experiment.status.conditions) > 0 + experiment.status + and experiment.status.conditions + and len(experiment.status.conditions) > 0 ): return experiment.status.conditions return [] def is_experiment_created( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Created. @@ -896,11 +907,11 @@ def is_experiment_created( ) def is_experiment_running( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Running. @@ -927,11 +938,11 @@ def is_experiment_running( ) def is_experiment_restarting( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Restarting. Args: @@ -957,11 +968,11 @@ def is_experiment_restarting( ) def is_experiment_succeeded( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Succeeded. Args: @@ -987,11 +998,11 @@ def is_experiment_succeeded( ) def is_experiment_failed( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Failed. Args: @@ -1017,13 +1028,13 @@ def is_experiment_failed( ) def wait_for_experiment_condition( - self, - name: str, - namespace: Optional[str] = None, - expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, - polling_interval: int = 15, - apiserver_timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, + timeout: int = 600, + polling_interval: int = 15, + apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): """Wait until Experiment reaches specific condition. By default it waits for the Succeeded condition. @@ -1055,10 +1066,10 @@ def wait_for_experiment_condition( # Wait for Failed condition. if ( - expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_FAILED + and self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1068,7 +1079,7 @@ def wait_for_experiment_condition( # Raise exception if Experiment is Failed. elif self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout + name, namespace, experiment, apiserver_timeout ): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " @@ -1077,10 +1088,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_CREATED + and self.is_experiment_created( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1090,10 +1101,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Running condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_RUNNING + and self.is_experiment_running( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1103,10 +1114,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Restarting condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING + and self.is_experiment_restarting( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1116,10 +1127,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Succeeded condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED + and self.is_experiment_succeeded( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) @@ -1143,13 +1154,13 @@ def wait_for_experiment_condition( ) def edit_experiment_budget( - self, - name: str, - namespace: Optional[str] = None, - max_trial_count: int = None, - parallel_trial_count: int = None, - max_failed_trial_count: int = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + max_trial_count: int = None, + parallel_trial_count: int = None, + max_failed_trial_count: int = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Update Experiment budget for the running Trials. You can modify Trial budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` @@ -1179,9 +1190,9 @@ def edit_experiment_budget( # The new Trial budget must be set. if ( - max_trial_count is None - and parallel_trial_count is None - and max_failed_trial_count is None + max_trial_count is None + and parallel_trial_count is None + and max_failed_trial_count is None ): raise ValueError( "Invalid input arguments. " @@ -1216,10 +1227,10 @@ def edit_experiment_budget( logger.debug(f"Experiment {namespace}/{name} has been updated") def delete_experiment( - self, - name: str, - namespace: Optional[str] = None, - delete_options: client.V1DeleteOptions = None, + self, + name: str, + namespace: Optional[str] = None, + delete_options: client.V1DeleteOptions = None, ): """Delete the Katib Experiment. @@ -1255,10 +1266,10 @@ def delete_experiment( logger.debug(f"Experiment {namespace}/{name} has been deleted") def get_suggestion( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Suggestion. @@ -1297,9 +1308,9 @@ def get_suggestion( raise RuntimeError(f"Failed to get Katib Suggestion: {namespace}/{name}") def list_suggestions( - self, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Katib Suggestion in namespace. @@ -1346,10 +1357,10 @@ def list_suggestions( return result def get_trial( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Trial. @@ -1388,10 +1399,10 @@ def get_trial( raise RuntimeError(f"Failed to get Katib Trial: {namespace}/{name}") def list_trials( - self, - experiment_name: str = None, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + experiment_name: str = None, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Trials in namespace. If Experiment name is set, it returns all Trials belong to the Experiment. @@ -1448,10 +1459,10 @@ def list_trials( return result def get_success_trial_details( - self, - experiment_name: str = None, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + experiment_name: str = None, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Succeeded Trial details. If Experiment name is set, it returns Succeeded Trials details belong to the Experiment. @@ -1498,12 +1509,12 @@ def get_success_trial_details( utils.FakeResponse(item), models.V1beta1Trial ) if ( - trial.status - and trial.status.conditions - and len(trial.status.conditions) > 0 + trial.status + and trial.status.conditions + and len(trial.status.conditions) > 0 ): if utils.has_condition( - trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED + trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name @@ -1521,10 +1532,10 @@ def get_success_trial_details( return result def get_optimal_hyperparameters( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the current optimal Trial from the Experiment. @@ -1547,20 +1558,20 @@ def get_optimal_hyperparameters( experiment = self.get_experiment(name, namespace, timeout) if ( - experiment.status - and experiment.status.current_optimal_trial - and experiment.status.current_optimal_trial.observation.metrics + experiment.status + and experiment.status.current_optimal_trial + and experiment.status.current_optimal_trial.observation.metrics ): return experiment.status.current_optimal_trial else: return None def get_trial_metrics( - self, - name: str, - namespace: Optional[str] = None, - db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, - timeout: str = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, + timeout: str = constants.DEFAULT_TIMEOUT, ): """Get the Trial Metric Results from the Katib DB. Katib DB Manager service should be accessible while calling this API. diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 28f3126bbfa..c740b46fdf4 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -17,6 +17,7 @@ import json import logging import os +import re import textwrap from typing import Any, Callable, Dict, List, Optional, Union @@ -267,3 +268,8 @@ def get_exec_script_from_objective( # Return executable script to execute objective function. return exec_script + + +def is_valid_pvc_name(name: str) -> bool: + # RFC 1123 regex for valid PVC names: lowercase alphanumeric, '-', or '.'. + return bool(re.match(r'^[a-z0-9]([a-z0-9\-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9\-]*[a-z0-9])?)*$', name)) From eea6de6c822f9037244ef1fe2aee3093afed4398 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 14:30:34 +0100 Subject: [PATCH 2/8] fix lint Signed-off-by: mahdikhashan --- .../kubeflow/katib/api/katib_client.py | 366 +++++++++--------- .../v1beta1/kubeflow/katib/utils/utils.py | 6 +- 2 files changed, 189 insertions(+), 183 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index dffede0d0be..755b329cabc 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -33,11 +33,11 @@ class KatibClient(object): def __init__( - self, - config_file: Optional[str] = None, - context: Optional[str] = None, - client_configuration: Optional[client.Configuration] = None, - namespace: str = utils.get_default_target_namespace(), + self, + config_file: Optional[str] = None, + context: Optional[str] = None, + client_configuration: Optional[client.Configuration] = None, + namespace: str = utils.get_default_target_namespace(), ): """KatibClient constructor. Configure logging in your application as follows to see detailed information from the KatibClient APIs: @@ -89,9 +89,9 @@ def _is_ipython(self): return True def create_experiment( - self, - experiment: models.V1beta1Experiment, - namespace: Optional[str] = None, + self, + experiment: models.V1beta1Experiment, + namespace: Optional[str] = None, ): """Create the Katib Experiment. @@ -164,46 +164,46 @@ def create_experiment( ) def tune( - self, - # TODO (andreyvelich): How to be consistent with other APIs (name) ? - name: str, - model_provider_parameters: Optional[ - "HuggingFaceModelParams" # noqa: F821 - ] = None, - dataset_provider_parameters: Optional[ - Union["HuggingFaceDatasetParams", "S3DatasetParams"] # noqa: F821 - ] = None, - trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 - storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { - "size": constants.PVC_DEFAULT_SIZE, - "storage_class": None, - "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, - }, - objective: Optional[Callable] = None, - base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, - parameters: Optional[Dict[str, Any]] = None, - namespace: Optional[str] = None, - env_per_trial: Optional[ - Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] - ] = None, - algorithm_name: str = "random", - algorithm_settings: Union[ - dict, List[models.V1beta1AlgorithmSetting], None - ] = None, - objective_metric_name: str = None, - additional_metric_names: List[str] = [], - objective_type: str = "maximize", - objective_goal: float = None, - max_trial_count: int = None, - parallel_trial_count: int = None, - max_failed_trial_count: int = None, - resources_per_trial: Optional[ - Union[dict, client.V1ResourceRequirements, TrainerResources] - ] = None, - retain_trials: bool = False, - packages_to_install: List[str] = None, - pip_index_url: str = "https://pypi.org/simple", - metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, + self, + # TODO (andreyvelich): How to be consistent with other APIs (name) ? + name: str, + model_provider_parameters: Optional[ + "HuggingFaceModelParams" # noqa: F821 + ] = None, + dataset_provider_parameters: Optional[ + Union["HuggingFaceDatasetParams", "S3DatasetParams"] # noqa: F821 + ] = None, + trainer_parameters: Optional["HuggingFaceTrainerParams"] = None, # noqa: F821 + storage_config: Optional[Dict[str, Optional[Union[str, List[str]]]]] = { + "size": constants.PVC_DEFAULT_SIZE, + "storage_class": None, + "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, + }, + objective: Optional[Callable] = None, + base_image: Optional[str] = constants.BASE_IMAGE_TENSORFLOW, + parameters: Optional[Dict[str, Any]] = None, + namespace: Optional[str] = None, + env_per_trial: Optional[ + Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]] + ] = None, + algorithm_name: str = "random", + algorithm_settings: Union[ + dict, List[models.V1beta1AlgorithmSetting], None + ] = None, + objective_metric_name: str = None, + additional_metric_names: List[str] = [], + objective_type: str = "maximize", + objective_goal: float = None, + max_trial_count: int = None, + parallel_trial_count: int = None, + max_failed_trial_count: int = None, + resources_per_trial: Optional[ + Union[dict, client.V1ResourceRequirements, TrainerResources] + ] = None, + retain_trials: bool = False, + packages_to_install: List[str] = None, + pip_index_url: str = "https://pypi.org/simple", + metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, ): """ Create HyperParameter Tuning Katib Experiment using one of the following @@ -349,19 +349,19 @@ class name in this argument. """ if ( - ( - model_provider_parameters is not None - or dataset_provider_parameters is not None - or trainer_parameters is not None - ) - and (objective is not None or parameters is not None) + ( + model_provider_parameters is not None + or dataset_provider_parameters is not None + or trainer_parameters is not None + ) + and (objective is not None or parameters is not None) ) or ( - ( - model_provider_parameters is None - and dataset_provider_parameters is None - and trainer_parameters is None - ) - and (objective is None and parameters is None) + ( + model_provider_parameters is None + and dataset_provider_parameters is None + and trainer_parameters is None + ) + and (objective is None and parameters is None) ): raise ValueError( "Invalid configuration for creating a Katib Experiment for hyperparameter " @@ -506,9 +506,9 @@ class name in this argument. # If users choose to use external models and datasets. else: if ( - not model_provider_parameters - or not dataset_provider_parameters - or not trainer_parameters + not model_provider_parameters + or not dataset_provider_parameters + or not trainer_parameters ): raise ValueError("One of the required parameters is None") @@ -561,7 +561,8 @@ class name in this argument. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: if not utils.is_valid_pvc_name(name): - raise ValueError(f""" + raise ValueError( + f""" Invalid PVC name '{name}'. It must comply with RFC 1123. A lowercase RFC 1123 subdomain must consist of lowercase alphanumeric characters, '-' or '.', @@ -569,7 +570,8 @@ class name in this argument. For example, 'example.com' is valid. The regex used for validation is: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' - """) + """ + ) self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, @@ -743,10 +745,10 @@ class name in this argument. self.create_experiment(experiment, namespace) def get_experiment( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Experiment. @@ -785,9 +787,9 @@ def get_experiment( raise RuntimeError(f"Failed to get Katib Experiment: {namespace}/{name}") def list_experiments( - self, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Katib Experiments in namespace. @@ -834,11 +836,11 @@ def list_experiments( return result def get_experiment_conditions( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Experiment conditions. Experiment is in the condition when `status` is True for the appropriate condition `type`. @@ -867,20 +869,20 @@ def get_experiment_conditions( experiment = self.get_experiment(name, namespace, timeout) if ( - experiment.status - and experiment.status.conditions - and len(experiment.status.conditions) > 0 + experiment.status + and experiment.status.conditions + and len(experiment.status.conditions) > 0 ): return experiment.status.conditions return [] def is_experiment_created( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Created. @@ -907,11 +909,11 @@ def is_experiment_created( ) def is_experiment_running( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Running. @@ -938,11 +940,11 @@ def is_experiment_running( ) def is_experiment_restarting( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Restarting. Args: @@ -968,11 +970,11 @@ def is_experiment_restarting( ) def is_experiment_succeeded( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Succeeded. Args: @@ -998,11 +1000,11 @@ def is_experiment_succeeded( ) def is_experiment_failed( - self, - name: str, - namespace: Optional[str] = None, - experiment: models.V1beta1Experiment = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + experiment: models.V1beta1Experiment = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Check if Experiment is Failed. Args: @@ -1028,13 +1030,13 @@ def is_experiment_failed( ) def wait_for_experiment_condition( - self, - name: str, - namespace: Optional[str] = None, - expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, - timeout: int = 600, - polling_interval: int = 15, - apiserver_timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + expected_condition: str = constants.EXPERIMENT_CONDITION_SUCCEEDED, + timeout: int = 600, + polling_interval: int = 15, + apiserver_timeout: int = constants.DEFAULT_TIMEOUT, ): """Wait until Experiment reaches specific condition. By default it waits for the Succeeded condition. @@ -1066,10 +1068,10 @@ def wait_for_experiment_condition( # Wait for Failed condition. if ( - expected_condition == constants.EXPERIMENT_CONDITION_FAILED - and self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_FAILED + and self.is_experiment_failed( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1079,7 +1081,7 @@ def wait_for_experiment_condition( # Raise exception if Experiment is Failed. elif self.is_experiment_failed( - name, namespace, experiment, apiserver_timeout + name, namespace, experiment, apiserver_timeout ): raise RuntimeError( f"Experiment: {namespace}/{name} is Failed. " @@ -1088,10 +1090,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Created condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_CREATED - and self.is_experiment_created( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_CREATED + and self.is_experiment_created( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1101,10 +1103,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Running condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_RUNNING - and self.is_experiment_running( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_RUNNING + and self.is_experiment_running( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1114,10 +1116,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Restarting condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING - and self.is_experiment_restarting( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_RESTARTING + and self.is_experiment_restarting( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) logger.debug( @@ -1127,10 +1129,10 @@ def wait_for_experiment_condition( # Check if Experiment reaches Succeeded condition. elif ( - expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED - and self.is_experiment_succeeded( - name, namespace, experiment, apiserver_timeout - ) + expected_condition == constants.EXPERIMENT_CONDITION_SUCCEEDED + and self.is_experiment_succeeded( + name, namespace, experiment, apiserver_timeout + ) ): utils.print_experiment_status(experiment) @@ -1154,13 +1156,13 @@ def wait_for_experiment_condition( ) def edit_experiment_budget( - self, - name: str, - namespace: Optional[str] = None, - max_trial_count: int = None, - parallel_trial_count: int = None, - max_failed_trial_count: int = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + max_trial_count: int = None, + parallel_trial_count: int = None, + max_failed_trial_count: int = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Update Experiment budget for the running Trials. You can modify Trial budget to resume Succeeded Experiments with `LongRunning` and `FromVolume` @@ -1190,9 +1192,9 @@ def edit_experiment_budget( # The new Trial budget must be set. if ( - max_trial_count is None - and parallel_trial_count is None - and max_failed_trial_count is None + max_trial_count is None + and parallel_trial_count is None + and max_failed_trial_count is None ): raise ValueError( "Invalid input arguments. " @@ -1227,10 +1229,10 @@ def edit_experiment_budget( logger.debug(f"Experiment {namespace}/{name} has been updated") def delete_experiment( - self, - name: str, - namespace: Optional[str] = None, - delete_options: client.V1DeleteOptions = None, + self, + name: str, + namespace: Optional[str] = None, + delete_options: client.V1DeleteOptions = None, ): """Delete the Katib Experiment. @@ -1266,10 +1268,10 @@ def delete_experiment( logger.debug(f"Experiment {namespace}/{name} has been deleted") def get_suggestion( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Suggestion. @@ -1308,9 +1310,9 @@ def get_suggestion( raise RuntimeError(f"Failed to get Katib Suggestion: {namespace}/{name}") def list_suggestions( - self, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Katib Suggestion in namespace. @@ -1357,10 +1359,10 @@ def list_suggestions( return result def get_trial( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Katib Trial. @@ -1399,10 +1401,10 @@ def get_trial( raise RuntimeError(f"Failed to get Katib Trial: {namespace}/{name}") def list_trials( - self, - experiment_name: str = None, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + experiment_name: str = None, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """List of all Trials in namespace. If Experiment name is set, it returns all Trials belong to the Experiment. @@ -1459,10 +1461,10 @@ def list_trials( return result def get_success_trial_details( - self, - experiment_name: str = None, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + experiment_name: str = None, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the Succeeded Trial details. If Experiment name is set, it returns Succeeded Trials details belong to the Experiment. @@ -1509,12 +1511,12 @@ def get_success_trial_details( utils.FakeResponse(item), models.V1beta1Trial ) if ( - trial.status - and trial.status.conditions - and len(trial.status.conditions) > 0 + trial.status + and trial.status.conditions + and len(trial.status.conditions) > 0 ): if utils.has_condition( - trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED + trial.status.conditions, constants.TRIAL_CONDITION_SUCCEEDED ): output = {} output["name"] = trial.metadata.name @@ -1532,10 +1534,10 @@ def get_success_trial_details( return result def get_optimal_hyperparameters( - self, - name: str, - namespace: Optional[str] = None, - timeout: int = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, ): """Get the current optimal Trial from the Experiment. @@ -1558,20 +1560,20 @@ def get_optimal_hyperparameters( experiment = self.get_experiment(name, namespace, timeout) if ( - experiment.status - and experiment.status.current_optimal_trial - and experiment.status.current_optimal_trial.observation.metrics + experiment.status + and experiment.status.current_optimal_trial + and experiment.status.current_optimal_trial.observation.metrics ): return experiment.status.current_optimal_trial else: return None def get_trial_metrics( - self, - name: str, - namespace: Optional[str] = None, - db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, - timeout: str = constants.DEFAULT_TIMEOUT, + self, + name: str, + namespace: Optional[str] = None, + db_manager_address: str = constants.DEFAULT_DB_MANAGER_ADDRESS, + timeout: str = constants.DEFAULT_TIMEOUT, ): """Get the Trial Metric Results from the Katib DB. Katib DB Manager service should be accessible while calling this API. diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index c740b46fdf4..4a50bb76b24 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -272,4 +272,8 @@ def get_exec_script_from_objective( def is_valid_pvc_name(name: str) -> bool: # RFC 1123 regex for valid PVC names: lowercase alphanumeric, '-', or '.'. - return bool(re.match(r'^[a-z0-9]([a-z0-9\-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9\-]*[a-z0-9])?)*$', name)) + return bool( + re.match( + r"^[a-z0-9]([a-z0-9\-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9\-]*[a-z0-9])?)*$", name + ) + ) From 33028c16c5d6a6d7b49b7dc018bc7fcd22a2d54f Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 14:46:45 +0100 Subject: [PATCH 3/8] fix lint Signed-off-by: mahdikhashan --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 755b329cabc..4b39d6427d9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -564,8 +564,9 @@ class name in this argument. raise ValueError( f""" Invalid PVC name '{name}'. It must comply with RFC 1123. - - A lowercase RFC 1123 subdomain must consist of lowercase alphanumeric characters, '-' or '.', + + A lowercase RFC 1123 subdomain must consist of lowercase + alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character. For example, 'example.com' is valid. The regex used for validation is: From 516b4b5f2b90d8b626b7abcf223858632e3bb3a4 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 27 Jan 2025 19:26:14 +0100 Subject: [PATCH 4/8] raise value error for wrong name format by reconciliation Signed-off-by: mahdikhashan --- .../kubeflow/katib/api/katib_client.py | 19 +++++-------------- .../kubeflow/katib/api/katib_client_test.py | 7 +++++++ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 4b39d6427d9..97b58576cb0 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -560,20 +560,6 @@ class name in this argument. # Create PVC for the Storage Initializer. # TODO (helenxie-bit): PVC Creation should be part of Katib Controller. try: - if not utils.is_valid_pvc_name(name): - raise ValueError( - f""" - Invalid PVC name '{name}'. It must comply with RFC 1123. - - A lowercase RFC 1123 subdomain must consist of lowercase - alphanumeric characters, '-' or '.', - and must start and end with an alphanumeric character. - For example, 'example.com' is valid. - The regex used for validation is: - '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' - """ - ) - self.core_api.create_namespaced_persistent_volume_claim( namespace=namespace, body=training_utils.get_pvc_spec( @@ -583,6 +569,11 @@ class name in this argument. ), ) except Exception as e: + if hasattr(e, "status") and e.status == 422: + raise ValueError( + f"An Experiment with the name {name} is not valid." + ) + pvc_list = self.core_api.list_namespaced_persistent_volume_claim( namespace=namespace ) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 0a78d75f3bb..674a5bdad61 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -310,6 +310,13 @@ def create_experiment( }, ValueError, ), + ( + "wrong name format", + { + "name": "Llama3.1-fine-tune", + }, + ValueError, + ), ( "invalid hybrid parameters - objective and model_provider_parameters", { From 651b2315710135bc96fa0287fe65d69a203500f3 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 27 Jan 2025 19:37:15 +0100 Subject: [PATCH 5/8] revert created utils Signed-off-by: mahdikhashan --- sdk/python/v1beta1/kubeflow/katib/utils/utils.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py index 4a50bb76b24..28f3126bbfa 100644 --- a/sdk/python/v1beta1/kubeflow/katib/utils/utils.py +++ b/sdk/python/v1beta1/kubeflow/katib/utils/utils.py @@ -17,7 +17,6 @@ import json import logging import os -import re import textwrap from typing import Any, Callable, Dict, List, Optional, Union @@ -268,12 +267,3 @@ def get_exec_script_from_objective( # Return executable script to execute objective function. return exec_script - - -def is_valid_pvc_name(name: str) -> bool: - # RFC 1123 regex for valid PVC names: lowercase alphanumeric, '-', or '.'. - return bool( - re.match( - r"^[a-z0-9]([a-z0-9\-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9\-]*[a-z0-9])?)*$", name - ) - ) From 86a1841bbe71f8a75f2963d9e0af34c98e518300 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 27 Jan 2025 20:40:45 +0100 Subject: [PATCH 6/8] improve test case name Signed-off-by: mahdikhashan --- sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 674a5bdad61..f6a17017c34 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -311,7 +311,7 @@ def create_experiment( ValueError, ), ( - "wrong name format", + "invalid name format", { "name": "Llama3.1-fine-tune", }, From d6c7319b477ead767f42e8edaa8f7d66e0dc0517 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 27 Jan 2025 21:06:42 +0100 Subject: [PATCH 7/8] improve value error message Signed-off-by: mahdikhashan --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 97b58576cb0..f9573473bcd 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -571,7 +571,9 @@ class name in this argument. except Exception as e: if hasattr(e, "status") and e.status == 422: raise ValueError( - f"An Experiment with the name {name} is not valid." + f"The Experiment name '{name}' is invalid. It must use only lowercase " + f"alphanumeric characters ('a-z', '0-9'), hyphens ('-'), or periods ('.'). " + f"It must also start and end with an alphanumeric character." ) pvc_list = self.core_api.list_namespaced_persistent_volume_claim( From 7a03bce8f77916b80e5eefd17efc0796c270ee16 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 27 Jan 2025 23:34:56 +0100 Subject: [PATCH 8/8] improve code flow Signed-off-by: mahdikhashan --- .../v1beta1/kubeflow/katib/api/katib_client.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index f9573473bcd..caf2a45aaca 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -575,17 +575,8 @@ class name in this argument. f"alphanumeric characters ('a-z', '0-9'), hyphens ('-'), or periods ('.'). " f"It must also start and end with an alphanumeric character." ) - - pvc_list = self.core_api.list_namespaced_persistent_volume_claim( - namespace=namespace - ) - # Check if the PVC with the specified name exists. - for pvc in pvc_list.items: - if pvc.metadata.name == name: - print( - f"PVC '{name}' already exists in namespace " f"{namespace}." - ) - break + elif hasattr(e, "status") and e.status == 409: + print(f"PVC '{name}' already exists in namespace " f"{namespace}.") else: raise RuntimeError(f"failed to create PVC. Error: {e}")