kubeflow · truc0 · Feb 5, 2025 · Mar 13, 2025 · Mar 13, 2025 · Electronic-Waste
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -16,7 +16,7 @@
 import logging
 import multiprocessing
 import time
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, TypedDict, Union
 
 import grpc
 import kubeflow.katib.katib_api_pb2 as katib_api_pb2
@@ -42,6 +42,14 @@
 
 logger = logging.getLogger(__name__)
 
+TuneStoragePerTrialType = TypedDict(
+    "TuneStoragePerTrialType",  # must match the assigned name for type checkers
+    {
+        "volume": Union[client.V1Volume, Dict[str, Any]],  # volume object or spec
+        "mount_path": Union[str, client.V1VolumeMount],  # path or full mount
+    },
+)
+
 
 class KatibClient(object):
     def __init__(
@@ -198,6 +206,7 @@ def tune(
         env_per_trial: Optional[
             Union[Dict[str, str], List[Union[client.V1EnvVar, client.V1EnvFromSource]]]
         ] = None,
+        storage_per_trial: Optional[List[TuneStoragePerTrialType]] = None,
         algorithm_name: str = "random",
         algorithm_settings: Union[
             dict, List[models.V1beta1AlgorithmSetting], None
@@ -288,6 +297,21 @@ class name in this argument.
                 https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1EnvVar.md)
                 or a kubernetes.client.models.V1EnvFromSource (documented here:
                 https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1EnvFromSource.md)
+            storage_per_trial: List of storage configurations for each trial container.
+                Each element in the list should be a dictionary with two keys:
+                - volume: Either a kubernetes.client.V1Volume object or a dictionary
+                  containing volume configuration with required fields:
+                  - name: Name of the volume
+                  - type: One of "pvc", "secret", "configmap", or "empty_dir"
+                  Additional fields based on volume type:
+                  - For pvc: claim_name, read_only (optional)
+                  - For secret: secret_name, items (optional), default_mode (optional),
+                    optional (optional)
+                  - For configmap: configmap_name, items (optional), default_mode
+                    (optional), optional (optional)
+                  - For empty_dir: medium (optional), size_limit (optional)
+                - mount_path: Either a kubernetes.client.V1VolumeMount object or a string
+                  specifying the path where the volume should be mounted in the container
             algorithm_name: Search algorithm for the HyperParameter tuning.
             algorithm_settings: Settings for the search algorithm given.
                 For available fields, check this doc:
@@ -503,21 +527,135 @@ class name in this argument.
             container_spec.env = env if env else None
             container_spec.env_from = env_from if env_from else None
 
+            # Convert storage_per_trial into pod volumes and container volume mounts.
+            volumes: List[client.V1Volume] = []
+            volume_mounts: List[client.V1VolumeMount] = []
+            if storage_per_trial:
+                # A single storage dict is accepted as shorthand for a one-item list.
+                if isinstance(storage_per_trial, dict):
+                    storage_per_trial = [storage_per_trial]
+                elif not isinstance(storage_per_trial, list):
+                    raise ValueError("storage_per_trial must be a list of dictionaries")
+                for storage in storage_per_trial:
+                    volume = None
+                    if isinstance(storage["volume"], client.V1Volume):
+                        volume = storage["volume"]
+                    elif isinstance(storage["volume"], dict):
+                        volume_name = storage["volume"].get("name")
+                        volume_type = storage["volume"].get("type")
+
+                        if not volume_name:
+                            raise ValueError(
+                                "storage_per_trial['volume'] does not have a 'name' key"
+                            )
+                        if not volume_type:
+                            raise ValueError(
+                                "storage_per_trial['volume'] does not have a 'type' key"
+                            )
+
+                        if volume_type == "pvc":
+                            volume_claim_name = storage["volume"].get("claim_name")
+                            if not volume_claim_name:
+                                raise ValueError(
+                                    "storage_per_trial['volume'] should have a "
+                                    "'claim_name' key for type pvc"
+                                )
+                            volume = client.V1Volume(
+                                name=volume_name,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=volume_claim_name,
+                                    read_only=storage["volume"].get("read_only", False),
+                                ),
+                            )
+                        elif volume_type == "secret":
+                            volume = client.V1Volume(
+                                name=volume_name,
+                                secret=client.V1SecretVolumeSource(
+                                    secret_name=storage["volume"].get("secret_name"),
+                                    items=storage["volume"].get("items", None),
+                                    default_mode=storage["volume"].get("default_mode"),
+                                    optional=storage["volume"].get("optional", False),
+                                ),
+                            )
+                        elif volume_type == "configmap":
+                            # kubernetes.client.V1Volume names this field config_map.
+                            volume = client.V1Volume(
+                                name=volume_name,
+                                config_map=client.V1ConfigMapVolumeSource(
+                                    name=storage["volume"].get("configmap_name"),
+                                    items=storage["volume"].get("items", []),
+                                    default_mode=storage["volume"].get("default_mode"),
+                                    optional=storage["volume"].get("optional", False),
+                                ),
+                            )
+                        elif volume_type == "empty_dir":
+                            volume = client.V1Volume(
+                                name=volume_name,
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium=storage["volume"].get("medium", None),
+                                    size_limit=storage["volume"].get("size_limit"),
+                                ),
+                            )
+                        else:
+                            raise ValueError(
+                                "storage_per_trial['volume'] has unsupported type "
+                                f"{volume_type!r}; use pvc, secret, configmap or empty_dir"
+                            )
+
+                    else:
+                        raise ValueError(
+                            "storage_per_trial['volume'] must be a client.V1Volume or a dict"
+                        )
+
+                    volumes.append(volume)
+
+                    if isinstance(storage["mount_path"], client.V1VolumeMount):
+                        volume_mounts.append(storage["mount_path"])
+                    elif isinstance(storage["mount_path"], str):
+                        # Name the mount after the resolved volume: volume_name is only
+                        # bound when the volume was given as a dict.
+                        volume_mounts.append(
+                            client.V1VolumeMount(
+                                name=volume.name, mount_path=storage["mount_path"]
+                            )
+                        )
+                    else:
+                        raise ValueError(
+                            "storage_per_trial['mount_path'] must be a "
+                            "client.V1VolumeMount or a str"
+                        )
+
+                # Attach the mounts to the container spec; nothing to do when empty.
+                if volume_mounts:
+                    if isinstance(container_spec.volume_mounts, list):
+                        container_spec.volume_mounts.extend(volume_mounts)
+                    else:
+                        container_spec.volume_mounts = volume_mounts
+
             # Trial uses PyTorchJob for distributed training if TrainerResources is set.
             if isinstance(resources_per_trial, TrainerResources):
                 trial_template = utils.get_trial_template_with_pytorchjob(
                     retain_trials,
                     trial_parameters,
                     resources_per_trial,
-                    training_utils.get_pod_template_spec(containers=[container_spec]),
-                    training_utils.get_pod_template_spec(containers=[container_spec]),
+                    training_utils.get_pod_template_spec(
+                        containers=[container_spec],
+                        volumes=volumes if volumes else None,
+                    ),
+                    training_utils.get_pod_template_spec(
+                        containers=[container_spec],
+                        volumes=volumes if volumes else None,
+                    ),
                 )
             # Otherwise, Trial uses Job for model training.
             else:
                 trial_template = utils.get_trial_template_with_job(
                     retain_trials,
                     trial_parameters,
-                    training_utils.get_pod_template_spec(containers=[container_spec]),
+                    training_utils.get_pod_template_spec(
+                        containers=[container_spec],
+                        volumes=volumes if volumes else None,
+                    ),
                 )
 
         # If users choose to use external models and datasets.
@@ -584,7 +722,7 @@ class name in this argument.
                         f"It must also start and end with an alphanumeric character."
                     )
                 elif hasattr(e, "status") and e.status == 409:
-                    print(f"PVC '{name}' already exists in namespace " f"{namespace}.")
+                    print(f"PVC '{name}' already exists in namespace {namespace}.")
                 else:
                     raise RuntimeError(f"failed to create PVC. Error: {e}")
 

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
@@ -404,6 +404,50 @@ def create_experiment(
         },
         ValueError,
     ),
+    (
+        "invalid storage_per_trial - type mismatch",
+        {
+            "name": "tune_test",
+            "objective": lambda x: print(f"a={x}"),
+            "parameters": {"a": katib.search.int(min=10, max=100)},
+            "storage_per_trial": "invalid",
+        },
+        ValueError,
+    ),
+    (
+        "invalid storage_per_trial - volume has no name",
+        {
+            "name": "tune_test",
+            "objective": lambda x: print(f"a={x}"),
+            "parameters": {"a": katib.search.int(min=10, max=100)},
+            "storage_per_trial": [
+                {
+                    "volume": {
+                        # do not provide name
+                        "type": "pvc",
+                    }
+                }
+            ],
+        },
+        ValueError,
+    ),
+    (
+        "invalid storage_per_trial - volume has no type",
+        {
+            "name": "tune_test",
+            "objective": lambda x: print(f"a={x}"),
+            "parameters": {"a": katib.search.int(min=10, max=100)},
+            "storage_per_trial": [
+                {
+                    "volume": {
+                        "name": "test-pvc",
+                        # do not provide type
+                    }
+                }
+            ],
+        },
+        ValueError,
+    ),
     (
         "invalid model_provider_parameters",
         {