Skip to content

Commit 8b1fbbd

Browse files
committed
feat: enable Ray cluster head pod persistency
Signed-off-by: kramaranya <[email protected]>
1 parent c311665 commit 8b1fbbd

File tree

3 files changed

+107
-4
lines changed

3 files changed

+107
-4
lines changed

docs/sphinx/user-docs/cluster-configuration.rst

+42
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,48 @@ Custom Volumes/Volume Mounts
9898
| For more information on creating Volumes and Volume Mounts with Python check out the Python Kubernetes docs (`Volumes <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md>`__, `Volume Mounts <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1VolumeMount.md>`__).
9999
| You can also find further information on Volumes and Volume Mounts by visiting the Kubernetes `documentation <https://kubernetes.io/docs/concepts/storage/volumes/>`__.
100100
101+
GCS Fault Tolerance
102+
------------------
103+
By default, the state of the Ray cluster is transient to the head Pod. Whatever triggers a restart of the head Pod results in losing that state, including Ray Cluster history. To make Ray cluster state persistent you can enable Global Control Service (GCS) fault tolerance with an external Redis storage.
104+
105+
To configure GCS fault tolerance you need to set the following parameters:
106+
107+
.. list-table::
108+
:header-rows: 1
109+
:widths: auto
110+
111+
* - Parameter
112+
- Description
113+
* - ``enable_gcs_ft``
114+
- Boolean to enable GCS fault tolerance
115+
* - ``redis_address``
116+
- Address of the external Redis service, ex: "redis:6379"
117+
* - ``redis_password_secret``
118+
- Dictionary with 'name' and 'key' fields specifying the Kubernetes secret for Redis password
119+
* - ``external_storage_namespace``
120+
- Custom storage namespace for GCS fault tolerance (by default, KubeRay sets it to the RayCluster's UID)
121+
122+
Example configuration:
123+
124+
.. code:: python
125+
126+
from codeflare_sdk import Cluster, ClusterConfiguration
127+
128+
cluster = Cluster(ClusterConfiguration(
129+
name='ray-cluster-with-persistence',
130+
num_workers=2,
131+
enable_gcs_ft=True,
132+
redis_address="redis:6379",
133+
redis_password_secret={
134+
"name": "redis-password-secret",
135+
"key": "password"
136+
},
137+
# external_storage_namespace="my-custom-namespace" # Optional: Custom namespace for GCS data in Redis
138+
))
139+
140+
.. note::
141+
You need to have a Redis instance deployed in your Kubernetes cluster before using this feature.
142+
101143
Deprecating Parameters
102144
----------------------
103145

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

+25
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,31 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
170170
},
171171
}
172172

173+
if cluster.config.enable_gcs_ft:
174+
if not cluster.config.redis_address:
175+
raise ValueError(
176+
"redis_address must be provided when enable_gcs_ft is True"
177+
)
178+
179+
gcs_ft_options = {"redisAddress": cluster.config.redis_address}
180+
181+
if cluster.config.external_storage_namespace:
182+
gcs_ft_options[
183+
"externalStorageNamespace"
184+
] = cluster.config.external_storage_namespace
185+
186+
if cluster.config.redis_password_secret:
187+
gcs_ft_options["redisPassword"] = {
188+
"valueFrom": {
189+
"secretKeyRef": {
190+
"name": cluster.config.redis_password_secret["name"],
191+
"key": cluster.config.redis_password_secret["key"],
192+
}
193+
}
194+
}
195+
196+
resource["spec"]["gcsFaultToleranceOptions"] = gcs_ft_options
197+
173198
config_check()
174199
k8s_client = get_api_client() or client.ApiClient()
175200

src/codeflare_sdk/ray/cluster/config.py

+40-4
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ class ClusterConfiguration:
100100
A list of V1Volume objects to add to the Cluster
101101
volume_mounts:
102102
A list of V1VolumeMount objects to add to the Cluster
103+
enable_gcs_ft:
104+
A boolean indicating whether to enable GCS fault tolerance.
105+
redis_address:
106+
The address of the Redis server to use for GCS fault tolerance, required when enable_gcs_ft is True.
107+
redis_password_secret:
108+
Kubernetes secret reference containing Redis password. ex: {"name": "secret-name", "key": "password-key"}
109+
external_storage_namespace:
110+
The storage namespace to use for GCS fault tolerance. By default, KubeRay sets it to the UID of RayCluster.
103111
"""
104112

105113
name: str
@@ -142,13 +150,38 @@ class ClusterConfiguration:
142150
annotations: Dict[str, str] = field(default_factory=dict)
143151
volumes: list[V1Volume] = field(default_factory=list)
144152
volume_mounts: list[V1VolumeMount] = field(default_factory=list)
153+
enable_gcs_ft: bool = False
154+
redis_address: Optional[str] = None
155+
redis_password_secret: Optional[Dict[str, str]] = None
156+
external_storage_namespace: Optional[str] = None
145157

146158
def __post_init__(self):
147159
if not self.verify_tls:
148160
print(
149161
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
150162
)
151163

164+
if self.enable_gcs_ft:
165+
if not self.redis_address:
166+
raise ValueError(
167+
"redis_address must be provided when enable_gcs_ft is True"
168+
)
169+
170+
if self.redis_password_secret and not isinstance(
171+
self.redis_password_secret, dict
172+
):
173+
raise ValueError(
174+
"redis_password_secret must be a dictionary with 'name' and 'key' fields"
175+
)
176+
177+
if self.redis_password_secret and (
178+
"name" not in self.redis_password_secret
179+
or "key" not in self.redis_password_secret
180+
):
181+
raise ValueError(
182+
"redis_password_secret must contain both 'name' and 'key' fields"
183+
)
184+
152185
self._validate_types()
153186
self._memory_to_resource()
154187
self._memory_to_string()
@@ -283,10 +316,13 @@ def check_type(value, expected_type):
283316
else:
284317
return True
285318
if origin_type is dict:
286-
return all(
287-
check_type(k, args[0]) and check_type(v, args[1])
288-
for k, v in value.items()
289-
)
319+
if value is not None:
320+
return all(
321+
check_type(k, args[0]) and check_type(v, args[1])
322+
for k, v in value.items()
323+
)
324+
else:
325+
return True
290326
if origin_type is tuple:
291327
return all(check_type(elem, etype) for elem, etype in zip(value, args))
292328
if expected_type is int:

0 commit comments

Comments
 (0)