Skip to content

Commit fd7a2c0

Browse files
Add snapshot method
1 parent 1a0aaf2 commit fd7a2c0

5 files changed

Lines changed: 473 additions & 16 deletions

File tree

clients/python/agentic-sandbox-client/k8s_agent_sandbox/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,7 @@
2929

3030
PODSNAPSHOT_API_GROUP = "podsnapshot.gke.io"
3131
PODSNAPSHOT_API_VERSION = "v1alpha1"
32+
PODSNAPSHOT_PLURAL = "podsnapshots"
33+
PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers"
34+
PODSNAPSHOTMANUALTRIGGER_API_KIND = "PodSnapshotManualTrigger"
3235
PODSNAPSHOT_API_KIND = "PodSnapshot"

clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,24 @@ This directory contains the Python client extension for interacting with the Age
44

55
## `podsnapshot_client.py`
66

7-
This file defines the `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities.
7+
This file defines the `PodSnapshotSandboxClient` class, which extends the base `SandboxClient` to provide snapshot capabilities.
88

99
### `PodSnapshotSandboxClient`
1010

11-
A specialized Sandbox client for interacting with the gke pod snapshot controller.
11+
A specialized Sandbox client for interacting with the GKE Pod Snapshot Controller.
1212

1313
### Key Features:
1414

15-
* **`PodSnapshotSandboxClient(template_name: str, ...)`**:
16-
* Initializes the client with optional server port.
17-
15+
* **`PodSnapshotSandboxClient(template_name: str, podsnapshot_timeout: int = 180, ...)`**:
16+
* Initializes the client with optional podsnapshot timeout.
17+
* **`snapshot(self, trigger_name: str) -> SnapshotResponse`**:
18+
* Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource.
19+
* The `trigger_name` is suffixed with a timestamp and unique hash.
20+
* Waits for the snapshot to be processed.
21+
* The Pod Snapshot Controller creates a `PodSnapshot` resource automatically.
22+
* Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid).
1823
* **`__exit__(self)`**:
24+
* Cleans up the `PodSnapshotManualTrigger` resources.
1925
* Cleans up the `SandboxClaim` resources.
2026

2127
## `test_podsnapshot_extension.py`
@@ -24,8 +30,10 @@ This file, located in the parent directory (`clients/python/agentic-sandbox-clie
2430

2531
### Test Phases:
2632

27-
1. **Phase 1: Starting Counter Sandbox**:
33+
1. **Phase 1: Starting Counter Sandbox & Snapshotting**:
2834
* Starts a sandbox with a counter application.
35+
* Takes a snapshot (`test-snapshot-10`) after ~10 seconds.
36+
* Takes a snapshot (`test-snapshot-20`) after ~20 seconds.
2937

3038
### Prerequisites
3139

@@ -59,4 +67,4 @@ python3 clients/python/agentic-sandbox-client/test_podsnapshot_extension.py \
5967
--namespace sandbox-test
6068
```
6169

62-
Adjust the `--namespace`, `--template-name` as needed for your environment.
70+
Adjust the `--namespace`, `--template-name` as needed for your environment.

clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py

Lines changed: 202 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,34 +13,66 @@
1313
# limitations under the License.
1414

1515
import logging
16-
from kubernetes import client
16+
import os
17+
import uuid
18+
from datetime import datetime
19+
from typing import Any
20+
from dataclasses import dataclass
21+
from kubernetes import client, watch
1722
from kubernetes.client import ApiException
1823
from ..sandbox_client import SandboxClient
1924
from ..constants import (
2025
PODSNAPSHOT_API_GROUP,
2126
PODSNAPSHOT_API_VERSION,
2227
PODSNAPSHOT_API_KIND,
28+
PODSNAPSHOTMANUALTRIGGER_API_KIND,
29+
PODSNAPSHOTMANUALTRIGGER_PLURAL,
2330
)
2431

32+
SNAPSHOT_SUCCESS_CODE = 0
33+
SNAPSHOT_ERROR_CODE = 1
34+
2535
logger = logging.getLogger(__name__)
2636

2737

38+
@dataclass
39+
class SnapshotResult:
40+
"""Result of a snapshot processing operation."""
41+
42+
snapshot_uid: str
43+
snapshot_timestamp: str
44+
45+
46+
@dataclass
47+
class SnapshotResponse:
48+
"""Structured response for snapshot operations."""
49+
50+
success: bool
51+
trigger_name: str
52+
snapshot_uid: str
53+
error_reason: str
54+
error_code: int
55+
56+
2857
class PodSnapshotSandboxClient(SandboxClient):
2958
"""
3059
A specialized Sandbox client for interacting with the GKE Pod Snapshot Controller.
31-
32-
TODO: This class enables users to take a snapshot of their sandbox and restore from the taken snapshot.
60+
This class enables users to take a manual trigger snapshot of their sandbox and restore from the taken snapshot.
3361
"""
3462

3563
def __init__(
3664
self,
3765
template_name: str,
66+
podsnapshot_timeout: int = 180,
3867
**kwargs,
3968
):
4069
super().__init__(template_name, **kwargs)
4170

4271
self.snapshot_crd_installed = False
4372
self.core_v1_api = client.CoreV1Api()
73+
self.podsnapshot_timeout = podsnapshot_timeout
74+
75+
self.created_manual_triggers = []
4476

4577
def __enter__(self) -> "PodSnapshotSandboxClient":
4678
try:
@@ -61,7 +93,9 @@ def __enter__(self) -> "PodSnapshotSandboxClient":
6193

6294
def _check_snapshot_crd_installed(self) -> bool:
6395
"""
64-
Checks if the PodSnapshot CRD is installed in the cluster.
96+
Checks if the PodSnapshot CRD is installed and available in the cluster.
97+
Returns:
98+
bool: True if the CRD is installed, False otherwise.
6599
"""
66100

67101
if self.snapshot_crd_installed:
@@ -87,8 +121,172 @@ def _check_snapshot_crd_installed(self) -> bool:
87121
return False
88122
raise
89123

124+
def _parse_created_snapshot_info(self, obj: dict[str, Any]) -> SnapshotResult:
125+
"""Parses the object to extract snapshot details."""
126+
status = obj.get("status", {})
127+
conditions = status.get("conditions", [])
128+
for condition in conditions:
129+
if (
130+
condition.get("type") == "Triggered"
131+
and condition.get("status") == "True"
132+
and condition.get("reason") == "Complete"
133+
):
134+
snapshot_uid = status.get("snapshotCreated", {}).get("name")
135+
snapshot_timestamp = condition.get("lastTransitionTime")
136+
return SnapshotResult(
137+
snapshot_uid=snapshot_uid,
138+
snapshot_timestamp=snapshot_timestamp,
139+
)
140+
raise ValueError("Snapshot is not yet complete.")
141+
142+
def _wait_for_snapshot_to_be_completed(
143+
self, trigger_name: str, resource_version: str | None = None
144+
) -> SnapshotResult:
145+
"""
146+
Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult.
147+
"""
148+
w = watch.Watch()
149+
logger.info(
150+
f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..."
151+
)
152+
153+
kwargs = {}
154+
if resource_version:
155+
kwargs["resource_version"] = resource_version
156+
157+
try:
158+
for event in w.stream(
159+
func=self.custom_objects_api.list_namespaced_custom_object,
160+
namespace=self.namespace,
161+
group=PODSNAPSHOT_API_GROUP,
162+
version=PODSNAPSHOT_API_VERSION,
163+
plural=PODSNAPSHOTMANUALTRIGGER_PLURAL,
164+
field_selector=f"metadata.name={trigger_name}",
165+
timeout_seconds=self.podsnapshot_timeout,
166+
**kwargs,
167+
):
168+
if event["type"] in ["ADDED", "MODIFIED"]:
169+
obj = event["object"]
170+
try:
171+
result = self._parse_created_snapshot_info(obj)
172+
logger.info(
173+
f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {result.snapshot_uid}"
174+
)
175+
return result
176+
except ValueError:
177+
# Continue watching if snapshot is not yet complete
178+
continue
179+
except Exception as e:
180+
logger.error(f"Error watching snapshot: {e}")
181+
raise
182+
finally:
183+
w.stop()
184+
185+
raise TimeoutError(
186+
f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds."
187+
)
188+
189+
def snapshot(self, trigger_name: str) -> SnapshotResponse:
190+
"""
191+
Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource.
192+
The trigger_name will be suffixed with a timestamp and random hex string.
193+
Returns:
194+
SnapshotResponse: The result of the operation.
195+
"""
196+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
197+
suffix = uuid.uuid4().hex[:8]
198+
trigger_name = f"{trigger_name}-{timestamp}-{suffix}"
199+
200+
if not self.snapshot_crd_installed:
201+
return SnapshotResponse(
202+
success=False,
203+
trigger_name=trigger_name,
204+
snapshot_uid=None,
205+
error_reason="Snapshot CRD is not installed. Ensure it is installed and running.",
206+
error_code=SNAPSHOT_ERROR_CODE,
207+
)
208+
if not self.pod_name:
209+
return SnapshotResponse(
210+
success=False,
211+
trigger_name=trigger_name,
212+
snapshot_uid=None,
213+
error_reason="Sandbox pod name not found. Ensure sandbox is created.",
214+
error_code=SNAPSHOT_ERROR_CODE,
215+
)
216+
217+
manifest = {
218+
"apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}",
219+
"kind": f"{PODSNAPSHOTMANUALTRIGGER_API_KIND}",
220+
"metadata": {"name": trigger_name, "namespace": self.namespace},
221+
"spec": {"targetPod": self.pod_name},
222+
}
223+
224+
try:
225+
created_obj = self.custom_objects_api.create_namespaced_custom_object(
226+
group=PODSNAPSHOT_API_GROUP,
227+
version=PODSNAPSHOT_API_VERSION,
228+
namespace=self.namespace,
229+
plural=PODSNAPSHOTMANUALTRIGGER_PLURAL,
230+
body=manifest,
231+
)
232+
self.created_manual_triggers.append(trigger_name)
233+
234+
# Start watching from the version we just created to avoid missing updates
235+
resource_version = created_obj.get("metadata", {}).get("resourceVersion")
236+
snapshot_result = self._wait_for_snapshot_to_be_completed(
237+
trigger_name, resource_version
238+
)
239+
240+
return SnapshotResponse(
241+
success=True,
242+
trigger_name=trigger_name,
243+
snapshot_uid=snapshot_result.snapshot_uid,
244+
error_reason="",
245+
error_code=SNAPSHOT_SUCCESS_CODE,
246+
)
247+
except ApiException as e:
248+
logger.exception(
249+
f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}"
250+
)
251+
return SnapshotResponse(
252+
success=False,
253+
trigger_name=trigger_name,
254+
snapshot_uid=None,
255+
error_reason=f"Failed to create PodSnapshotManualTrigger: {e}",
256+
error_code=SNAPSHOT_ERROR_CODE,
257+
)
258+
except TimeoutError as e:
259+
logger.exception(
260+
f"Snapshot creation timed out for trigger '{trigger_name}': {e}"
261+
)
262+
return SnapshotResponse(
263+
success=False,
264+
trigger_name=trigger_name,
265+
snapshot_uid=None,
266+
error_reason=f"Snapshot creation timed out: {e}",
267+
error_code=SNAPSHOT_ERROR_CODE,
268+
)
269+
90270
def __exit__(self, exc_type, exc_val, exc_tb):
91271
"""
272+
Cleans up the PodSnapshotManualTrigger Resources.
92273
Automatically cleans up the Sandbox.
93274
"""
275+
for trigger_name in self.created_manual_triggers:
276+
try:
277+
self.custom_objects_api.delete_namespaced_custom_object(
278+
group=PODSNAPSHOT_API_GROUP,
279+
version=PODSNAPSHOT_API_VERSION,
280+
namespace=self.namespace,
281+
plural=PODSNAPSHOTMANUALTRIGGER_PLURAL,
282+
name=trigger_name,
283+
)
284+
logger.info(f"Deleted PodSnapshotManualTrigger '{trigger_name}'")
285+
except ApiException as e:
286+
if e.status == 404:
287+
# Ignore if the resource is already deleted
288+
continue
289+
logger.error(
290+
f"Failed to delete PodSnapshotManualTrigger '{trigger_name}': {e}"
291+
)
94292
super().__exit__(exc_type, exc_val, exc_tb)

0 commit comments

Comments
 (0)