From b70d4ad2025c44984b03ddf44e0b8f4c6078949f Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Wed, 21 Jan 2026 05:21:40 +0000 Subject: [PATCH 01/12] PodsnapshotSandboxclient base methods --- .../agentic_sandbox/constants.py | 38 +++ .../agentic_sandbox/extensions/__init__.py | 2 + .../agentic_sandbox/extensions/podsnapshot.py | 219 ++++++++++++++++++ .../agentic_sandbox/podsnapshot.md | 70 ++++++ .../agentic_sandbox/sandbox_client.py | 15 +- .../test_podsnapshot_extension.py | 162 +++++++++++++ 6 files changed, 492 insertions(+), 14 deletions(-) create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/constants.py create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md create mode 100644 clients/python/agentic-sandbox-client/test_podsnapshot_extension.py diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py new file mode 100644 index 000000000..19b399724 --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -0,0 +1,38 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Constants for API Groups and Resources +GATEWAY_API_GROUP = "gateway.networking.k8s.io" +GATEWAY_API_VERSION = "v1" +GATEWAY_PLURAL = "gateways" + +CLAIM_API_GROUP = "extensions.agents.x-k8s.io" +CLAIM_API_VERSION = "v1alpha1" +CLAIM_PLURAL_NAME = "sandboxclaims" + +SANDBOX_API_GROUP = "agents.x-k8s.io" +SANDBOX_API_VERSION = "v1alpha1" +SANDBOX_PLURAL_NAME = "sandboxes" + +POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" + +PODSNAPSHOT_API_GROUP = "podsnapshot.gke.io" +PODSNAPSHOT_API_VERSION = "v1alpha1" +PODSNAPSHOT_PLURAL = "podsnapshots" +PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" + +SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" +SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" +SNAPSHOT_CONTROLLER_NAME = "pod-snapshot-controller" +SNAPSHOT_AGENT = "pod-snapshot-agent" \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 1a72134ef..0a5b29227 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .computer_use import ComputerUseSandbox +from .podsnapshot import PodSnapshotSandboxClient diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py new file mode 100644 index 000000000..401867f64 --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py @@ -0,0 +1,219 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys +from kubernetes import client, config, watch +from kubernetes.client import ApiException +from ..sandbox_client import SandboxClient, ExecutionResult +from ..constants import * + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + stream=sys.stdout, +) + + +class PodSnapshotSandboxClient(SandboxClient): + """ + A specialized Sandbox client for interacting with the snapshot controller. + Handles the case only when triggerConfig is type manual. + """ + + def __init__( + self, + template_name: str, + podsnapshot_timeout: int = 180, + server_port: int = 8080, + **kwargs, + ): + + self.controller_ready = False + self.podsnapshot_timeout = podsnapshot_timeout + self.created_snapshots = [] + self.controller_ready = self.snapshot_controller_ready() + + super().__init__( + template_name, server_port=server_port, **kwargs + ) + + def _wait_for_snapshot_processed(self, trigger_name: str): + """ + Waits for the PodSnapshotManualTrigger to be processed and a snapshot created. + """ + w = watch.Watch() + logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") + + try: + for event in w.stream( + func=self.custom_objects_api.list_namespaced_custom_object, + namespace=self.namespace, + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + field_selector=f"metadata.name={trigger_name}", + timeout_seconds=self.podsnapshot_timeout + ): + if event["type"] in ["ADDED", "MODIFIED"]: + obj = event["object"] + status = obj.get("status", {}) + conditions = status.get("conditions", []) + + for condition in conditions: + if ( + condition.get("type") == "Triggered" + and condition.get("status") == "True" + and condition.get("reason") == "Complete" + ): + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {status.get('snapshotCreated', {}).get('name')}") + w.stop() + return + except Exception as e: + logging.error(f"Error watching snapshot: {e}") + raise + + raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") + + def snapshot_controller_ready(self) -> bool: + """ + Checks if the snapshot controller and agent pods are running. + Checks both self-installed (gps-system) and GKE-managed pod snapshot systems. + """ + + if self.controller_ready: + return True + + v1 = client.CoreV1Api() + + def check_namespace(namespace: str, required_components: list[str]) -> bool: + try: + pods = v1.list_namespaced_pod(namespace) + found_components = {component: False for component in required_components} + + for pod in pods.items: + if pod.status.phase == "Running": + name = pod.metadata.name + for component in required_components: + if component in name: + found_components[component] = True + + return all(found_components.values()) + except ApiException: + return False + + # Check self-installed: requires both controller and agent in gps-system + if check_namespace(SNAPSHOT_NAMESPACE_SELF_INSTALLED, [SNAPSHOT_CONTROLLER_NAME, SNAPSHOT_AGENT]): + self.controller_ready = True + return True + + # Check managed: requires only agent in gke-managed-pod-snapshots + if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): + self.controller_ready = True + return True + + self.controller_ready = False + return self.controller_ready + + def checkpoint(self, trigger_name: str) -> ExecutionResult: + """ + Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. + """ + if not self.controller_ready: + return ExecutionResult( + stdout="", + stderr="Snapshot controller is not ready. Ensure it is installed and running.", + exit_code=1 + ) + if not self.pod_name: + return ExecutionResult( + stdout="", + stderr="Sandbox pod name not found. Ensure sandbox is created.", + exit_code=1 + ) + + manifest = { + "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", + "kind": "PodSnapshotManualTrigger", + "metadata": { + "name": trigger_name, + "namespace": self.namespace + }, + "spec": { + "targetPod": self.pod_name + } + } + + try: + self.custom_objects_api.create_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + body=manifest + ) + self.created_snapshots.append(trigger_name) + self._wait_for_snapshot_processed(trigger_name) + return ExecutionResult( + stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", + stderr="", + exit_code=0 + ) + except ApiException as e: + return ExecutionResult( + stdout="", + stderr=f"Failed to create PodSnapshotManualTrigger: {e}", + exit_code=1 + ) + except TimeoutError as e: + return ExecutionResult( + stdout="", + stderr=f"Snapshot creation timed out: {e}", + exit_code=1 + ) + + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: + """ + Checks for existing snapshots matching the label selector and optional policy name. + Returns a list of valid snapshots sorted by creation timestamp (newest first). + policy_name: Filters snapshots by their spec.policyName. + ready_only: If True, filters out snapshots that are only in 'Ready' state. + """ + pass + + def delete_snapshots(self, **filters) -> int: + """ + Deletes snapshots matching the provided filters. + Returns the count of successfully deleted snapshots. + """ + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Automatically cleans up the Sandbox and the PSMT Trigger Requests. + """ + for trigger_name in self.created_snapshots: + try: + logging.info(f"Cleaning up Trigger request: {trigger_name}") + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name=trigger_name + ) + except ApiException as e: + if e.status != 404: + logging.warning(f"Failed to cleanup trigger '{trigger_name}': {e}") + super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md new file mode 100644 index 000000000..452b21a2b --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md @@ -0,0 +1,70 @@ +# Agentic Sandbox Pod Snapshot Extension + +This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. + +## `podsnapshot.py` + +This file defines the `PodSnapshotSandboxClient` class, which extends the base `SandboxClient` to provide snapshot capabilities. + +### Key Features: + +* **`PodSnapshotSandboxClient(template_name: str, ...)`**: + * Initializes the client. + * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. +* **`snapshot_controller_ready(self) -> bool`**: + * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. +* **`checkpoint(self, trigger_name: str) -> ExecutionResult`**: + * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. + * Waits for the snapshot to be processed. + * The pod snapshot controller creates a `PodSnapshot` resource automatically. +* **`list_snapshots(self, policy_name: str, ready_only: bool) -> list`**: + * TBD +* **`delete_snapshots(self, **filters) -> int`**: + * TBD +* **Automatic Cleanup**: + * The `__exit__` method attempts to clean up triggers `PodSnapshotManualTrigger` created during the session, keeping the `PodSnapshot` resources alive after session exit. + +## `test_podsnapshot_extension.py` + +This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the checkpoint and restore functionality. + +### Test Phases: + +1. **Phase 1: Starting Counter & Checkpointing**: + * Starts a sandbox with a counter application. + * Takes a snapshot (`test-snapshot-10`) after ~10 seconds. + * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. +2. **Phase 2: Restoring from Recent Snapshot**: + * Restores a sandbox from the second snapshot. + * Verifies that the counter continues from where it left off (>= 20), proving the state was preserved. + +### Prerequisites + +1. **Python Virtual Environment**: + ```bash + python3 -m venv .venv + source .venv/bin/activate + ``` + +2. **Install Dependencies**: + ```bash + pip install kubernetes + pip install -e clients/python/agentic-sandbox-client/ + ``` + +3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in the standard cluster running inside gVisor(Userguide). The GCS bucket to store the pod snapshot states and respective permissions must be applied. +4. **CRDs**: `PodSnapshotStorageConfig`, `PodSnapshotPolicy` CRDs must be applied. `PodSnapshotPolicy` should specify the selector match labels. +5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. + +### Running Tests: + +To run the integration test, execute the script with the appropriate arguments: + +```bash +python3 clients/python/agentic-sandbox-client/test_podsnapshot_extension.py \ + --labels app=agent-sandbox-workload \ + --template-name python-counter-template \ + --namespace sandbox-test +``` + +Adjust the `--namespace`, `--template-name`, and `--labels` as needed for your environment. \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py index 6fc3260ff..e3aabeee8 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py @@ -36,20 +36,7 @@ initialize_tracer, TracerManager, trace_span, trace, OPENTELEMETRY_AVAILABLE ) -# Constants for API Groups and Resources -GATEWAY_API_GROUP = "gateway.networking.k8s.io" -GATEWAY_API_VERSION = "v1" -GATEWAY_PLURAL = "gateways" - -CLAIM_API_GROUP = "extensions.agents.x-k8s.io" -CLAIM_API_VERSION = "v1alpha1" -CLAIM_PLURAL_NAME = "sandboxclaims" - -SANDBOX_API_GROUP = "agents.x-k8s.io" -SANDBOX_API_VERSION = "v1alpha1" -SANDBOX_PLURAL_NAME = "sandboxes" - -POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" +from .constants import * logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py new file mode 100644 index 000000000..7781d8c74 --- /dev/null +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -0,0 +1,162 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +import time +from kubernetes import client, config +import re +from agentic_sandbox.extensions import PodSnapshotSandboxClient + +POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" + + +async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): + """ + Tests the Sandbox client by creating a sandbox, running a command, + and then cleaning up. + """ + + print( + f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---") + + # Load kube config + try: + config.load_incluster_config() + except config.ConfigException: + config.load_kube_config() + + wait_time = 10 + first_snapshot_name = "test-snapshot-10" + second_snapshot_name = "test-snapshot-20" + v1 = client.CoreV1Api() + + try: + print("\n***** Phase 1: Starting Counter *****") + + with PodSnapshotSandboxClient( + template_name=template_name, + namespace=namespace, + api_url=api_url, + server_port=server_port + ) as sandbox: + print("\n======= Testing Pod Snapshot Extension =======") + assert sandbox.controller_ready == True, "Sandbox controller is not ready." + + time.sleep(wait_time) + print(f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds...") + snapshot_result = sandbox.checkpoint(first_snapshot_name) + print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") + + assert snapshot_result.exit_code == 0 + + time.sleep(wait_time) + + print(f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds...") + snapshot_result = sandbox.checkpoint(second_snapshot_name) + print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") + + assert snapshot_result.exit_code == 0 + + + print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") + with PodSnapshotSandboxClient( + template_name=template_name, + namespace=namespace, + api_url=api_url, + server_port=server_port + ) as sandbox_restored: # restores from second_snapshot_name by default + + print("\nWaiting 5 seconds for restored pod to resume printing...") + time.sleep(5) + + # Fetch logs using the Kubernetes API + logs = v1.read_namespaced_pod_log( + name=sandbox_restored.pod_name, + namespace=sandbox_restored.namespace + ) + + # logs must not be empty + counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] + assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." + + # The first number printed by the restored pod must be >= 20. + # If it restarted, it would be 0 or 1. + first_count = counts[0] + print("this is the first_count:", first_count) + assert first_count >= wait_time * 2, ( + f"State Mismatch! Expected counter to start >= {wait_time*2}, " + f"but got {first_count}. The pod likely restarted from scratch." + ) + + print("--- Pod Snapshot Test Passed! ---") + + except Exception as e: + print(f"\n--- An error occurred during the test: {e} ---") + # The __exit__ method of the Sandbox class will handle cleanup. + finally: + print("\n--- Sandbox Client Test Finished ---") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test the Sandbox client.") + parser.add_argument( + "--template-name", + default="python-sandbox-template", + help="The name of the sandbox template to use for the test." + ) + + # Default is None to allow testing the Port-Forward fallback + parser.add_argument( + "--gateway-name", + default=None, + help="The name of the Gateway resource. If omitted, defaults to local port-forward mode." + ) + + parser.add_argument( + "--gateway-namespace", + default=None, + help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode." + ) + + parser.add_argument( + "--api-url", help="Direct URL to router (e.g. http://localhost:8080)", default=None) + parser.add_argument("--namespace", default="default", + help="Namespace to create sandbox in") + parser.add_argument("--server-port", type=int, default=8888, + help="Port the sandbox container listens on") + parser.add_argument("--labels", nargs="+", default=["app=sandbox-test"], + help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)") + + args = parser.parse_args() + + labels_dict = {} + for l in args.labels: + if "=" in l: + k, v = l.split("=", 1) + labels_dict[k] = v + else: + print(f"Warning: Ignoring invalid label format '{l}'. Use key=value.") + + asyncio.run(main( + template_name=args.template_name, + api_url=args.api_url, + namespace=args.namespace, + server_port=args.server_port, + labels=labels_dict + )) +# python3 test_podsnapshot_extension.py --labels app=agent-sandbox-workload --template-name python-counter-template --namespace sandbox-test From 2aa44f494fba1cb59cdc3edfa36824644132f52a Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 5 Feb 2026 16:26:08 +0000 Subject: [PATCH 02/12] Update pss client --- .../agentic_sandbox/extensions/__init__.py | 3 +- .../gke_extensions/__init__.py | 15 +++ .../{ => gke_extensions}/podsnapshot.md | 28 +++-- .../podsnapshot_client.py} | 102 +++++++++++------- .../python-counter-template.yaml | 29 +++++ .../test_podsnapshot_extension.py | 19 ++-- 6 files changed, 139 insertions(+), 57 deletions(-) create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py rename clients/python/agentic-sandbox-client/agentic_sandbox/{ => gke_extensions}/podsnapshot.md (72%) rename clients/python/agentic-sandbox-client/agentic_sandbox/{extensions/podsnapshot.py => gke_extensions/podsnapshot_client.py} (75%) create mode 100644 clients/python/agentic-sandbox-client/python-counter-template.yaml diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 0a5b29227..9fbc1daf2 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .computer_use import ComputerUseSandbox -from .podsnapshot import PodSnapshotSandboxClient +from .computer_use import ComputerUseSandbox \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py new file mode 100644 index 000000000..9aad1555a --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .podsnapshot_client import PodSnapshotSandboxClient \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md similarity index 72% rename from clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 452b21a2b..7597441ba 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -2,27 +2,37 @@ This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. -## `podsnapshot.py` +## `podsnapshot_client.py` -This file defines the `PodSnapshotSandboxClient` class, which extends the base `SandboxClient` to provide snapshot capabilities. +This file defines the `SnapshotPersistenceManager` and `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. + +### `SnapshotPersistenceManager` + +A utility class for managing local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by `trigger_name`. + +### `PodSnapshotSandboxClient` + +A specialized Sandbox client for interacting with the gke pod snapshot controller. ### Key Features: -* **`PodSnapshotSandboxClient(template_name: str, ...)`**: - * Initializes the client. +* **`PodSnapshotSandboxClient(template_name: str, podsnapshot_timeout: int = 180, server_port: int = 8080, ...)`**: + * Initializes the client with optional podsnapshot timeout and server port. * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. -* **`checkpoint(self, trigger_name: str) -> ExecutionResult`**: +* **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. + * The trigger_name is suffixed with the current datetime. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. -* **`list_snapshots(self, policy_name: str, ready_only: bool) -> list`**: - * TBD -* **`delete_snapshots(self, **filters) -> int`**: + * Returns a tuple of ExecutionResult and the final trigger name. +* **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: * TBD +* **`delete_snapshots(self, trigger_name: str) -> int`**: + * TBD * **Automatic Cleanup**: - * The `__exit__` method attempts to clean up triggers `PodSnapshotManualTrigger` created during the session, keeping the `PodSnapshot` resources alive after session exit. + * The `__exit__` method cleans up the `SandboxClaim` resources. ## `test_podsnapshot_extension.py` diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py similarity index 75% rename from clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 401867f64..2fce99b72 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Kubernetes Authors. +# Copyright 2026 The Kubernetes Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ import logging import sys -from kubernetes import client, config, watch +from datetime import datetime +from typing import Any +from kubernetes import client, watch from kubernetes.client import ApiException from ..sandbox_client import SandboxClient, ExecutionResult from ..constants import * @@ -26,9 +28,35 @@ ) +class SnapshotPersistenceManager: + """ + Manages local persistence of snapshot metadata in a secure directory. + Stores metadata as a dictionary keyed by trigger_name. + """ + def __init__(self): + """Initializes the persistence manager and ensures the secure directory exists.""" + pass + + def _ensure_secure_dir(self): + """Ensures the directory exists with 700 permissions.""" + pass + + def _load_metadata(self) -> dict[str, Any]: + """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" + pass + + def save_snapshot_metadata(self, record: dict[str, Any]): + """Saves a snapshot record to the local registry.""" + pass + + def delete_snapshot_metadata(self, trigger_name: str): + """Deletes a snapshot record from the local registry.""" + pass + + class PodSnapshotSandboxClient(SandboxClient): """ - A specialized Sandbox client for interacting with the snapshot controller. + A specialized Sandbox client for interacting with the gke pod snapshot controller. Handles the case only when triggerConfig is type manual. """ @@ -39,23 +67,22 @@ def __init__( server_port: int = 8080, **kwargs, ): + super().__init__( + template_name, server_port=server_port, **kwargs + ) self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - self.created_snapshots = [] self.controller_ready = self.snapshot_controller_ready() - super().__init__( - template_name, server_port=server_port, **kwargs - ) - def _wait_for_snapshot_processed(self, trigger_name: str): + def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: """ - Waits for the PodSnapshotManualTrigger to be processed and a snapshot created. + Waits for the PodSnapshotManualTrigger to be processed and returns (snapshot_uid, timestamp). """ w = watch.Watch() logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") - + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -70,22 +97,25 @@ def _wait_for_snapshot_processed(self, trigger_name: str): obj = event["object"] status = obj.get("status", {}) conditions = status.get("conditions", []) - + for condition in conditions: if ( condition.get("type") == "Triggered" and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {status.get('snapshotCreated', {}).get('name')}") + uid = status.get('snapshotCreated', {}).get('name') + timestamp = condition.get('lastTransitionTime') + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {uid}") w.stop() - return + return uid, timestamp except Exception as e: logging.error(f"Error watching snapshot: {e}") raise raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") + def snapshot_controller_ready(self) -> bool: """ Checks if the snapshot controller and agent pods are running. @@ -126,22 +156,28 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> ExecutionResult: + + def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. + The trigger_name will be suffixed with the current datetime. + Returns: + tuple[ExecutionResult, str]: The result of the operation and the final trigger name (with suffix). """ + trigger_name = f"{trigger_name}-{datetime.now().strftime('%Y%m%d%H%M%S')}" + if not self.controller_ready: return ExecutionResult( stdout="", stderr="Snapshot controller is not ready. Ensure it is installed and running.", exit_code=1 - ) + ), trigger_name if not self.pod_name: return ExecutionResult( stdout="", stderr="Sandbox pod name not found. Ensure sandbox is created.", exit_code=1 - ) + ), trigger_name manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", @@ -163,26 +199,29 @@ def checkpoint(self, trigger_name: str) -> ExecutionResult: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest ) - self.created_snapshots.append(trigger_name) - self._wait_for_snapshot_processed(trigger_name) + snapshot_uid, timestamp = self._wait_for_snapshot_processed(trigger_name) + + # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager + return ExecutionResult( stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", stderr="", exit_code=0 - ) + ), trigger_name except ApiException as e: return ExecutionResult( stdout="", stderr=f"Failed to create PodSnapshotManualTrigger: {e}", exit_code=1 - ) + ), trigger_name except TimeoutError as e: return ExecutionResult( stdout="", stderr=f"Snapshot creation timed out: {e}", exit_code=1 - ) + ), trigger_name + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ Checks for existing snapshots matching the label selector and optional policy name. @@ -192,28 +231,17 @@ def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | No """ pass - def delete_snapshots(self, **filters) -> int: + + def delete_snapshots(self, trigger_name: str) -> int: """ - Deletes snapshots matching the provided filters. + Deletes snapshots matching the provided trigger name and the PSMT resources. Returns the count of successfully deleted snapshots. """ pass + def __exit__(self, exc_type, exc_val, exc_tb): """ - Automatically cleans up the Sandbox and the PSMT Trigger Requests. + Automatically cleans up the Sandbox. """ - for trigger_name in self.created_snapshots: - try: - logging.info(f"Cleaning up Trigger request: {trigger_name}") - self.custom_objects_api.delete_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - name=trigger_name - ) - except ApiException as e: - if e.status != 404: - logging.warning(f"Failed to cleanup trigger '{trigger_name}': {e}") super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/python-counter-template.yaml b/clients/python/agentic-sandbox-client/python-counter-template.yaml new file mode 100644 index 000000000..f4f288933 --- /dev/null +++ b/clients/python/agentic-sandbox-client/python-counter-template.yaml @@ -0,0 +1,29 @@ +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxTemplate +metadata: + name: python-counter-template + namespace: default + labels: + language: python +spec: + #enableDisruptionControl: true + #shutdownTime: "2025-12-31T23:59:59Z" + podTemplate: + metadata: + labels: + app: agent-sandbox-workload + spec: + serviceAccountName: default + runtimeClassName: gvisor + containers: + - name: my-container1 + image: python:3.10-slim + command: ["python3", "-c"] + args: + - | + import time + i = 0 + while True: + print(f"Count: {i}", flush=True) + i += 1 + time.sleep(1) diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 7781d8c74..d687386b2 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -15,9 +15,10 @@ import argparse import asyncio import time +import sys from kubernetes import client, config import re -from agentic_sandbox.extensions import PodSnapshotSandboxClient +from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" @@ -38,8 +39,8 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p config.load_kube_config() wait_time = 10 - first_snapshot_name = "test-snapshot-10" - second_snapshot_name = "test-snapshot-20" + first_checkpoint_name = "test-snapshot-10" + second_checkpoint_name = "test-snapshot-20" v1 = client.CoreV1Api() try: @@ -55,9 +56,9 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p assert sandbox.controller_ready == True, "Sandbox controller is not ready." time.sleep(wait_time) - print(f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds...") - snapshot_result = sandbox.checkpoint(first_snapshot_name) - print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") + snapshot_result, trigger_name = sandbox.checkpoint(first_checkpoint_name) + print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") @@ -65,9 +66,9 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p time.sleep(wait_time) - print(f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds...") - snapshot_result = sandbox.checkpoint(second_snapshot_name) - print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") + snapshot_result, trigger_name = sandbox.checkpoint(second_checkpoint_name) + print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") From 17b5a9f466fef255c664e064f159ce02af453e5f Mon Sep 17 00:00:00 2001 From: shrutiyam-glitch Date: Thu, 5 Feb 2026 08:28:33 -0800 Subject: [PATCH 03/12] Remove import statement for ComputerUseSandbox --- .../agentic_sandbox/extensions/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 9fbc1daf2..1002b1472 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .computer_use import ComputerUseSandbox \ No newline at end of file From 32cbcef1df2b1c9b0494efb097fe063f68023ae6 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 7 Feb 2026 00:32:50 +0000 Subject: [PATCH 04/12] Add unit tests and address comments --- .../agentic_sandbox/constants.py | 1 + .../gke_extensions/podsnapshot_client.py | 86 +++--- .../test_podsnapshot_extension.py | 57 ++-- .../test_podsnapshot_client.py | 253 ++++++++++++++++++ 4 files changed, 343 insertions(+), 54 deletions(-) rename clients/python/agentic-sandbox-client/{ => agentic_sandbox/gke_extensions}/test_podsnapshot_extension.py (70%) create mode 100644 clients/python/agentic-sandbox-client/test_podsnapshot_client.py diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py index 19b399724..8c51d4c31 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -31,6 +31,7 @@ PODSNAPSHOT_API_VERSION = "v1alpha1" PODSNAPSHOT_PLURAL = "podsnapshots" PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" +PODSNAPSHOT_API_KIND = "PodSnapshotManualTrigger" SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 2fce99b72..d76af3188 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -14,8 +14,9 @@ import logging import sys -from datetime import datetime +import os from typing import Any +from dataclasses import dataclass from kubernetes import client, watch from kubernetes.client import ApiException from ..sandbox_client import SandboxClient, ExecutionResult @@ -28,6 +29,18 @@ ) +@dataclass +class SnapshotResult: + """Result of a snapshot processing operation.""" + snapshot_uid: str + snapshot_timestamp: str + +@dataclass +class CheckpointResponse: + """Structured response for checkpoint operations.""" + execution_result: ExecutionResult + trigger_name: str + class SnapshotPersistenceManager: """ Manages local persistence of snapshot metadata in a secure directory. @@ -73,12 +86,15 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - self.controller_ready = self.snapshot_controller_ready() + def __enter__(self) -> 'PodSnapshotSandboxClient': + self.controller_ready = self.snapshot_controller_ready() + super().__enter__() + return self - def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: + def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: """ - Waits for the PodSnapshotManualTrigger to be processed and returns (snapshot_uid, timestamp). + Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") @@ -104,11 +120,11 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - uid = status.get('snapshotCreated', {}).get('name') - timestamp = condition.get('lastTransitionTime') - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {uid}") + snapshot_uid = status.get('snapshotCreated', {}).get('name') + snapshot_timestamp = condition.get('lastTransitionTime') + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}") w.stop() - return uid, timestamp + return SnapshotResult(snapshot_uid=snapshot_uid, snapshot_timestamp=snapshot_timestamp) except Exception as e: logging.error(f"Error watching snapshot: {e}") raise @@ -125,11 +141,11 @@ def snapshot_controller_ready(self) -> bool: if self.controller_ready: return True - v1 = client.CoreV1Api() + core_v1_api = client.CoreV1Api() def check_namespace(namespace: str, required_components: list[str]) -> bool: try: - pods = v1.list_namespaced_pod(namespace) + pods = core_v1_api.list_namespaced_pod(namespace) found_components = {component: False for component in required_components} for pod in pods.items: @@ -157,14 +173,14 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: return self.controller_ready - def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: + def checkpoint(self, trigger_name: str) -> CheckpointResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. The trigger_name will be suffixed with the current datetime. Returns: tuple[ExecutionResult, str]: The result of the operation and the final trigger name (with suffix). """ - trigger_name = f"{trigger_name}-{datetime.now().strftime('%Y%m%d%H%M%S')}" + trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: return ExecutionResult( @@ -181,7 +197,7 @@ def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", - "kind": "PodSnapshotManualTrigger", + "kind": f"{PODSNAPSHOT_API_KIND}", "metadata": { "name": trigger_name, "namespace": self.namespace @@ -199,28 +215,38 @@ def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest ) - snapshot_uid, timestamp = self._wait_for_snapshot_processed(trigger_name) + snapshot_result = self._wait_for_snapshot_processed(trigger_name) # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return ExecutionResult( - stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", - stderr="", - exit_code=0 - ), trigger_name + return CheckpointResponse( + execution_result=ExecutionResult( + stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully. Snapshot UID: {snapshot_result.snapshot_uid}", + stderr="", + exit_code=0 + ), + trigger_name=trigger_name + ) except ApiException as e: - return ExecutionResult( - stdout="", - stderr=f"Failed to create PodSnapshotManualTrigger: {e}", - exit_code=1 - ), trigger_name + logging.exception(f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}") + return CheckpointResponse( + execution_result=ExecutionResult( + stdout="", + stderr=f"Failed to create PodSnapshotManualTrigger: {e}", + exit_code=1 + ), + trigger_name=trigger_name + ) except TimeoutError as e: - return ExecutionResult( - stdout="", - stderr=f"Snapshot creation timed out: {e}", - exit_code=1 - ), trigger_name - + logging.exception(f"Snapshot creation timed out for trigger '{trigger_name}': {e}") + return CheckpointResponse( + execution_result=ExecutionResult( + stdout="", + stderr=f"Snapshot creation timed out: {e}", + exit_code=1 + ), + trigger_name=trigger_name + ) def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py similarity index 70% rename from clients/python/agentic-sandbox-client/test_podsnapshot_extension.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py index d687386b2..06e1665a7 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Kubernetes Authors. +# Copyright 2026 The Kubernetes Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,6 +22,21 @@ POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" +def test_checkpoint_respone(checkpoint_response, checkpoint_name): + assert hasattr(checkpoint_response, "execution_result"), "Checkpoint response missing 'execution_result' attribute" + assert hasattr(checkpoint_response, "trigger_name"), "Checkpoint response missing 'trigger_name' attribute" + + execution_result = checkpoint_response.execution_result + trigger_name = checkpoint_response.trigger_name + + print(f"Trigger Command Stdout: {execution_result.stdout.strip()}") + print(f"Trigger Command Stderr: {execution_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {execution_result.exit_code}") + + assert trigger_name.startswith(checkpoint_name), f"Expected trigger name prefix '{checkpoint_name}', but got '{trigger_name}'" + assert execution_result.stderr == "", f"Expected no error when creating checkpoint '{checkpoint_name}', but got: {execution_result.stderr.strip()}" + assert execution_result.exit_code == 0 + async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): """ @@ -41,7 +56,7 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p wait_time = 10 first_checkpoint_name = "test-snapshot-10" second_checkpoint_name = "test-snapshot-20" - v1 = client.CoreV1Api() + core_v1_api = client.CoreV1Api() try: print("\n***** Phase 1: Starting Counter *****") @@ -57,22 +72,15 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p time.sleep(wait_time) print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") - snapshot_result, trigger_name = sandbox.checkpoint(first_checkpoint_name) - print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") - print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") - - assert snapshot_result.exit_code == 0 + checkpoint_response = sandbox.checkpoint(first_checkpoint_name) + test_checkpoint_respone(checkpoint_response, first_checkpoint_name) + time.sleep(wait_time) print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") - snapshot_result, trigger_name = sandbox.checkpoint(second_checkpoint_name) - print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") - print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") - - assert snapshot_result.exit_code == 0 + checkpoint_response = sandbox.checkpoint(second_checkpoint_name) + test_checkpoint_respone(checkpoint_response, second_checkpoint_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") @@ -81,28 +89,29 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p namespace=namespace, api_url=api_url, server_port=server_port - ) as sandbox_restored: # restores from second_snapshot_name by default + ) as sandbox_restored: # restores from second_checkpoint_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) # Fetch logs using the Kubernetes API - logs = v1.read_namespaced_pod_log( + logs = core_v1_api.read_namespaced_pod_log( name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace ) - # logs must not be empty + # Extract the sequence of 'Count:' values from the pod logs counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." - # The first number printed by the restored pod must be >= 20. - # If it restarted, it would be 0 or 1. - first_count = counts[0] - print("this is the first_count:", first_count) - assert first_count >= wait_time * 2, ( - f"State Mismatch! Expected counter to start >= {wait_time*2}, " - f"but got {first_count}. The pod likely restarted from scratch." + # Verify the counter resumed from the correct checkpoint state. + # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). + min_expected_count_at_restore = wait_time * 2 + first_count_after_restore = counts[0] + + assert first_count_after_restore >= min_expected_count_at_restore, ( + f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " + f"but got {first_count_after_restore}. The pod likely restarted from scratch." ) print("--- Pod Snapshot Test Passed! ---") diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/test_podsnapshot_client.py new file mode 100644 index 000000000..a8d1f567b --- /dev/null +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_client.py @@ -0,0 +1,253 @@ +# Copyright 2026 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import logging +from unittest.mock import MagicMock, patch, call +from datetime import datetime +from agentic_sandbox.gke_extensions.podsnapshot_client import PodSnapshotSandboxClient, SnapshotPersistenceManager +from agentic_sandbox.constants import * +from kubernetes import config + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def load_kubernetes_config(): + """Loads Kubernetes configuration, prioritizing kubeconfig and falling back to an environment variable.""" + try: + config.load_kube_config() + logging.info("Kubernetes config loaded from kubeconfig file.") + except config.ConfigException: + logging.info("Kubeconfig file not found, attempting to load from environment variable.") + try: + config.load_kube_config(config_file=os.getenv("KUBECONFIG_FILE")) + logging.info("Kubernetes config loaded from KUBECONFIG_FILE environment variable.") + except Exception as e: + logging.error(f"Could not load Kubernetes config: {e}", exc_info=True) + raise + + +class TestPodSnapshotSandboxClient(unittest.TestCase): + + def setUp(self): + logging.info("Setting up TestPodSnapshotSandboxClient...") + # Create client without patching super, as it's tested separately + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + self.client = PodSnapshotSandboxClient('test-template') + logging.info("Finished setting up TestPodSnapshotSandboxClient.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient') + def test_init(self, mock_super): + """Test initialization of PodSnapshotSandboxClient.""" + logging.info("Starting test_init...") + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + client = PodSnapshotSandboxClient('test-template') + mock_super.assert_called_once_with( + 'test-template', + podsnapshot_timeout=180, + server_port=8080 + ) + self.assertTrue(client.controller_ready) + self.assertEqual(client.podsnapshot_timeout, 180) + logging.info("Finished test_init.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_self_installed(self, mock_v1_class): + """Test snapshot_controller_ready for self-installed scenario.""" + logging.info("Starting test_snapshot_controller_ready_self_installed...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + # Mock pods in gps-system + mock_pod_controller = MagicMock() + mock_pod_controller.metadata.name = 'pod-snapshot-controller' + mock_pod_controller.status.phase = 'Running' + + mock_pod_agent = MagicMock() + mock_pod_agent.metadata.name = 'pod-snapshot-agent' + mock_pod_agent.status.phase = 'Running' + + mock_pods = MagicMock() + mock_pods.items = [mock_pod_controller, mock_pod_agent] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_SELF_INSTALLED) + logging.info("Finished test_snapshot_controller_ready_self_installed.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_managed(self, mock_v1_class): + """Test snapshot_controller_ready for managed scenario.""" + logging.info("Starting test_snapshot_controller_ready_managed...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + # Mock pods in gke-managed-pod-snapshots + mock_pod_agent = MagicMock() + mock_pod_agent.metadata.name = 'pod-snapshot-agent' + mock_pod_agent.status.phase = 'Running' + + mock_pods = MagicMock() + mock_pods.items = [mock_pod_agent] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) + logging.info("Finished test_snapshot_controller_ready_managed.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_not_ready(self, mock_v1_class): + """Test snapshot_controller_ready when not ready.""" + logging.info("Starting test_snapshot_controller_ready_not_ready...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + mock_pods = MagicMock() + mock_pods.items = [] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_not_ready.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + def test_checkpoint_success(self, mock_custom_class, mock_watch_class): + """Test successful checkpoint creation.""" + logging.info("Starting test_checkpoint_success...") + mock_custom = MagicMock() + mock_custom_class.return_value = mock_custom + + mock_watch = MagicMock() + mock_watch_class.return_value = mock_watch + + self.client.pod_name = 'test-pod' + self.client.controller_ready = True + self.client.namespace = 'test-ns' + + # Mock the watch stream + mock_event = { + 'type': 'MODIFIED', + 'object': { + 'status': { + 'conditions': [{ + 'type': 'Triggered', + 'status': 'True', + 'reason': 'Complete', + 'lastTransitionTime': '2023-01-01T00:00:00Z' + }], + 'snapshotCreated': {'name': 'snapshot-uid'} + } + } + } + mock_watch.stream.return_value = [mock_event] + + trigger_name = f"test-trigger" + result = self.client.checkpoint(trigger_name) + + self.assertEqual(result.execution_result.exit_code, 0) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn(datetime.now().strftime('%Y%m%d'), result.trigger_name) + + # Verify create call + expected_manifest = { + 'apiVersion': f'{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}', + 'kind': f'{PODSNAPSHOT_API_KIND}', + 'metadata': { + 'name': trigger_name, + 'namespace': 'test-ns' + }, + 'spec': { + 'targetPod': 'test-pod' + } + } + mock_custom.create_namespaced_custom_object.assert_called_once_with( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace='test-ns', + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + body=expected_manifest + ) + logging.info("Finished test_checkpoint_success.") + + def test_checkpoint_controller_not_ready(self): + """Test checkpoint when controller is not ready.""" + logging.info("Starting test_checkpoint_controller_not_ready...") + self.client.controller_ready = False + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn('Snapshot controller is not ready', result.execution_result.stderr) + logging.info("Finished test_checkpoint_controller_not_ready.") + + def test_checkpoint_no_pod_name(self): + """Test checkpoint when pod name is not set.""" + logging.info("Starting test_checkpoint_no_pod_name...") + self.client.controller_ready = True + self.client.pod_name = None + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn('Sandbox pod name not found', result.execution_result.stderr) + logging.info("Finished test_checkpoint_no_pod_name.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): + """Test checkpoint timeout scenario.""" + logging.info("Starting test_checkpoint_timeout...") + mock_custom = MagicMock() + mock_custom_class.return_value = mock_custom + + mock_watch = MagicMock() + mock_watch_class.return_value = mock_watch + + self.client.pod_name = 'test-pod' + self.client.controller_ready = True + self.client.podsnapshot_timeout = 1 + + # Mock empty stream (timeout) + mock_watch.stream.return_value = [] + + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('timed out', result.execution_result.stderr) + logging.info("Finished test_checkpoint_timeout.") + + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__') + def test_exit(self, mock_super_exit): + """Test __exit__ method.""" + logging.info("Starting test_exit...") + self.client.__exit__(None, None, None) + mock_super_exit.assert_called_once_with(None, None, None) + logging.info("Finished test_exit.") + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From dde9dd27bbc450ac6bd09ed018fef03ea3dd90e1 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 7 Feb 2026 00:58:02 +0000 Subject: [PATCH 05/12] Update test --- .../test_podsnapshot_client.py | 67 ++++++++----------- .../test_podsnapshot_extension.py | 0 2 files changed, 28 insertions(+), 39 deletions(-) rename clients/python/agentic-sandbox-client/{ => agentic_sandbox/gke_extensions}/test_podsnapshot_client.py (84%) rename clients/python/agentic-sandbox-client/{agentic_sandbox/gke_extensions => }/test_podsnapshot_extension.py (100%) diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py similarity index 84% rename from clients/python/agentic-sandbox-client/test_podsnapshot_client.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index a8d1f567b..ae0a16a31 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -40,25 +40,34 @@ def load_kubernetes_config(): class TestPodSnapshotSandboxClient(unittest.TestCase): - def setUp(self): + @patch('kubernetes.config') + def setUp(self, mock_config): logging.info("Setting up TestPodSnapshotSandboxClient...") + # Mock kubernetes config loading + mock_config.load_incluster_config.side_effect = config.ConfigException("Not in cluster") + mock_config.load_kube_config.return_value = None + # Create client without patching super, as it's tested separately with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): self.client = PodSnapshotSandboxClient('test-template') + + # Mock the kubernetes APIs on the client instance + self.client.custom_objects_api = MagicMock() + self.client.core_v1_api = MagicMock() + logging.info("Finished setting up TestPodSnapshotSandboxClient.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient') - def test_init(self, mock_super): + def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - client = PodSnapshotSandboxClient('test-template') - mock_super.assert_called_once_with( - 'test-template', - podsnapshot_timeout=180, - server_port=8080 - ) - self.assertTrue(client.controller_ready) + with patch('agentic_sandbox.sandbox_client.SandboxClient.__init__', return_value=None) as mock_super: + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + client = PodSnapshotSandboxClient('test-template') + mock_super.assert_called_once_with( + 'test-template', + server_port=8080 + ) + self.assertFalse(client.controller_ready) self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") @@ -133,13 +142,11 @@ def test_snapshot_controller_ready_not_ready(self, mock_v1_class): logging.info("Finished test_snapshot_controller_ready_not_ready.") @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') - def test_checkpoint_success(self, mock_custom_class, mock_watch_class): + def test_checkpoint_success(self, mock_watch_class): """Test successful checkpoint creation.""" logging.info("Starting test_checkpoint_success...") - mock_custom = MagicMock() - mock_custom_class.return_value = mock_custom - + + # Mock the watch mock_watch = MagicMock() mock_watch_class.return_value = mock_watch @@ -164,32 +171,14 @@ def test_checkpoint_success(self, mock_custom_class, mock_watch_class): } mock_watch.stream.return_value = [mock_event] - trigger_name = f"test-trigger" - result = self.client.checkpoint(trigger_name) + result = self.client.checkpoint('test-trigger') self.assertEqual(result.execution_result.exit_code, 0) self.assertIn('test-trigger', result.trigger_name) - self.assertIn(datetime.now().strftime('%Y%m%d'), result.trigger_name) - - # Verify create call - expected_manifest = { - 'apiVersion': f'{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}', - 'kind': f'{PODSNAPSHOT_API_KIND}', - 'metadata': { - 'name': trigger_name, - 'namespace': 'test-ns' - }, - 'spec': { - 'targetPod': 'test-pod' - } - } - mock_custom.create_namespaced_custom_object.assert_called_once_with( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace='test-ns', - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - body=expected_manifest - ) + self.assertIn('snapshot-uid', result.execution_result.stdout) + + # Verify create call was made + self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() logging.info("Finished test_checkpoint_success.") def test_checkpoint_controller_not_ready(self): diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py similarity index 100% rename from clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py rename to clients/python/agentic-sandbox-client/test_podsnapshot_extension.py From 1a361437846f14000860c36f98eb210871d4c8c0 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 12 Feb 2026 17:15:42 +0000 Subject: [PATCH 06/12] Address comments --- .../agentic_sandbox/constants.py | 2 - .../agentic_sandbox/extensions/__init__.py | 1 + .../gke_extensions/podsnapshot.md | 14 +- .../gke_extensions/podsnapshot_client.py | 136 +++++++------- .../gke_extensions/test_podsnapshot_client.py | 169 +++++++++--------- .../python-counter-template.yaml | 4 +- .../test_podsnapshot_extension.py | 130 ++++++++------ 7 files changed, 238 insertions(+), 218 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py index 8c51d4c31..79959468a 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -33,7 +33,5 @@ PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" PODSNAPSHOT_API_KIND = "PodSnapshotManualTrigger" -SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" -SNAPSHOT_CONTROLLER_NAME = "pod-snapshot-controller" SNAPSHOT_AGENT = "pod-snapshot-agent" \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 1002b1472..1a72134ef 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 7597441ba..9f3ff5efb 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -23,12 +23,12 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. * **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. - * The trigger_name is suffixed with the current datetime. + * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns a tuple of ExecutionResult and the final trigger name. + * Returns the CheckpointResponse object(success, error_code, error_reason, trigger_name). * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: - * TBD + * TBD * **`delete_snapshots(self, trigger_name: str) -> int`**: * TBD * **Automatic Cleanup**: @@ -62,9 +62,13 @@ This file, located in the parent directory (`clients/python/agentic-sandbox-clie pip install -e clients/python/agentic-sandbox-client/ ``` -3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in the standard cluster running inside gVisor(Userguide). The GCS bucket to store the pod snapshot states and respective permissions must be applied. +3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in a **GKE standard cluster** running with **gVisor**. + * For detailed setup instructions, refer to the [GKE Pod Snapshots public documentation](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/pod-snapshots). + * Ensure a GCS bucket is configured to store the pod snapshot states and that the necessary IAM permissions are applied. + 4. **CRDs**: `PodSnapshotStorageConfig`, `PodSnapshotPolicy` CRDs must be applied. `PodSnapshotPolicy` should specify the selector match labels. -5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. + +5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor, appropriate KSA and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. ### Running Tests: diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index d76af3188..912493cf2 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -32,20 +32,27 @@ @dataclass class SnapshotResult: """Result of a snapshot processing operation.""" + snapshot_uid: str snapshot_timestamp: str + @dataclass class CheckpointResponse: """Structured response for checkpoint operations.""" - execution_result: ExecutionResult + + success: bool trigger_name: str + error_reason: str + error_code: int + class SnapshotPersistenceManager: """ Manages local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by trigger_name. """ + def __init__(self): """Initializes the persistence manager and ensures the secure directory exists.""" pass @@ -61,7 +68,7 @@ def _load_metadata(self) -> dict[str, Any]: def save_snapshot_metadata(self, record: dict[str, Any]): """Saves a snapshot record to the local registry.""" pass - + def delete_snapshot_metadata(self, trigger_name: str): """Deletes a snapshot record from the local registry.""" pass @@ -70,7 +77,7 @@ def delete_snapshot_metadata(self, trigger_name: str): class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. - Handles the case only when triggerConfig is type manual. + Currently supports manual triggering via PodSnapshotManualTrigger. """ def __init__( @@ -80,14 +87,12 @@ def __init__( server_port: int = 8080, **kwargs, ): - super().__init__( - template_name, server_port=server_port, **kwargs - ) + super().__init__(template_name, server_port=server_port, **kwargs) self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - def __enter__(self) -> 'PodSnapshotSandboxClient': + def __enter__(self) -> "PodSnapshotSandboxClient": self.controller_ready = self.snapshot_controller_ready() super().__enter__() return self @@ -97,8 +102,10 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() - logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") - + logging.info( + f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." + ) + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -107,35 +114,40 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: version=PODSNAPSHOT_API_VERSION, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, field_selector=f"metadata.name={trigger_name}", - timeout_seconds=self.podsnapshot_timeout + timeout_seconds=self.podsnapshot_timeout, ): if event["type"] in ["ADDED", "MODIFIED"]: obj = event["object"] status = obj.get("status", {}) conditions = status.get("conditions", []) - + for condition in conditions: if ( condition.get("type") == "Triggered" and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - snapshot_uid = status.get('snapshotCreated', {}).get('name') - snapshot_timestamp = condition.get('lastTransitionTime') - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}") + snapshot_uid = status.get("snapshotCreated", {}).get("name") + snapshot_timestamp = condition.get("lastTransitionTime") + logging.info( + f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}" + ) w.stop() - return SnapshotResult(snapshot_uid=snapshot_uid, snapshot_timestamp=snapshot_timestamp) + return SnapshotResult( + snapshot_uid=snapshot_uid, + snapshot_timestamp=snapshot_timestamp, + ) except Exception as e: logging.error(f"Error watching snapshot: {e}") raise - raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") - + raise TimeoutError( + f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds." + ) def snapshot_controller_ready(self) -> bool: """ - Checks if the snapshot controller and agent pods are running. - Checks both self-installed (gps-system) and GKE-managed pod snapshot systems. + Checks if the snapshot agent pods are running in a GKE-managed pod snapshot cluster. """ if self.controller_ready: @@ -146,7 +158,9 @@ def snapshot_controller_ready(self) -> bool: def check_namespace(namespace: str, required_components: list[str]) -> bool: try: pods = core_v1_api.list_namespaced_pod(namespace) - found_components = {component: False for component in required_components} + found_components = { + component: False for component in required_components + } for pod in pods.items: if pod.status.phase == "Running": @@ -159,11 +173,6 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: except ApiException: return False - # Check self-installed: requires both controller and agent in gps-system - if check_namespace(SNAPSHOT_NAMESPACE_SELF_INSTALLED, [SNAPSHOT_CONTROLLER_NAME, SNAPSHOT_AGENT]): - self.controller_ready = True - return True - # Check managed: requires only agent in gke-managed-pod-snapshots if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): self.controller_ready = True @@ -172,7 +181,6 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> CheckpointResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. @@ -183,28 +191,25 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: - return ExecutionResult( - stdout="", - stderr="Snapshot controller is not ready. Ensure it is installed and running.", - exit_code=1 - ), trigger_name + return CheckpointResponse( + success=False, + trigger_name=trigger_name, + error_reason="Snapshot controller is not ready. Ensure it is installed and running.", + error_code=1, + ) if not self.pod_name: - return ExecutionResult( - stdout="", - stderr="Sandbox pod name not found. Ensure sandbox is created.", - exit_code=1 - ), trigger_name + return CheckpointResponse( + success=False, + trigger_name=trigger_name, + error_reason="Sandbox pod name not found. Ensure sandbox is created.", + error_code=1, + ) manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", "kind": f"{PODSNAPSHOT_API_KIND}", - "metadata": { - "name": trigger_name, - "namespace": self.namespace - }, - "spec": { - "targetPod": self.pod_name - } + "metadata": {"name": trigger_name, "namespace": self.namespace}, + "spec": {"targetPod": self.pod_name}, } try: @@ -213,41 +218,36 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: version=PODSNAPSHOT_API_VERSION, namespace=self.namespace, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - body=manifest + body=manifest, ) snapshot_result = self._wait_for_snapshot_processed(trigger_name) - + # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager return CheckpointResponse( - execution_result=ExecutionResult( - stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully. Snapshot UID: {snapshot_result.snapshot_uid}", - stderr="", - exit_code=0 - ), - trigger_name=trigger_name + success=True, trigger_name=trigger_name, error_reason="", error_code=0 ) except ApiException as e: - logging.exception(f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}") + logging.exception( + f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" + ) return CheckpointResponse( - execution_result=ExecutionResult( - stdout="", - stderr=f"Failed to create PodSnapshotManualTrigger: {e}", - exit_code=1 - ), - trigger_name=trigger_name + success=False, + trigger_name=trigger_name, + error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", + error_code=1, ) except TimeoutError as e: - logging.exception(f"Snapshot creation timed out for trigger '{trigger_name}': {e}") + logging.exception( + f"Snapshot creation timed out for trigger '{trigger_name}': {e}" + ) return CheckpointResponse( - execution_result=ExecutionResult( - stdout="", - stderr=f"Snapshot creation timed out: {e}", - exit_code=1 - ), - trigger_name=trigger_name + success=False, + trigger_name=trigger_name, + error_reason=f"Snapshot creation timed out: {e}", + error_code=1, ) - + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ Checks for existing snapshots matching the label selector and optional policy name. @@ -257,7 +257,6 @@ def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | No """ pass - def delete_snapshots(self, trigger_name: str) -> int: """ Deletes snapshots matching the provided trigger name and the PSMT resources. @@ -265,9 +264,8 @@ def delete_snapshots(self, trigger_name: str) -> int: """ pass - def __exit__(self, exc_type, exc_val, exc_tb): """ Automatically cleans up the Sandbox. """ - super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file + super().__exit__(exc_type, exc_val, exc_tb) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index ae0a16a31..9c7e74ccd 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -17,11 +17,17 @@ import logging from unittest.mock import MagicMock, patch, call from datetime import datetime -from agentic_sandbox.gke_extensions.podsnapshot_client import PodSnapshotSandboxClient, SnapshotPersistenceManager +from agentic_sandbox.gke_extensions.podsnapshot_client import ( + PodSnapshotSandboxClient, + SnapshotPersistenceManager, +) from agentic_sandbox.constants import * from kubernetes import config -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + def load_kubernetes_config(): """Loads Kubernetes configuration, prioritizing kubeconfig and falling back to an environment variable.""" @@ -29,10 +35,14 @@ def load_kubernetes_config(): config.load_kube_config() logging.info("Kubernetes config loaded from kubeconfig file.") except config.ConfigException: - logging.info("Kubeconfig file not found, attempting to load from environment variable.") + logging.info( + "Kubeconfig file not found, attempting to load from environment variable." + ) try: config.load_kube_config(config_file=os.getenv("KUBECONFIG_FILE")) - logging.info("Kubernetes config loaded from KUBECONFIG_FILE environment variable.") + logging.info( + "Kubernetes config loaded from KUBECONFIG_FILE environment variable." + ) except Exception as e: logging.error(f"Could not load Kubernetes config: {e}", exc_info=True) raise @@ -40,66 +50,43 @@ def load_kubernetes_config(): class TestPodSnapshotSandboxClient(unittest.TestCase): - @patch('kubernetes.config') + @patch("kubernetes.config") def setUp(self, mock_config): logging.info("Setting up TestPodSnapshotSandboxClient...") # Mock kubernetes config loading - mock_config.load_incluster_config.side_effect = config.ConfigException("Not in cluster") + mock_config.load_incluster_config.side_effect = config.ConfigException( + "Not in cluster" + ) mock_config.load_kube_config.return_value = None - + # Create client without patching super, as it's tested separately - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - self.client = PodSnapshotSandboxClient('test-template') - + with patch.object( + PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True + ): + self.client = PodSnapshotSandboxClient("test-template") + # Mock the kubernetes APIs on the client instance self.client.custom_objects_api = MagicMock() self.client.core_v1_api = MagicMock() - + logging.info("Finished setting up TestPodSnapshotSandboxClient.") def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") - with patch('agentic_sandbox.sandbox_client.SandboxClient.__init__', return_value=None) as mock_super: - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - client = PodSnapshotSandboxClient('test-template') - mock_super.assert_called_once_with( - 'test-template', - server_port=8080 - ) + with patch( + "agentic_sandbox.sandbox_client.SandboxClient.__init__", return_value=None + ) as mock_super: + with patch.object( + PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True + ): + client = PodSnapshotSandboxClient("test-template") + mock_super.assert_called_once_with("test-template", server_port=8080) self.assertFalse(client.controller_ready) self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') - def test_snapshot_controller_ready_self_installed(self, mock_v1_class): - """Test snapshot_controller_ready for self-installed scenario.""" - logging.info("Starting test_snapshot_controller_ready_self_installed...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 - - # Mock pods in gps-system - mock_pod_controller = MagicMock() - mock_pod_controller.metadata.name = 'pod-snapshot-controller' - mock_pod_controller.status.phase = 'Running' - - mock_pod_agent = MagicMock() - mock_pod_agent.metadata.name = 'pod-snapshot-agent' - mock_pod_agent.status.phase = 'Running' - - mock_pods = MagicMock() - mock_pods.items = [mock_pod_controller, mock_pod_agent] - mock_v1.list_namespaced_pod.return_value = mock_pods - - self.client.controller_ready = False - result = self.client.snapshot_controller_ready() - - self.assertTrue(result) - self.assertTrue(self.client.controller_ready) - mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_SELF_INSTALLED) - logging.info("Finished test_snapshot_controller_ready_self_installed.") - - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") def test_snapshot_controller_ready_managed(self, mock_v1_class): """Test snapshot_controller_ready for managed scenario.""" logging.info("Starting test_snapshot_controller_ready_managed...") @@ -108,8 +95,8 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): # Mock pods in gke-managed-pod-snapshots mock_pod_agent = MagicMock() - mock_pod_agent.metadata.name = 'pod-snapshot-agent' - mock_pod_agent.status.phase = 'Running' + mock_pod_agent.metadata.name = "pod-snapshot-agent" + mock_pod_agent.status.phase = "Running" mock_pods = MagicMock() mock_pods.items = [mock_pod_agent] @@ -123,10 +110,10 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) logging.info("Finished test_snapshot_controller_ready_managed.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') - def test_snapshot_controller_ready_not_ready(self, mock_v1_class): + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") + def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): """Test snapshot_controller_ready when not ready.""" - logging.info("Starting test_snapshot_controller_ready_not_ready...") + logging.info("Starting test_snapshot_controller_ready_status_not_ready...") mock_v1 = MagicMock() mock_v1_class.return_value = mock_v1 @@ -139,43 +126,45 @@ def test_snapshot_controller_ready_not_ready(self, mock_v1_class): self.assertFalse(result) self.assertFalse(self.client.controller_ready) - logging.info("Finished test_snapshot_controller_ready_not_ready.") + logging.info("Finished test_snapshot_controller_ready_status_not_ready.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") def test_checkpoint_success(self, mock_watch_class): """Test successful checkpoint creation.""" logging.info("Starting test_checkpoint_success...") - + # Mock the watch mock_watch = MagicMock() mock_watch_class.return_value = mock_watch - self.client.pod_name = 'test-pod' + self.client.pod_name = "test-pod" self.client.controller_ready = True - self.client.namespace = 'test-ns' + self.client.namespace = "test-ns" # Mock the watch stream mock_event = { - 'type': 'MODIFIED', - 'object': { - 'status': { - 'conditions': [{ - 'type': 'Triggered', - 'status': 'True', - 'reason': 'Complete', - 'lastTransitionTime': '2023-01-01T00:00:00Z' - }], - 'snapshotCreated': {'name': 'snapshot-uid'} + "type": "MODIFIED", + "object": { + "status": { + "conditions": [ + { + "type": "Triggered", + "status": "True", + "reason": "Complete", + "lastTransitionTime": "2023-01-01T00:00:00Z", + } + ], + "snapshotCreated": {"name": "snapshot-uid"}, } - } + }, } mock_watch.stream.return_value = [mock_event] - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 0) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('snapshot-uid', result.execution_result.stdout) + self.assertEqual(result.error_code, 0) + self.assertTrue(result.success) + self.assertIn("test-trigger", result.trigger_name) # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() @@ -185,11 +174,12 @@ def test_checkpoint_controller_not_ready(self): """Test checkpoint when controller is not ready.""" logging.info("Starting test_checkpoint_controller_not_ready...") self.client.controller_ready = False - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('Snapshot controller is not ready', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("test-trigger", result.trigger_name) + self.assertIn("Snapshot controller is not ready", result.error_reason) logging.info("Finished test_checkpoint_controller_not_ready.") def test_checkpoint_no_pod_name(self): @@ -197,15 +187,16 @@ def test_checkpoint_no_pod_name(self): logging.info("Starting test_checkpoint_no_pod_name...") self.client.controller_ready = True self.client.pod_name = None - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('Sandbox pod name not found', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("test-trigger", result.trigger_name) + self.assertIn("Sandbox pod name not found", result.error_reason) logging.info("Finished test_checkpoint_no_pod_name.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): """Test checkpoint timeout scenario.""" logging.info("Starting test_checkpoint_timeout...") @@ -215,21 +206,21 @@ def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): mock_watch = MagicMock() mock_watch_class.return_value = mock_watch - self.client.pod_name = 'test-pod' + self.client.pod_name = "test-pod" self.client.controller_ready = True self.client.podsnapshot_timeout = 1 # Mock empty stream (timeout) mock_watch.stream.return_value = [] - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('timed out', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("timed out", result.error_reason) logging.info("Finished test_checkpoint_timeout.") - - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") def test_exit(self, mock_super_exit): """Test __exit__ method.""" logging.info("Starting test_exit...") @@ -239,4 +230,4 @@ def test_exit(self, mock_super_exit): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/clients/python/agentic-sandbox-client/python-counter-template.yaml b/clients/python/agentic-sandbox-client/python-counter-template.yaml index f4f288933..19407c1b2 100644 --- a/clients/python/agentic-sandbox-client/python-counter-template.yaml +++ b/clients/python/agentic-sandbox-client/python-counter-template.yaml @@ -2,7 +2,7 @@ apiVersion: extensions.agents.x-k8s.io/v1alpha1 kind: SandboxTemplate metadata: name: python-counter-template - namespace: default + namespace: sandbox-test labels: language: python spec: @@ -13,7 +13,7 @@ spec: labels: app: agent-sandbox-workload spec: - serviceAccountName: default + serviceAccountName: sandbox-test runtimeClassName: gvisor containers: - name: my-container1 diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 06e1665a7..45b817745 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -20,32 +20,41 @@ import re from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient -POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" -def test_checkpoint_respone(checkpoint_response, checkpoint_name): - assert hasattr(checkpoint_response, "execution_result"), "Checkpoint response missing 'execution_result' attribute" - assert hasattr(checkpoint_response, "trigger_name"), "Checkpoint response missing 'trigger_name' attribute" - - execution_result = checkpoint_response.execution_result - trigger_name = checkpoint_response.trigger_name - - print(f"Trigger Command Stdout: {execution_result.stdout.strip()}") - print(f"Trigger Command Stderr: {execution_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {execution_result.exit_code}") - - assert trigger_name.startswith(checkpoint_name), f"Expected trigger name prefix '{checkpoint_name}', but got '{trigger_name}'" - assert execution_result.stderr == "", f"Expected no error when creating checkpoint '{checkpoint_name}', but got: {execution_result.stderr.strip()}" - assert execution_result.exit_code == 0 - - -async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): +def test_checkpoint_response(checkpoint_response, checkpoint_name): + assert hasattr( + checkpoint_response, "trigger_name" + ), "Checkpoint response missing 'trigger_name' attribute" + + print(f"Trigger Name: {checkpoint_response.trigger_name}") + print(f"Success: {checkpoint_response.success}") + print(f"Error Code: {checkpoint_response.error_code}") + print(f"Error Reason: {checkpoint_response.error_reason}") + + assert checkpoint_response.trigger_name.startswith( + checkpoint_name + ), f"Expected trigger name prefix '{checkpoint_name}', but got '{checkpoint_response.trigger_name}'" + assert ( + checkpoint_response.success + ), f"Expected success=True, but got False. Reason: {checkpoint_response.error_reason}" + assert checkpoint_response.error_code == 0 + + +async def main( + template_name: str, + api_url: str | None, + namespace: str, + server_port: int, + labels: dict[str, str], +): """ Tests the Sandbox client by creating a sandbox, running a command, and then cleaning up. """ print( - f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---") + f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---" + ) # Load kube config try: @@ -65,50 +74,55 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p template_name=template_name, namespace=namespace, api_url=api_url, - server_port=server_port + server_port=server_port, ) as sandbox: print("\n======= Testing Pod Snapshot Extension =======") assert sandbox.controller_ready == True, "Sandbox controller is not ready." time.sleep(wait_time) - print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") + print( + f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds..." + ) checkpoint_response = sandbox.checkpoint(first_checkpoint_name) - test_checkpoint_respone(checkpoint_response, first_checkpoint_name) - + test_checkpoint_response(checkpoint_response, first_checkpoint_name) time.sleep(wait_time) - print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") + print( + f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds..." + ) checkpoint_response = sandbox.checkpoint(second_checkpoint_name) - test_checkpoint_respone(checkpoint_response, second_checkpoint_name) - + test_checkpoint_response(checkpoint_response, second_checkpoint_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( template_name=template_name, namespace=namespace, api_url=api_url, - server_port=server_port - ) as sandbox_restored: # restores from second_checkpoint_name by default + server_port=server_port, + ) as sandbox_restored: # restores from second_checkpoint_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - # Fetch logs using the Kubernetes API + # Fetch logs using the Kubernetes API logs = core_v1_api.read_namespaced_pod_log( - name=sandbox_restored.pod_name, - namespace=sandbox_restored.namespace + name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace ) # Extract the sequence of 'Count:' values from the pod logs counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] - assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." + assert ( + len(counts) > 0 + ), "Failed to retrieve any 'Count:' logs from restored pod." # Verify the counter resumed from the correct checkpoint state. # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). - min_expected_count_at_restore = wait_time * 2 + min_expected_count_at_restore = wait_time * 2 first_count_after_restore = counts[0] + print(f"First count after restore: {first_count_after_restore}") + assert first_count_after_restore >= min_expected_count_at_restore, ( f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " f"but got {first_count_after_restore}. The pod likely restarted from scratch." @@ -122,35 +136,48 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p finally: print("\n--- Sandbox Client Test Finished ---") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test the Sandbox client.") parser.add_argument( "--template-name", default="python-sandbox-template", - help="The name of the sandbox template to use for the test." + help="The name of the sandbox template to use for the test.", ) # Default is None to allow testing the Port-Forward fallback parser.add_argument( "--gateway-name", default=None, - help="The name of the Gateway resource. If omitted, defaults to local port-forward mode." + help="The name of the Gateway resource. If omitted, defaults to local port-forward mode.", ) parser.add_argument( "--gateway-namespace", default=None, - help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode." + help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode.", ) parser.add_argument( - "--api-url", help="Direct URL to router (e.g. http://localhost:8080)", default=None) - parser.add_argument("--namespace", default="default", - help="Namespace to create sandbox in") - parser.add_argument("--server-port", type=int, default=8888, - help="Port the sandbox container listens on") - parser.add_argument("--labels", nargs="+", default=["app=sandbox-test"], - help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)") + "--api-url", + help="Direct URL to router (e.g. http://localhost:8080)", + default=None, + ) + parser.add_argument( + "--namespace", default="default", help="Namespace to create sandbox in" + ) + parser.add_argument( + "--server-port", + type=int, + default=8888, + help="Port the sandbox container listens on", + ) + parser.add_argument( + "--labels", + nargs="+", + default=["app=sandbox-test"], + help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)", + ) args = parser.parse_args() @@ -162,11 +189,12 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p else: print(f"Warning: Ignoring invalid label format '{l}'. Use key=value.") - asyncio.run(main( - template_name=args.template_name, - api_url=args.api_url, - namespace=args.namespace, - server_port=args.server_port, - labels=labels_dict - )) -# python3 test_podsnapshot_extension.py --labels app=agent-sandbox-workload --template-name python-counter-template --namespace sandbox-test + asyncio.run( + main( + template_name=args.template_name, + api_url=args.api_url, + namespace=args.namespace, + server_port=args.server_port, + labels=labels_dict, + ) + ) From 31ea2a323d54fa10fc68b89664a1e5617daf6b19 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 12 Feb 2026 18:04:01 +0000 Subject: [PATCH 07/12] Enable persistent storage metadata, List and delete snapshots --- .../agentic_sandbox/constants.py | 1 + .../gke_extensions/podsnapshot.md | 12 +- .../gke_extensions/podsnapshot_client.py | 234 +++++++++++++++++- .../test_podsnapshot_extension.py | 10 + 4 files changed, 245 insertions(+), 12 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py index 79959468a..055684f7d 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -32,6 +32,7 @@ PODSNAPSHOT_PLURAL = "podsnapshots" PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" PODSNAPSHOT_API_KIND = "PodSnapshotManualTrigger" +PODSNAPSHOTPOLICY_PLURAL = "podsnapshotpolicies" SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" SNAPSHOT_AGENT = "pod-snapshot-agent" \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 9f3ff5efb..b9284cbd1 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -28,9 +28,15 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * The pod snapshot controller creates a `PodSnapshot` resource automatically. * Returns the CheckpointResponse object(success, error_code, error_reason, trigger_name). * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: - * TBD -* **`delete_snapshots(self, trigger_name: str) -> int`**: - * TBD + * Lists valid snapshots found in the local metadata storage (`~/.snapshot_metadata/.snapshots.json`). + * Filters by `policy_name` and `ready_only` status (default: True). + * Returns a list of dictionaries containing snapshot details (id, trigger_name, source_pod, uid, creationTimestamp, status, policy_name) sorted by creation timestamp (newest first). +* **`delete_snapshots(self, trigger_name: str | None = None) -> int`**: + * Deletes snapshots and their corresponding `PodSnapshotManualTrigger` resources. + * If `trigger_name` is provided, deletes only that specific snapshot. + * If `trigger_name` is `None`, deletes **ALL** snapshots found in the local metadata. + * Cleans up local metadata after successful deletion from K8s. + * Returns the count of successfully deleted snapshots. * **Automatic Cleanup**: * The `__exit__` method cleans up the `SandboxClaim` resources. diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 912493cf2..cccf289ad 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -16,6 +16,8 @@ import sys import os from typing import Any +from pathlib import Path +import json from dataclasses import dataclass from kubernetes import client, watch from kubernetes.client import ApiException @@ -55,23 +57,64 @@ class SnapshotPersistenceManager: def __init__(self): """Initializes the persistence manager and ensures the secure directory exists.""" - pass + self.secure_dir = Path.home() / ".snapshot_metadata" + self._ensure_secure_dir() + self.metadata_file = self.secure_dir / ".snapshots.json" def _ensure_secure_dir(self): """Ensures the directory exists with 700 permissions.""" - pass + if not self.secure_dir.exists(): + self.secure_dir.mkdir(parents=True) + self.secure_dir.chmod(0o700) def _load_metadata(self) -> dict[str, Any]: """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" - pass + if not self.metadata_file.exists(): + return {} + try: + with open(self.metadata_file, "r") as f: + data = json.load(f) + if isinstance(data, list): + # Handle legacy list format by clearing it (or converting if preferred, but identifying key is tricky if not consistent) + logging.warning("Found legacy list-format metadata. Resetting to empty dict.") + return {} + return data + except (json.JSONDecodeError, IOError) as e: + logging.warning(f"Failed to load snapshot metadata: {e}") + return {} def save_snapshot_metadata(self, record: dict[str, Any]): """Saves a snapshot record to the local registry.""" - pass + trigger_name = record.get("trigger_name") + if not trigger_name: + logging.error("Cannot save metadata: missing 'trigger_name'.") + return + + snapshots = self._load_metadata() + snapshots[trigger_name] = record + + try: + with open(self.metadata_file, "w") as f: + json.dump(snapshots, f, indent=4) + self.metadata_file.chmod(0o600) + logging.info(f"Snapshot metadata saved to {self.metadata_file}") + except IOError as e: + logging.error(f"Failed to save snapshot metadata: {e}") def delete_snapshot_metadata(self, trigger_name: str): """Deletes a snapshot record from the local registry.""" - pass + snapshots = self._load_metadata() + if trigger_name in snapshots: + del snapshots[trigger_name] + try: + with open(self.metadata_file, "w") as f: + json.dump(snapshots, f, indent=4) + self.metadata_file.chmod(0o600) + logging.info(f"Snapshot metadata deleted for '{trigger_name}'") + except IOError as e: + logging.error(f"Failed to save metadata after deletion: {e}") + else: + logging.info(f"No local metadata found for trigger '{trigger_name}' to delete.") class PodSnapshotSandboxClient(SandboxClient): @@ -91,11 +134,37 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout + self.persistence_manager = SnapshotPersistenceManager() def __enter__(self) -> "PodSnapshotSandboxClient": self.controller_ready = self.snapshot_controller_ready() super().__enter__() return self + + def _get_policy_labels(self, snapshot_uid: str) -> dict: + """ + Retrieves the policy labels of the specified PodSnapshot resource. + """ + try: + snapshot = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=snapshot_uid + ) + policy_name = snapshot.get("spec", {}).get("policyName", {}) + policy = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTPOLICY_PLURAL, + name=policy_name + ) + return policy.get("spec", {}).get("selector", {}).get("matchLabels", {}) + except ApiException as e: + logging.error(f"Failed to retrieve PodSnapshot '{snapshot_uid}': {e}") + raise def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: """ @@ -222,7 +291,21 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: ) snapshot_result = self._wait_for_snapshot_processed(trigger_name) - # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager + policy_labels = self._get_policy_labels(snapshot_result.snapshot_uid) + # Save metadata locally + try: + record = { + "trigger_name": trigger_name, + "uid": snapshot_result.snapshot_uid, + "template_name": self.template_name, + "policy_labels": policy_labels, + "namespace": self.namespace, + "claim_name": self.claim_name, + "timestamp": snapshot_result.snapshot_timestamp + } + self.persistence_manager.save_snapshot_metadata(record) + except Exception as e: + logging.warning(f"Failed to save snapshot metadata locally: {e}") return CheckpointResponse( success=True, trigger_name=trigger_name, error_reason="", error_code=0 @@ -252,17 +335,150 @@ def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | No """ Checks for existing snapshots matching the label selector and optional policy name. Returns a list of valid snapshots sorted by creation timestamp (newest first). + policy_name: Filters snapshots by their spec.policyName. ready_only: If True, filters out snapshots that are only in 'Ready' state. """ - pass + local_meta = self.persistence_manager._load_metadata() + if not local_meta: + logging.info("No local snapshot metadata found.") + return None + + valid_snapshots = [] + + for trigger_name, record in local_meta.items(): + uid = record.get("uid") + if not uid: + continue - def delete_snapshots(self, trigger_name: str) -> int: + try: + # Fetch the actual PodSnapshot resource to verify status and details + snapshot = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=uid + ) + except ApiException as e: + # If 404, it means it's in metadata but deleted from K8s. Skip it. + if e.status != 404: + logging.warning(f"Failed to fetch PodSnapshot '{uid}' (trigger: {trigger_name}): {e}") + continue + + spec = snapshot.get("spec", {}) + status = snapshot.get("status", {}) + conditions = status.get("conditions", []) + metadata = snapshot.get("metadata", {}) + + # Filter by policy_name if provided + if policy_name and spec.get("policyName") != policy_name: + continue + + # Check for Ready=True + is_ready = False + for cond in conditions: + if cond.get("type") == "Ready" and cond.get("status") == "True": + is_ready = True + break + + # Skip if only ready snapshots are requested + if ready_only and not is_ready: + continue + + valid_snapshots.append({ + "snapshot_id": metadata.get("name"), + "trigger_name": trigger_name, + "source_pod": metadata.get("labels", {}).get("podsnapshot.gke.io/pod-name", "Unknown"), + "uid": metadata.get("uid"), + "creationTimestamp": metadata.get("creationTimestamp", ""), + "status": "Ready" if is_ready else "NotReady", + "policy_name": spec.get("policyName") + }) + + if not valid_snapshots: + logging.info("No Ready snapshots found matching criteria in local metadata.") + return None + + # Sort snapshots by creation timestamp descending + valid_snapshots.sort(key=lambda x: x["creationTimestamp"], reverse=True) + logging.info(f"Found {len(valid_snapshots)} ready snapshots from local metadata.") + return valid_snapshots + + def delete_snapshots(self, trigger_name: str | None = None) -> int: """ Deletes snapshots matching the provided trigger name and the PSMT resources. Returns the count of successfully deleted snapshots. """ - pass + local_meta = self.persistence_manager._load_metadata() + + triggers_to_delete = [] + + if trigger_name: + if trigger_name in local_meta: + triggers_to_delete.append(trigger_name) + else: + logging.warning(f"Trigger '{trigger_name}' not found in local metadata. No deletion performed.") + return 0 + else: + logging.info("No trigger_name provided. Deleting ALL snapshots found in local metadata.") + triggers_to_delete = list(local_meta.keys()) + + if not triggers_to_delete: + logging.info("No local snapshot metadata found to process.") + return 0 + + delete_count = 0 + for trig in triggers_to_delete: + record = local_meta.get(trig) + if not record: + continue + + uid = record.get("uid") + + # Delete PodSnapshot (if UID is known) + if uid: + try: + logging.info(f"Deleting PodSnapshot '{uid}' associated with trigger '{trig}'...") + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=uid + ) + logging.info(f"PodSnapshot '{uid}' deleted.") + except ApiException as e: + if e.status == 404: + logging.info(f"PodSnapshot '{uid}' not found in K8s (already deleted?).") + else: + logging.error(f"Failed to delete PodSnapshot '{uid}': {e}") + else: + logging.warning(f"No UID associated with trigger '{trig}' in metadata. Skipping PodSnapshot deletion.") + + # Delete PodSnapshotManualTrigger + try: + logging.info(f"Deleting PodSnapshotManualTrigger '{trig}'...") + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name=trig + ) + logging.info(f"PodSnapshotManualTrigger '{trig}' deleted.") + except ApiException as e: + if e.status == 404: + logging.info(f"PodSnapshotManualTrigger '{trig}' not found in K8s (already deleted?).") + else: + logging.error(f"Failed to delete PodSnapshotManualTrigger '{trig}': {e}") + + # Cleanup Local Metadata + self.persistence_manager.delete_snapshot_metadata(trig) + delete_count += 1 + + logging.info(f"Snapshot deletion process completed. Deleted {delete_count} snapshots.") + return delete_count def __exit__(self, exc_type, exc_val, exc_tb): """ diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 45b817745..8666e9cbe 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -65,6 +65,7 @@ async def main( wait_time = 10 first_checkpoint_name = "test-snapshot-10" second_checkpoint_name = "test-snapshot-20" + policy_name = "example-psp-workload" core_v1_api = client.CoreV1Api() try: @@ -94,6 +95,11 @@ async def main( checkpoint_response = sandbox.checkpoint(second_checkpoint_name) test_checkpoint_response(checkpoint_response, second_checkpoint_name) + print("\n***** List all existing ready snapshots with the policy name. *****") + snapshots = sandbox.list_snapshots(policy_name=policy_name) + for snap in snapshots: + print(f"Snapshot ID: {snap['snapshot_id']}, Triggered By: {snap['trigger_name']}, Source Pod: {snap['source_pod']}, Creation Time: {snap['creationTimestamp']}, Policy Name: {snap['policy_name']}") + print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( template_name=template_name, @@ -128,6 +134,10 @@ async def main( f"but got {first_count_after_restore}. The pod likely restarted from scratch." ) + print("\n**** Deleting snapshots *****") + deleted_snapshots = sandbox_restored.delete_snapshots() + print(f"Deleted Snapshots: {deleted_snapshots}") + print("--- Pod Snapshot Test Passed! ---") except Exception as e: From 1ba1f2ccdc98df22258800db49274e2e811b0730 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 12 Feb 2026 20:37:29 +0000 Subject: [PATCH 08/12] Change in term: Update Checkpoint to Snapshot --- .../gke_extensions/podsnapshot.md | 10 ++-- .../gke_extensions/podsnapshot_client.py | 16 +++---- .../gke_extensions/test_podsnapshot_client.py | 40 ++++++++-------- .../test_podsnapshot_extension.py | 46 +++++++++---------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 9f3ff5efb..bf5dfd57c 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -1,6 +1,6 @@ # Agentic Sandbox Pod Snapshot Extension -This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. +This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger snapshots of a running sandbox and restore a new sandbox from the recently created snapshot. ## `podsnapshot_client.py` @@ -21,12 +21,12 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. -* **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: +* **`snapshot(self, trigger_name: str) -> SnapshotResponse`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns the CheckpointResponse object(success, error_code, error_reason, trigger_name). + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: * TBD * **`delete_snapshots(self, trigger_name: str) -> int`**: @@ -36,11 +36,11 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle ## `test_podsnapshot_extension.py` -This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the checkpoint and restore functionality. +This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the snapshot and restore functionality. ### Test Phases: -1. **Phase 1: Starting Counter & Checkpointing**: +1. **Phase 1: Starting Counter & Snapshotting**: * Starts a sandbox with a counter application. * Takes a snapshot (`test-snapshot-10`) after ~10 seconds. * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 912493cf2..6d7db73f8 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -38,8 +38,8 @@ class SnapshotResult: @dataclass -class CheckpointResponse: - """Structured response for checkpoint operations.""" +class SnapshotResponse: + """Structured response for snapshot operations.""" success: bool trigger_name: str @@ -181,7 +181,7 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> CheckpointResponse: + def snapshot(self, trigger_name: str) -> SnapshotResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. The trigger_name will be suffixed with the current datetime. @@ -191,14 +191,14 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason="Snapshot controller is not ready. Ensure it is installed and running.", error_code=1, ) if not self.pod_name: - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason="Sandbox pod name not found. Ensure sandbox is created.", @@ -224,14 +224,14 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return CheckpointResponse( + return SnapshotResponse( success=True, trigger_name=trigger_name, error_reason="", error_code=0 ) except ApiException as e: logging.exception( f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" ) - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", @@ -241,7 +241,7 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: logging.exception( f"Snapshot creation timed out for trigger '{trigger_name}': {e}" ) - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason=f"Snapshot creation timed out: {e}", diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index 9c7e74ccd..4660ee080 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -129,9 +129,9 @@ def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): logging.info("Finished test_snapshot_controller_ready_status_not_ready.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") - def test_checkpoint_success(self, mock_watch_class): - """Test successful checkpoint creation.""" - logging.info("Starting test_checkpoint_success...") + def test_snapshot_success(self, mock_watch_class): + """Test successful snapshot creation.""" + logging.info("Starting test_snapshot_success...") # Mock the watch mock_watch = MagicMock() @@ -160,7 +160,7 @@ def test_checkpoint_success(self, mock_watch_class): } mock_watch.stream.return_value = [mock_event] - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 0) self.assertTrue(result.success) @@ -168,38 +168,38 @@ def test_checkpoint_success(self, mock_watch_class): # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() - logging.info("Finished test_checkpoint_success.") + logging.info("Finished test_snapshot_success.") - def test_checkpoint_controller_not_ready(self): - """Test checkpoint when controller is not ready.""" - logging.info("Starting test_checkpoint_controller_not_ready...") + def test_snapshot_controller_not_ready(self): + """Test snapshot when controller is not ready.""" + logging.info("Starting test_snapshot_controller_not_ready...") self.client.controller_ready = False - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("test-trigger", result.trigger_name) self.assertIn("Snapshot controller is not ready", result.error_reason) - logging.info("Finished test_checkpoint_controller_not_ready.") + logging.info("Finished test_snapshot_controller_not_ready.") - def test_checkpoint_no_pod_name(self): - """Test checkpoint when pod name is not set.""" - logging.info("Starting test_checkpoint_no_pod_name...") + def test_snapshot_no_pod_name(self): + """Test snapshot when pod name is not set.""" + logging.info("Starting test_snapshot_no_pod_name...") self.client.controller_ready = True self.client.pod_name = None - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("test-trigger", result.trigger_name) self.assertIn("Sandbox pod name not found", result.error_reason) - logging.info("Finished test_checkpoint_no_pod_name.") + logging.info("Finished test_snapshot_no_pod_name.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") - def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): - """Test checkpoint timeout scenario.""" - logging.info("Starting test_checkpoint_timeout...") + def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): + """Test snapshot timeout scenario.""" + logging.info("Starting test_snapshot_timeout...") mock_custom = MagicMock() mock_custom_class.return_value = mock_custom @@ -213,12 +213,12 @@ def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): # Mock empty stream (timeout) mock_watch.stream.return_value = [] - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("timed out", result.error_reason) - logging.info("Finished test_checkpoint_timeout.") + logging.info("Finished test_snapshot_timeout.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") def test_exit(self, mock_super_exit): diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 45b817745..d7a2bf7f9 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -21,23 +21,23 @@ from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient -def test_checkpoint_response(checkpoint_response, checkpoint_name): +def test_snapshot_response(snapshot_response, snapshot_name): assert hasattr( - checkpoint_response, "trigger_name" - ), "Checkpoint response missing 'trigger_name' attribute" + snapshot_response, "trigger_name" + ), "snapshot response missing 'trigger_name' attribute" - print(f"Trigger Name: {checkpoint_response.trigger_name}") - print(f"Success: {checkpoint_response.success}") - print(f"Error Code: {checkpoint_response.error_code}") - print(f"Error Reason: {checkpoint_response.error_reason}") + print(f"Trigger Name: {snapshot_response.trigger_name}") + print(f"Success: {snapshot_response.success}") + print(f"Error Code: {snapshot_response.error_code}") + print(f"Error Reason: {snapshot_response.error_reason}") - assert checkpoint_response.trigger_name.startswith( - checkpoint_name - ), f"Expected trigger name prefix '{checkpoint_name}', but got '{checkpoint_response.trigger_name}'" + assert snapshot_response.trigger_name.startswith( + snapshot_name + ), f"Expected trigger name prefix '{snapshot_name}', but got '{snapshot_response.trigger_name}'" assert ( - checkpoint_response.success - ), f"Expected success=True, but got False. Reason: {checkpoint_response.error_reason}" - assert checkpoint_response.error_code == 0 + snapshot_response.success + ), f"Expected success=True, but got False. Reason: {snapshot_response.error_reason}" + assert snapshot_response.error_code == 0 async def main( @@ -63,8 +63,8 @@ async def main( config.load_kube_config() wait_time = 10 - first_checkpoint_name = "test-snapshot-10" - second_checkpoint_name = "test-snapshot-20" + first_snapshot_name = "test-snapshot-10" + second_snapshot_name = "test-snapshot-20" core_v1_api = client.CoreV1Api() try: @@ -81,18 +81,18 @@ async def main( time.sleep(wait_time) print( - f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds..." + f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds..." ) - checkpoint_response = sandbox.checkpoint(first_checkpoint_name) - test_checkpoint_response(checkpoint_response, first_checkpoint_name) + snapshot_response = sandbox.snapshot(first_snapshot_name) + test_snapshot_response(snapshot_response, first_snapshot_name) time.sleep(wait_time) print( - f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds..." + f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds..." ) - checkpoint_response = sandbox.checkpoint(second_checkpoint_name) - test_checkpoint_response(checkpoint_response, second_checkpoint_name) + snapshot_response = sandbox.snapshot(second_snapshot_name) + test_snapshot_response(snapshot_response, second_snapshot_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( @@ -100,7 +100,7 @@ async def main( namespace=namespace, api_url=api_url, server_port=server_port, - ) as sandbox_restored: # restores from second_checkpoint_name by default + ) as sandbox_restored: # restores from second_snapshot_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) @@ -116,7 +116,7 @@ async def main( len(counts) > 0 ), "Failed to retrieve any 'Count:' logs from restored pod." - # Verify the counter resumed from the correct checkpoint state. + # Verify the counter resumed from the correct snapshot state. # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). min_expected_count_at_restore = wait_time * 2 first_count_after_restore = counts[0] From 35b70964aedcaa1a903c2bbfa16a426cafc3a62f Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 14 Feb 2026 01:21:37 +0000 Subject: [PATCH 09/12] Add is_restored --- .../gke_extensions/podsnapshot.md | 10 +-- .../gke_extensions/podsnapshot_client.py | 81 +++++++++---------- .../gke_extensions/test_podsnapshot_client.py | 1 - .../test_podsnapshot_extension.py | 30 ++----- 4 files changed, 48 insertions(+), 74 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index bf5dfd57c..d0c0bae92 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -4,11 +4,7 @@ This directory contains the Python client extension for interacting with the Age ## `podsnapshot_client.py` -This file defines the `SnapshotPersistenceManager` and `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. - -### `SnapshotPersistenceManager` - -A utility class for managing local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by `trigger_name`. +This file defines the `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. ### `PodSnapshotSandboxClient` @@ -27,10 +23,6 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). -* **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: - * TBD -* **`delete_snapshots(self, trigger_name: str) -> int`**: - * TBD * **Automatic Cleanup**: * The `__exit__` method cleans up the `SandboxClaim` resources. diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 6d7db73f8..ea26999d5 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -43,37 +43,11 @@ class SnapshotResponse: success: bool trigger_name: str + snapshot_uid: str error_reason: str error_code: int -class SnapshotPersistenceManager: - """ - Manages local persistence of snapshot metadata in a secure directory. - Stores metadata as a dictionary keyed by trigger_name. - """ - - def __init__(self): - """Initializes the persistence manager and ensures the secure directory exists.""" - pass - - def _ensure_secure_dir(self): - """Ensures the directory exists with 700 permissions.""" - pass - - def _load_metadata(self) -> dict[str, Any]: - """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" - pass - - def save_snapshot_metadata(self, record: dict[str, Any]): - """Saves a snapshot record to the local registry.""" - pass - - def delete_snapshot_metadata(self, trigger_name: str): - """Deletes a snapshot record from the local registry.""" - pass - - class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. @@ -222,10 +196,12 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: ) snapshot_result = self._wait_for_snapshot_processed(trigger_name) - # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return SnapshotResponse( - success=True, trigger_name=trigger_name, error_reason="", error_code=0 + success=True, + trigger_name=trigger_name, + snapshot_uid=snapshot_result.snapshot_uid, + error_reason="", + error_code=0, ) except ApiException as e: logging.exception( @@ -234,6 +210,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", error_code=1, ) @@ -244,25 +221,45 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason=f"Snapshot creation timed out: {e}", error_code=1, ) - def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: + def is_restored(self) -> tuple[bool, str | None]: """ - Checks for existing snapshots matching the label selector and optional policy name. - Returns a list of valid snapshots sorted by creation timestamp (newest first). - policy_name: Filters snapshots by their spec.policyName. - ready_only: If True, filters out snapshots that are only in 'Ready' state. + Checks if the sandbox pod was restored from a snapshot. + Verifies this by checking for the 'PodRestored' condition in the pod status. + Returns: + tuple[bool, str | None]: (True, snapshot_uuid) if restored, (False, None) otherwise. """ - pass + if not self.pod_name: + logging.warning("Cannot check restore status: pod_name is unknown.") + return False, None - def delete_snapshots(self, trigger_name: str) -> int: - """ - Deletes snapshots matching the provided trigger name and the PSMT resources. - Returns the count of successfully deleted snapshots. - """ - pass + try: + v1 = client.CoreV1Api() + pod = v1.read_namespaced_pod(self.pod_name, self.namespace) + + if not pod.status or not pod.status.conditions: + return False, None + + for condition in pod.status.conditions: + if condition.type == "PodRestored" and condition.status == "True": + # Attempt to extract UUID from the message + # Message format: "pod successfully restored from pod snapshot namespace/uuid" + if condition.message: + parts = condition.message.split("/") + if len(parts) > 1: + return True, parts[-1].split()[0] + + return True, None + + return False, None + + except ApiException as e: + logging.error(f"Failed to check pod restore status: {e}") + return False, None def __exit__(self, exc_type, exc_val, exc_tb): """ diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index 4660ee080..028538f68 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -19,7 +19,6 @@ from datetime import datetime from agentic_sandbox.gke_extensions.podsnapshot_client import ( PodSnapshotSandboxClient, - SnapshotPersistenceManager, ) from agentic_sandbox.constants import * from kubernetes import config diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index d7a2bf7f9..fdc59e6fb 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -27,6 +27,7 @@ def test_snapshot_response(snapshot_response, snapshot_name): ), "snapshot response missing 'trigger_name' attribute" print(f"Trigger Name: {snapshot_response.trigger_name}") + print(f"Snapshot UID: {snapshot_response.snapshot_uid}") print(f"Success: {snapshot_response.success}") print(f"Error Code: {snapshot_response.error_code}") print(f"Error Reason: {snapshot_response.error_reason}") @@ -93,6 +94,8 @@ async def main( ) snapshot_response = sandbox.snapshot(second_snapshot_name) test_snapshot_response(snapshot_response, second_snapshot_name) + recent_snapshot_uid = snapshot_response.snapshot_uid + print(f"Recent snapshot UID: {recent_snapshot_uid}") print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( @@ -105,29 +108,12 @@ async def main( print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - # Fetch logs using the Kubernetes API - logs = core_v1_api.read_namespaced_pod_log( - name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace - ) - - # Extract the sequence of 'Count:' values from the pod logs - counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] + is_restored, restored_snapshot_uid = sandbox_restored.is_restored() + assert is_restored, "Pod was not restored from a snapshot." assert ( - len(counts) > 0 - ), "Failed to retrieve any 'Count:' logs from restored pod." - - # Verify the counter resumed from the correct snapshot state. - # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). - min_expected_count_at_restore = wait_time * 2 - first_count_after_restore = counts[0] - - print(f"First count after restore: {first_count_after_restore}") - - assert first_count_after_restore >= min_expected_count_at_restore, ( - f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " - f"but got {first_count_after_restore}. The pod likely restarted from scratch." - ) - + restored_snapshot_uid == recent_snapshot_uid + ), "Restored snapshot UID does not match the most recent snapshot UID." + print("Pod was restored from the most recent snapshot.") print("--- Pod Snapshot Test Passed! ---") except Exception as e: From 1c56d9a3315403e675788a476269b94b81a0edc4 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 14 Feb 2026 01:25:18 +0000 Subject: [PATCH 10/12] update read me --- .../agentic_sandbox/gke_extensions/podsnapshot.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index d0c0bae92..121625c45 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -22,7 +22,11 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). +* **`isrestored(self) -> tuple[bool, str | None]`**: + * Checks if the sandbox pod was restored from a snapshot. + * Verifies this by checking for the 'PodRestored' condition in the pod status. + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). * **Automatic Cleanup**: * The `__exit__` method cleans up the `SandboxClaim` resources. @@ -38,7 +42,7 @@ This file, located in the parent directory (`clients/python/agentic-sandbox-clie * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. 2. **Phase 2: Restoring from Recent Snapshot**: * Restores a sandbox from the second snapshot. - * Verifies that the counter continues from where it left off (>= 20), proving the state was preserved. + * Verifies that sandbox has been restored from the recent snapshot. ### Prerequisites From 24cdef58af35a950285c0056257f9e033ec1238e Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 19 Feb 2026 01:53:09 +0000 Subject: [PATCH 11/12] Update is_restored and error catch --- .../gke_extensions/podsnapshot.md | 17 +- .../gke_extensions/podsnapshot_client.py | 217 ++++++++++++---- .../gke_extensions/test_podsnapshot_client.py | 243 ++++++++++++++++-- .../test_podsnapshot_extension.py | 13 +- 4 files changed, 399 insertions(+), 91 deletions(-) diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md index 121625c45..33d5577d8 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md @@ -16,19 +16,20 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Initializes the client with optional podsnapshot timeout and server port. * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: - * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. + * Checks if the snapshot agent (GKE managed) is running and ready. * **`snapshot(self, trigger_name: str) -> SnapshotResponse`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. - * The trigger_name is suffixed with unique hash. + * The `trigger_name` is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). -* **`isrestored(self) -> tuple[bool, str | None]`**: - * Checks if the sandbox pod was restored from a snapshot. - * Verifies this by checking for the 'PodRestored' condition in the pod status. - * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). -* **Automatic Cleanup**: - * The `__exit__` method cleans up the `SandboxClaim` resources. +* **`is_restored_from_snapshot(self, snapshot_uid: str) -> RestoreResult`**: + * Checks if the sandbox pod was restored from the specified snapshot. + * Verifies restoration by checking the 'PodRestored' condition in the pod status and confirming the message contains the expected snapshot UID. + * Returns RestoreResult object(success, error_code, error_reason). +* **`__exit__(self)`**: + * Cleans up the `PodSnapshotManualTrigger` resources. + * Cleans up the `SandboxClaim` resources. ## `test_podsnapshot_extension.py` diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py index ea26999d5..65eadf8ba 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py @@ -22,11 +22,7 @@ from ..sandbox_client import SandboxClient, ExecutionResult from ..constants import * -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - stream=sys.stdout, -) +logger = logging.getLogger(__name__) @dataclass @@ -48,6 +44,15 @@ class SnapshotResponse: error_code: int +@dataclass +class RestoreResult: + """Result of a restore operation.""" + + success: bool + error_reason: str + error_code: int + + class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. @@ -65,21 +70,48 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout + self.core_v1_api = client.CoreV1Api() + + self.created_manual_triggers = [] def __enter__(self) -> "PodSnapshotSandboxClient": self.controller_ready = self.snapshot_controller_ready() super().__enter__() return self - def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: + def _parse_snapshot_result(self, obj) -> SnapshotResult | None: + """Parses the object to see if snapshot is complete.""" + status = obj.get("status", {}) + conditions = status.get("conditions", []) + for condition in conditions: + if ( + condition.get("type") == "Triggered" + and condition.get("status") == "True" + and condition.get("reason") == "Complete" + ): + snapshot_uid = status.get("snapshotCreated", {}).get("name") + snapshot_timestamp = condition.get("lastTransitionTime") + return SnapshotResult( + snapshot_uid=snapshot_uid, + snapshot_timestamp=snapshot_timestamp, + ) + return None + + def _wait_for_snapshot_processed( + self, trigger_name: str, resource_version: str | None = None + ) -> SnapshotResult: """ Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() - logging.info( + logger.info( f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." ) + kwargs = {} + if resource_version: + kwargs["resource_version"] = resource_version + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -89,30 +121,19 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, field_selector=f"metadata.name={trigger_name}", timeout_seconds=self.podsnapshot_timeout, + **kwargs, ): if event["type"] in ["ADDED", "MODIFIED"]: obj = event["object"] - status = obj.get("status", {}) - conditions = status.get("conditions", []) - - for condition in conditions: - if ( - condition.get("type") == "Triggered" - and condition.get("status") == "True" - and condition.get("reason") == "Complete" - ): - snapshot_uid = status.get("snapshotCreated", {}).get("name") - snapshot_timestamp = condition.get("lastTransitionTime") - logging.info( - f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}" - ) - w.stop() - return SnapshotResult( - snapshot_uid=snapshot_uid, - snapshot_timestamp=snapshot_timestamp, - ) + result = self._parse_snapshot_result(obj) + if result: + logger.info( + f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {result.snapshot_uid}" + ) + w.stop() + return result except Exception as e: - logging.error(f"Error watching snapshot: {e}") + logger.error(f"Error watching snapshot: {e}") raise raise TimeoutError( @@ -122,16 +143,36 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: def snapshot_controller_ready(self) -> bool: """ Checks if the snapshot agent pods are running in a GKE-managed pod snapshot cluster. + Falls back to checking CRD existence if pod listing is forbidden. """ if self.controller_ready: return True - core_v1_api = client.CoreV1Api() + def check_crd_installed() -> bool: + try: + # Check directly if the API resource exists using CustomObjectsApi + resource_list = self.custom_objects_api.get_api_resources( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + ) + + if not resource_list or not resource_list.resources: + return False + + for resource in resource_list.resources: + if resource.kind == PODSNAPSHOT_API_KIND: + return True + return False + except ApiException as e: + # If discovery fails with 403/404, we assume not ready/accessible + if e.status == 403 or e.status == 404: + return False + raise def check_namespace(namespace: str, required_components: list[str]) -> bool: try: - pods = core_v1_api.list_namespaced_pod(namespace) + pods = self.core_v1_api.list_namespaced_pod(namespace) found_components = { component: False for component in required_components } @@ -144,8 +185,15 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: found_components[component] = True return all(found_components.values()) - except ApiException: - return False + except ApiException as e: + if e.status == 403: + logger.info( + f"Permission denied listing pods in {namespace}. Checking CRD existence." + ) + return check_crd_installed() + if e.status == 404: + return False + raise # Check managed: requires only agent in gke-managed-pod-snapshots if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): @@ -168,6 +216,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason="Snapshot controller is not ready. Ensure it is installed and running.", error_code=1, ) @@ -175,6 +224,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason="Sandbox pod name not found. Ensure sandbox is created.", error_code=1, ) @@ -187,14 +237,21 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: } try: - self.custom_objects_api.create_namespaced_custom_object( + created_obj = self.custom_objects_api.create_namespaced_custom_object( group=PODSNAPSHOT_API_GROUP, version=PODSNAPSHOT_API_VERSION, namespace=self.namespace, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest, ) - snapshot_result = self._wait_for_snapshot_processed(trigger_name) + + # Start watching from the version we just created to avoid missing updates + resource_version = created_obj.get("metadata", {}).get("resourceVersion") + snapshot_result = self._wait_for_snapshot_processed( + trigger_name, resource_version + ) + + self.created_manual_triggers.append(trigger_name) return SnapshotResponse( success=True, @@ -204,7 +261,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=0, ) except ApiException as e: - logging.exception( + logger.exception( f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" ) return SnapshotResponse( @@ -215,7 +272,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=1, ) except TimeoutError as e: - logging.exception( + logger.exception( f"Snapshot creation timed out for trigger '{trigger_name}': {e}" ) return SnapshotResponse( @@ -226,43 +283,91 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=1, ) - def is_restored(self) -> tuple[bool, str | None]: + def is_restored_from_snapshot(self, snapshot_uid: str) -> RestoreResult: """ - Checks if the sandbox pod was restored from a snapshot. - Verifies this by checking for the 'PodRestored' condition in the pod status. + Checks if the sandbox pod was restored from the specified snapshot. + + This is verified by inspecting the 'PodRestored' condition in the pod status + and confirming that the condition's message contains the provided snapshot UID. + Returns: - tuple[bool, str | None]: (True, snapshot_uuid) if restored, (False, None) otherwise. + RestoreResult: The result of the restore operation. """ + if not snapshot_uid: + return RestoreResult( + success=False, + error_reason="Snapshot UID cannot be empty.", + error_code=1, + ) + if not self.pod_name: - logging.warning("Cannot check restore status: pod_name is unknown.") - return False, None + logger.warning("Cannot check restore status: pod_name is unknown.") + return RestoreResult( + success=False, + error_reason="Pod name not found. Ensure sandbox is created.", + error_code=1, + ) try: - v1 = client.CoreV1Api() - pod = v1.read_namespaced_pod(self.pod_name, self.namespace) + pod = self.core_v1_api.read_namespaced_pod(self.pod_name, self.namespace) if not pod.status or not pod.status.conditions: - return False, None + return RestoreResult( + success=False, + error_reason="Pod status or conditions not found.", + error_code=1, + ) for condition in pod.status.conditions: if condition.type == "PodRestored" and condition.status == "True": - # Attempt to extract UUID from the message - # Message format: "pod successfully restored from pod snapshot namespace/uuid" - if condition.message: - parts = condition.message.split("/") - if len(parts) > 1: - return True, parts[-1].split()[0] - - return True, None - - return False, None + # Check if Snapshot UUID is present in the condition.message + if condition.message and snapshot_uid in condition.message: + return RestoreResult( + success=True, + error_reason="", + error_code=0, + ) + else: + return RestoreResult( + success=False, + error_reason="Pod was not restored from the given snapshot", + error_code=1, + ) + + return RestoreResult( + success=False, + error_reason="Pod was not restored from any snapshot", + error_code=1, + ) except ApiException as e: - logging.error(f"Failed to check pod restore status: {e}") - return False, None + logger.error(f"Failed to check pod restore status: {e}") + return RestoreResult( + success=False, + error_reason=f"Failed to check pod restore status: {e}", + error_code=1, + ) def __exit__(self, exc_type, exc_val, exc_tb): """ + Cleans up the PodSnapshotManualTrigger Resources. Automatically cleans up the Sandbox. + + TODO: Add cleanup for PodSnapshot resources. """ + for trigger_name in self.created_manual_triggers: + try: + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name=trigger_name, + ) + logger.info(f"Deleted PodSnapshotManualTrigger '{trigger_name}'") + except ApiException as e: + logger.error( + f"Failed to delete PodSnapshotManualTrigger '{trigger_name}': {e}" + ) + super().__exit__(exc_type, exc_val, exc_tb) diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py index 028538f68..cc1ece767 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py @@ -17,10 +17,12 @@ import logging from unittest.mock import MagicMock, patch, call from datetime import datetime -from agentic_sandbox.gke_extensions.podsnapshot_client import ( +from k8s_agent_sandbox.gke_extensions.podsnapshot_client import ( PodSnapshotSandboxClient, ) -from agentic_sandbox.constants import * +from k8s_agent_sandbox.constants import * +from kubernetes.client import ApiException + from kubernetes import config logging.basicConfig( @@ -74,7 +76,7 @@ def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") with patch( - "agentic_sandbox.sandbox_client.SandboxClient.__init__", return_value=None + "k8s_agent_sandbox.sandbox_client.SandboxClient.__init__", return_value=None ) as mock_super: with patch.object( PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True @@ -85,12 +87,10 @@ def test_init(self): self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") - def test_snapshot_controller_ready_managed(self, mock_v1_class): + def test_snapshot_controller_ready_managed(self): """Test snapshot_controller_ready for managed scenario.""" logging.info("Starting test_snapshot_controller_ready_managed...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 + mock_v1 = self.client.core_v1_api # Mock pods in gke-managed-pod-snapshots mock_pod_agent = MagicMock() @@ -109,12 +109,10 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) logging.info("Finished test_snapshot_controller_ready_managed.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") - def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): - """Test snapshot_controller_ready when not ready.""" + def test_snapshot_controller_ready_status_not_ready(self): + """Test snapshot_controller_ready when not ready (pod missing).""" logging.info("Starting test_snapshot_controller_ready_status_not_ready...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 + mock_v1 = self.client.core_v1_api mock_pods = MagicMock() mock_pods.items = [] @@ -127,7 +125,61 @@ def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): self.assertFalse(self.client.controller_ready) logging.info("Finished test_snapshot_controller_ready_status_not_ready.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + def test_snapshot_controller_ready_forbidden_with_crd(self): + """Test fallback to CRD check when pod listing is forbidden.""" + logging.info("Starting test_snapshot_controller_ready_forbidden_with_crd...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=403) + + # Mock CustomObjectsApi.get_api_resources + mock_resource_list = MagicMock() + mock_resource = MagicMock() + mock_resource.kind = PODSNAPSHOT_API_KIND + mock_resource_list.resources = [mock_resource] + self.client.custom_objects_api.get_api_resources.return_value = ( + mock_resource_list + ) + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + self.client.custom_objects_api.get_api_resources.assert_called_with( + group=PODSNAPSHOT_API_GROUP, version=PODSNAPSHOT_API_VERSION + ) + logging.info("Finished test_snapshot_controller_ready_forbidden_with_crd.") + + def test_snapshot_controller_ready_forbidden_no_crd(self): + """Test fallback to CRD check fails when CRD is missing.""" + logging.info("Starting test_snapshot_controller_ready_forbidden_no_crd...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=403) + + # Mock CustomObjectsApi.get_api_resources returning empty + self.client.custom_objects_api.get_api_resources.return_value = None + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_forbidden_no_crd.") + + def test_snapshot_controller_ready_404(self): + """Test snapshot_controller_ready returns False on 404.""" + logging.info("Starting test_snapshot_controller_ready_404...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=404) + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_404.") + + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.watch.Watch") def test_snapshot_success(self, mock_watch_class): """Test successful snapshot creation.""" logging.info("Starting test_snapshot_success...") @@ -159,6 +211,12 @@ def test_snapshot_success(self, mock_watch_class): } mock_watch.stream.return_value = [mock_event] + # Mock create to return an object with resourceVersion + mock_created_obj = {"metadata": {"resourceVersion": "123"}, "status": {}} + self.client.custom_objects_api.create_namespaced_custom_object.return_value = ( + mock_created_obj + ) + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 0) @@ -167,6 +225,10 @@ def test_snapshot_success(self, mock_watch_class): # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() + # Verify watch was called with resource_version + mock_watch.stream.assert_called_once() + _, kwargs = mock_watch.stream.call_args + self.assertEqual(kwargs.get("resource_version"), "123") logging.info("Finished test_snapshot_success.") def test_snapshot_controller_not_ready(self): @@ -194,8 +256,27 @@ def test_snapshot_no_pod_name(self): self.assertIn("Sandbox pod name not found", result.error_reason) logging.info("Finished test_snapshot_no_pod_name.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") + def test_snapshot_creation_api_exception(self): + """Test snapshot handling of API exception during creation.""" + logging.info("Starting test_snapshot_creation_api_exception...") + self.client.pod_name = "test-pod" + self.client.controller_ready = True + + self.client.custom_objects_api.create_namespaced_custom_object.side_effect = ( + ApiException("Create failed") + ) + + result = self.client.snapshot("test-trigger") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Failed to create PodSnapshotManualTrigger", result.error_reason) + logging.info("Finished test_snapshot_creation_api_exception.") + + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + @patch( + "k8s_agent_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi" + ) def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): """Test snapshot timeout scenario.""" logging.info("Starting test_snapshot_timeout...") @@ -219,13 +300,135 @@ def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): self.assertIn("timed out", result.error_reason) logging.info("Finished test_snapshot_timeout.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") - def test_exit(self, mock_super_exit): - """Test __exit__ method.""" - logging.info("Starting test_exit...") + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") + def test_exit_cleanup(self, mock_super_exit): + """Test __exit__ cleans up created triggers.""" + logging.info("Starting test_exit_cleanup...") + self.client.created_manual_triggers = ["trigger-1", "trigger-2"] + self.client.__exit__(None, None, None) + + # Check deletion calls + self.assertEqual( + self.client.custom_objects_api.delete_namespaced_custom_object.call_count, 2 + ) + + calls = [ + call( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.client.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name="trigger-1", + ), + call( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.client.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name="trigger-2", + ), + ] + self.client.custom_objects_api.delete_namespaced_custom_object.assert_has_calls( + calls, any_order=True + ) + mock_super_exit.assert_called_once_with(None, None, None) - logging.info("Finished test_exit.") + logging.info("Finished test_exit_cleanup.") + + def test_is_restored_from_snapshot_success(self): + """Test is_restored_from_snapshot success case.""" + logging.info("Starting test_is_restored_from_snapshot_success...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_condition = MagicMock() + mock_condition.type = "PodRestored" + mock_condition.status = "True" + mock_condition.message = "Restored from snapshot-uid-123" + mock_pod.status.conditions = [mock_condition] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertTrue(result.success) + self.assertEqual(result.error_code, 0) + logging.info("Finished test_is_restored_from_snapshot_success.") + + def test_is_restored_from_snapshot_mismatch(self): + """Test is_restored_from_snapshot when UID matches another snapshot.""" + logging.info("Starting test_is_restored_from_snapshot_mismatch...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_condition = MagicMock() + mock_condition.type = "PodRestored" + mock_condition.status = "True" + mock_condition.message = "Restored from snapshot-uid-456" + mock_pod.status.conditions = [mock_condition] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("not restored from the given snapshot", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_mismatch.") + + def test_is_restored_from_snapshot_no_condition(self): + """Test is_restored_from_snapshot when PodRestored condition is missing.""" + logging.info("Starting test_is_restored_from_snapshot_no_condition...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_pod.status.conditions = [] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Pod status or conditions not found", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_no_condition.") + + def test_is_restored_from_snapshot_api_error(self): + """Test is_restored_from_snapshot API exception handling.""" + logging.info("Starting test_is_restored_from_snapshot_api_error...") + self.client.pod_name = "test-pod" + self.client.core_v1_api.read_namespaced_pod.side_effect = ApiException( + "API Error" + ) + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Failed to check pod restore status", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_api_error.") + + def test_is_restored_from_snapshot_no_pod_name(self): + """Test is_restored_from_snapshot when pod_name is missing.""" + logging.info("Starting test_is_restored_from_snapshot_no_pod_name...") + self.client.pod_name = None + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Pod name not found", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_no_pod_name.") + + def test_is_restored_from_snapshot_empty_uid(self): + """Test is_restored_from_snapshot with empty UID.""" + logging.info("Starting test_is_restored_from_snapshot_empty_uid...") + result = self.client.is_restored_from_snapshot("") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Snapshot UID cannot be empty", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_empty_uid.") if __name__ == "__main__": diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index fdc59e6fb..04afbd81e 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -18,7 +18,7 @@ import sys from kubernetes import client, config import re -from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient +from k8s_agent_sandbox.gke_extensions import PodSnapshotSandboxClient def test_snapshot_response(snapshot_response, snapshot_name): @@ -66,7 +66,6 @@ async def main( wait_time = 10 first_snapshot_name = "test-snapshot-10" second_snapshot_name = "test-snapshot-20" - core_v1_api = client.CoreV1Api() try: print("\n***** Phase 1: Starting Counter *****") @@ -108,12 +107,12 @@ async def main( print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - is_restored, restored_snapshot_uid = sandbox_restored.is_restored() - assert is_restored, "Pod was not restored from a snapshot." - assert ( - restored_snapshot_uid == recent_snapshot_uid - ), "Restored snapshot UID does not match the most recent snapshot UID." + restore_result = sandbox_restored.is_restored_from_snapshot( + recent_snapshot_uid + ) + assert restore_result.success, "Pod was not restored from a snapshot." print("Pod was restored from the most recent snapshot.") + print("--- Pod Snapshot Test Passed! ---") except Exception as e: From 18b8920bf9df22c8b40131eeedf40e34a91d0082 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 19 Feb 2026 23:09:34 +0000 Subject: [PATCH 12/12] Update list and delete methods --- .../gke_extensions/podsnapshot_client.py | 487 ------------------ .../gke_extensions/podsnapshot.md | 21 +- .../gke_extensions/podsnapshot_client.py | 302 ++++++++++- .../gke_extensions/test_podsnapshot_client.py | 69 ++- .../test_podsnapshot_extension.py | 10 +- 5 files changed, 392 insertions(+), 497 deletions(-) delete mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py deleted file mode 100644 index 77dc64132..000000000 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright 2026 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import sys -import os -from typing import Any -from pathlib import Path -import json -from dataclasses import dataclass -from kubernetes import client, watch -from kubernetes.client import ApiException -from ..sandbox_client import SandboxClient, ExecutionResult -from ..constants import * - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - stream=sys.stdout, -) - - -@dataclass -class SnapshotResult: - """Result of a snapshot processing operation.""" - - snapshot_uid: str - snapshot_timestamp: str - - -@dataclass -class SnapshotResponse: - """Structured response for snapshot operations.""" - - success: bool - trigger_name: str - error_reason: str - error_code: int - - -class SnapshotPersistenceManager: - """ - Manages local persistence of snapshot metadata in a secure directory. - Stores metadata as a dictionary keyed by trigger_name. - """ - - def __init__(self): - """Initializes the persistence manager and ensures the secure directory exists.""" - self.secure_dir = Path.home() / ".snapshot_metadata" - self._ensure_secure_dir() - self.metadata_file = self.secure_dir / ".snapshots.json" - - def _ensure_secure_dir(self): - """Ensures the directory exists with 700 permissions.""" - if not self.secure_dir.exists(): - self.secure_dir.mkdir(parents=True) - self.secure_dir.chmod(0o700) - - def _load_metadata(self) -> dict[str, Any]: - """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" - if not self.metadata_file.exists(): - return {} - try: - with open(self.metadata_file, "r") as f: - data = json.load(f) - if isinstance(data, list): - # Handle legacy list format by clearing it (or converting if preferred, but identifying key is tricky if not consistent) - logging.warning("Found legacy list-format metadata. Resetting to empty dict.") - return {} - return data - except (json.JSONDecodeError, IOError) as e: - logging.warning(f"Failed to load snapshot metadata: {e}") - return {} - - def save_snapshot_metadata(self, record: dict[str, Any]): - """Saves a snapshot record to the local registry.""" - trigger_name = record.get("trigger_name") - if not trigger_name: - logging.error("Cannot save metadata: missing 'trigger_name'.") - return - - snapshots = self._load_metadata() - snapshots[trigger_name] = record - - try: - with open(self.metadata_file, "w") as f: - json.dump(snapshots, f, indent=4) - self.metadata_file.chmod(0o600) - logging.info(f"Snapshot metadata saved to {self.metadata_file}") - except IOError as e: - logging.error(f"Failed to save snapshot metadata: {e}") - - def delete_snapshot_metadata(self, trigger_name: str): - """Deletes a snapshot record from the local registry.""" - snapshots = self._load_metadata() - if trigger_name in snapshots: - del snapshots[trigger_name] - try: - with open(self.metadata_file, "w") as f: - json.dump(snapshots, f, indent=4) - self.metadata_file.chmod(0o600) - logging.info(f"Snapshot metadata deleted for '{trigger_name}'") - except IOError as e: - logging.error(f"Failed to save metadata after deletion: {e}") - else: - logging.info(f"No local metadata found for trigger '{trigger_name}' to delete.") - - -class PodSnapshotSandboxClient(SandboxClient): - """ - A specialized Sandbox client for interacting with the gke pod snapshot controller. - Currently supports manual triggering via PodSnapshotManualTrigger. - """ - - def __init__( - self, - template_name: str, - podsnapshot_timeout: int = 180, - server_port: int = 8080, - **kwargs, - ): - super().__init__(template_name, server_port=server_port, **kwargs) - - self.controller_ready = False - self.podsnapshot_timeout = podsnapshot_timeout - self.persistence_manager = SnapshotPersistenceManager() - - def __enter__(self) -> "PodSnapshotSandboxClient": - self.controller_ready = self.snapshot_controller_ready() - super().__enter__() - return self - - def _get_policy_labels(self, snapshot_uid: str) -> dict: - """ - Retrieves the policy labels of the specified PodSnapshot resource. - """ - try: - snapshot = self.custom_objects_api.get_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOT_PLURAL, - name=snapshot_uid - ) - policy_name = snapshot.get("spec", {}).get("policyName", {}) - policy = self.custom_objects_api.get_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOTPOLICY_PLURAL, - name=policy_name - ) - return policy.get("spec", {}).get("selector", {}).get("matchLabels", {}) - except ApiException as e: - logging.error(f"Failed to retrieve PodSnapshot '{snapshot_uid}': {e}") - raise - - def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: - """ - Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. - """ - w = watch.Watch() - logging.info( - f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." - ) - - try: - for event in w.stream( - func=self.custom_objects_api.list_namespaced_custom_object, - namespace=self.namespace, - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - field_selector=f"metadata.name={trigger_name}", - timeout_seconds=self.podsnapshot_timeout, - ): - if event["type"] in ["ADDED", "MODIFIED"]: - obj = event["object"] - status = obj.get("status", {}) - conditions = status.get("conditions", []) - - for condition in conditions: - if ( - condition.get("type") == "Triggered" - and condition.get("status") == "True" - and condition.get("reason") == "Complete" - ): - snapshot_uid = status.get("snapshotCreated", {}).get("name") - snapshot_timestamp = condition.get("lastTransitionTime") - logging.info( - f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}" - ) - w.stop() - return SnapshotResult( - snapshot_uid=snapshot_uid, - snapshot_timestamp=snapshot_timestamp, - ) - except Exception as e: - logging.error(f"Error watching snapshot: {e}") - raise - - raise TimeoutError( - f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds." - ) - - def snapshot_controller_ready(self) -> bool: - """ - Checks if the snapshot agent pods are running in a GKE-managed pod snapshot cluster. - """ - - if self.controller_ready: - return True - - core_v1_api = client.CoreV1Api() - - def check_namespace(namespace: str, required_components: list[str]) -> bool: - try: - pods = core_v1_api.list_namespaced_pod(namespace) - found_components = { - component: False for component in required_components - } - - for pod in pods.items: - if pod.status.phase == "Running": - name = pod.metadata.name - for component in required_components: - if component in name: - found_components[component] = True - - return all(found_components.values()) - except ApiException: - return False - - # Check managed: requires only agent in gke-managed-pod-snapshots - if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): - self.controller_ready = True - return True - - self.controller_ready = False - return self.controller_ready - - def snapshot(self, trigger_name: str) -> SnapshotResponse: - """ - Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. - The trigger_name will be suffixed with the current datetime. - Returns: - tuple[ExecutionResult, str]: The result of the operation and the final trigger name (with suffix). - """ - trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" - - if not self.controller_ready: - return SnapshotResponse( - success=False, - trigger_name=trigger_name, - error_reason="Snapshot controller is not ready. Ensure it is installed and running.", - error_code=1, - ) - if not self.pod_name: - return SnapshotResponse( - success=False, - trigger_name=trigger_name, - error_reason="Sandbox pod name not found. Ensure sandbox is created.", - error_code=1, - ) - - manifest = { - "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", - "kind": f"{PODSNAPSHOT_API_KIND}", - "metadata": {"name": trigger_name, "namespace": self.namespace}, - "spec": {"targetPod": self.pod_name}, - } - - try: - self.custom_objects_api.create_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - body=manifest, - ) - snapshot_result = self._wait_for_snapshot_processed(trigger_name) - - policy_labels = self._get_policy_labels(snapshot_result.snapshot_uid) - # Save metadata locally - try: - record = { - "trigger_name": trigger_name, - "uid": snapshot_result.snapshot_uid, - "template_name": self.template_name, - "policy_labels": policy_labels, - "namespace": self.namespace, - "claim_name": self.claim_name, - "timestamp": snapshot_result.snapshot_timestamp - } - self.persistence_manager.save_snapshot_metadata(record) - except Exception as e: - logging.warning(f"Failed to save snapshot metadata locally: {e}") - - return SnapshotResponse( - success=True, trigger_name=trigger_name, error_reason="", error_code=0 - ) - except ApiException as e: - logging.exception( - f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" - ) - return SnapshotResponse( - success=False, - trigger_name=trigger_name, - error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", - error_code=1, - ) - except TimeoutError as e: - logging.exception( - f"Snapshot creation timed out for trigger '{trigger_name}': {e}" - ) - return SnapshotResponse( - success=False, - trigger_name=trigger_name, - error_reason=f"Snapshot creation timed out: {e}", - error_code=1, - ) - - def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: - """ - Checks for existing snapshots matching the label selector and optional policy name. - Returns a list of valid snapshots sorted by creation timestamp (newest first). - - policy_name: Filters snapshots by their spec.policyName. - ready_only: If True, filters out snapshots that are only in 'Ready' state. - """ - local_meta = self.persistence_manager._load_metadata() - if not local_meta: - logging.info("No local snapshot metadata found.") - return None - - valid_snapshots = [] - - for trigger_name, record in local_meta.items(): - uid = record.get("uid") - if not uid: - continue - - try: - # Fetch the actual PodSnapshot resource to verify status and details - snapshot = self.custom_objects_api.get_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOT_PLURAL, - name=uid - ) - except ApiException as e: - # If 404, it means it's in metadata but deleted from K8s. Skip it. - if e.status != 404: - logging.warning(f"Failed to fetch PodSnapshot '{uid}' (trigger: {trigger_name}): {e}") - continue - - spec = snapshot.get("spec", {}) - status = snapshot.get("status", {}) - conditions = status.get("conditions", []) - metadata = snapshot.get("metadata", {}) - - # Filter by policy_name if provided - if policy_name and spec.get("policyName") != policy_name: - continue - - # Check for Ready=True - is_ready = False - for cond in conditions: - if cond.get("type") == "Ready" and cond.get("status") == "True": - is_ready = True - break - - # Skip if only ready snapshots are requested - if ready_only and not is_ready: - continue - - valid_snapshots.append({ - "snapshot_id": metadata.get("name"), - "trigger_name": trigger_name, - "source_pod": metadata.get("labels", {}).get("podsnapshot.gke.io/pod-name", "Unknown"), - "uid": metadata.get("uid"), - "creationTimestamp": metadata.get("creationTimestamp", ""), - "status": "Ready" if is_ready else "NotReady", - "policy_name": spec.get("policyName") - }) - - if not valid_snapshots: - logging.info("No Ready snapshots found matching criteria in local metadata.") - return None - - # Sort snapshots by creation timestamp descending - valid_snapshots.sort(key=lambda x: x["creationTimestamp"], reverse=True) - logging.info(f"Found {len(valid_snapshots)} ready snapshots from local metadata.") - return valid_snapshots - - def delete_snapshots(self, trigger_name: str | None = None) -> int: - """ - Deletes snapshots matching the provided trigger name and the PSMT resources. - Returns the count of successfully deleted snapshots. - """ - local_meta = self.persistence_manager._load_metadata() - - triggers_to_delete = [] - - if trigger_name: - if trigger_name in local_meta: - triggers_to_delete.append(trigger_name) - else: - logging.warning(f"Trigger '{trigger_name}' not found in local metadata. No deletion performed.") - return 0 - else: - logging.info("No trigger_name provided. Deleting ALL snapshots found in local metadata.") - triggers_to_delete = list(local_meta.keys()) - - if not triggers_to_delete: - logging.info("No local snapshot metadata found to process.") - return 0 - - delete_count = 0 - for trig in triggers_to_delete: - record = local_meta.get(trig) - if not record: - continue - - uid = record.get("uid") - - # Delete PodSnapshot (if UID is known) - if uid: - try: - logging.info(f"Deleting PodSnapshot '{uid}' associated with trigger '{trig}'...") - self.custom_objects_api.delete_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOT_PLURAL, - name=uid - ) - logging.info(f"PodSnapshot '{uid}' deleted.") - except ApiException as e: - if e.status == 404: - logging.info(f"PodSnapshot '{uid}' not found in K8s (already deleted?).") - else: - logging.error(f"Failed to delete PodSnapshot '{uid}': {e}") - else: - logging.warning(f"No UID associated with trigger '{trig}' in metadata. Skipping PodSnapshot deletion.") - - # Delete PodSnapshotManualTrigger - try: - logging.info(f"Deleting PodSnapshotManualTrigger '{trig}'...") - self.custom_objects_api.delete_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - name=trig - ) - logging.info(f"PodSnapshotManualTrigger '{trig}' deleted.") - except ApiException as e: - if e.status == 404: - logging.info(f"PodSnapshotManualTrigger '{trig}' not found in K8s (already deleted?).") - else: - logging.error(f"Failed to delete PodSnapshotManualTrigger '{trig}': {e}") - - # Cleanup Local Metadata - self.persistence_manager.delete_snapshot_metadata(trig) - delete_count += 1 - - logging.info(f"Snapshot deletion process completed. Deleted {delete_count} snapshots.") - return delete_count - - def __exit__(self, exc_type, exc_val, exc_tb): - """ - Automatically cleans up the Sandbox. - """ - super().__exit__(exc_type, exc_val, exc_tb) diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md index 6521f809b..6b61ccd21 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md @@ -30,17 +30,30 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: * Lists valid snapshots found in the local metadata storage (`~/.snapshot_metadata/.snapshots.json`). * Filters by `policy_name` and `ready_only` status (default: True). - * Returns a list of dictionaries containing snapshot details (id, trigger_name, source_pod, uid, creationTimestamp, status, policy_name) sorted by creation timestamp (newest first). -* **`delete_snapshots(self, trigger_name: str | None = None) -> int`**: + * Returns a list of dictionaries containing snapshot details (id, source_pod, uid, creationTimestamp, status, policy_name) sorted by creation timestamp (newest first). +* **`delete_snapshots(self, snapshot_uid: str | None = None, policy_name: str | None = None) -> int`**: * Deletes snapshots and their corresponding `PodSnapshotManualTrigger` resources. - * If `trigger_name` is provided, deletes only that specific snapshot. - * If `trigger_name` is `None`, deletes **ALL** snapshots found in the local metadata. + * If `snapshot_uid` is provided, deletes that specific snapshot. + * If `policy_name` is provided, deletes all snapshots associated with that policy. + * If neither is provided, deletes **ALL** snapshots found in the local metadata. * Cleans up local metadata after successful deletion from K8s. * Returns the count of successfully deleted snapshots. * **`__exit__(self)`**: * Cleans up the `PodSnapshotManualTrigger` resources. * Cleans up the `SandboxClaim` resources. +### `SnapshotPersistenceManager` + +Manages local persistence of snapshot metadata in a secure directory. + +* **File Location**: `~/.snapshot_metadata/.snapshots.json` +* **Security**: Ensures the directory has `0o700` permissions and the file has `0o600` permissions. +* **Storage**: Metadata is stored as a JSON object keyed by `snapshot_uid`. +* **Functionality**: + * **`save_snapshot_metadata`**: Saves a snapshot record. + * **`delete_snapshot_metadata`**: Deletes a snapshot record by UID. + * **`_load_metadata`**: Loads and returns the metadata dictionary. + ## `test_podsnapshot_extension.py` This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the snapshot and restore functionality. diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py index 65eadf8ba..b6eb7037e 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py @@ -16,6 +16,8 @@ import sys import os from typing import Any +from pathlib import Path +import json from dataclasses import dataclass from kubernetes import client, watch from kubernetes.client import ApiException @@ -53,6 +55,84 @@ class RestoreResult: error_code: int +@dataclass +class PolicyMetadata: + policy_name: str + policy_labels: dict[str, str] + + +class SnapshotPersistenceManager: + """ + Manages local persistence of snapshot metadata in a secure directory. + Stores metadata as a dictionary keyed by trigger_name. + """ + + def __init__(self): + """Initializes the persistence manager and ensures the secure directory exists.""" + self.secure_dir = Path.home() / ".snapshot_metadata" + self._ensure_secure_dir() + self.metadata_file = self.secure_dir / ".snapshots.json" + + def _ensure_secure_dir(self): + """Ensures the directory exists with 700 permissions.""" + if not self.secure_dir.exists(): + self.secure_dir.mkdir(parents=True) + self.secure_dir.chmod(0o700) + + def _load_metadata(self) -> dict[str, Any]: + """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" + if not self.metadata_file.exists(): + return {} + try: + with open(self.metadata_file, "r") as f: + data = json.load(f) + if isinstance(data, list): + # Handle legacy list format by clearing it (or converting if preferred, but identifying key is tricky if not consistent) + logging.warning( + "Found legacy list-format metadata. Resetting to empty dict." + ) + return {} + return data + except (json.JSONDecodeError, IOError) as e: + logging.warning(f"Failed to load snapshot metadata: {e}") + return {} + + def save_snapshot_metadata(self, record: dict[str, Any]): + """Saves a snapshot record to the local registry keyed by snapshot_uid.""" + snapshot_uid = record.get("snapshot_uid") + if not snapshot_uid: + logging.error("Cannot save metadata: missing 'snapshot_uid'.") + return + + snapshots = self._load_metadata() + snapshots[snapshot_uid] = record + + try: + with open(self.metadata_file, "w") as f: + json.dump(snapshots, f, indent=4) + self.metadata_file.chmod(0o600) + logging.info(f"Snapshot metadata saved to {self.metadata_file}") + except IOError as e: + logging.error(f"Failed to save snapshot metadata: {e}") + + def delete_snapshot_metadata(self, snapshot_uid: str): + """Deletes a snapshot record from the local registry.""" + snapshots = self._load_metadata() + if snapshot_uid in snapshots: + del snapshots[snapshot_uid] + try: + with open(self.metadata_file, "w") as f: + json.dump(snapshots, f, indent=4) + self.metadata_file.chmod(0o600) + logging.info(f"Snapshot metadata deleted for '{snapshot_uid}'") + except IOError as e: + logging.error(f"Failed to save metadata after deletion: {e}") + else: + logging.info( + f"No local metadata found for snapshot '{snapshot_uid}' to delete." + ) + + class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. @@ -70,6 +150,7 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout + self.persistence_manager = SnapshotPersistenceManager() self.core_v1_api = client.CoreV1Api() self.created_manual_triggers = [] @@ -79,6 +160,41 @@ def __enter__(self) -> "PodSnapshotSandboxClient": super().__enter__() return self + def _get_policy_info(self, snapshot_uid: str) -> PolicyMetadata: + """ + Retrieves the policy name and labels of the specified PodSnapshot resource. + """ + try: + snapshot = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=snapshot_uid, + ) + policy_name = snapshot.get("spec", {}).get("policyName", "") + if not policy_name: + logging.warning(f"No policyName found for snapshot {snapshot_uid}") + return PolicyMetadata(policy_name="", policy_labels={}) + + policy = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTPOLICY_PLURAL, + name=policy_name, + ) + policy_metadata = PolicyMetadata( + policy_name=policy_name, + policy_labels=policy.get("spec", {}) + .get("selector", {}) + .get("matchLabels", {}), + ) + return policy_metadata + except ApiException as e: + logging.error(f"Failed to retrieve PodSnapshot '{snapshot_uid}': {e}") + raise + def _parse_snapshot_result(self, obj) -> SnapshotResult | None: """Parses the object to see if snapshot is complete.""" status = obj.get("status", {}) @@ -252,6 +368,21 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: ) self.created_manual_triggers.append(trigger_name) + policy_metadata = self._get_policy_info(snapshot_result.snapshot_uid) + # Save metadata locally + try: + record = { + "snapshot_uid": snapshot_result.snapshot_uid, + "template_name": self.template_name, + "policy_name": policy_metadata.policy_name, + "policy_labels": policy_metadata.policy_labels, + "namespace": self.namespace, + "claim_name": self.claim_name, + "timestamp": snapshot_result.snapshot_timestamp, + } + self.persistence_manager.save_snapshot_metadata(record) + except Exception as e: + logging.warning(f"Failed to save snapshot metadata locally: {e}") return SnapshotResponse( success=True, @@ -348,12 +479,179 @@ def is_restored_from_snapshot(self, snapshot_uid: str) -> RestoreResult: error_code=1, ) + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: + """ + Checks for existing snapshots matching the policy name. + Returns a list of valid snapshots sorted by creation timestamp (newest first). + + policy_name: Filters snapshots by their spec.policyName. + ready_only: If True, filters out snapshots that are only in 'Ready' state. + """ + local_meta = self.persistence_manager._load_metadata() + if not local_meta: + logging.info("No local snapshot metadata found.") + return None + + valid_snapshots = [] + + for snapshot_id, record in local_meta.items(): + uid = record.get("snapshot_uid") + if not uid: + # Fallback if key is not uid (legacy) or uid missing in record + uid = snapshot_id + + # Optimized Filtering: Check policy_name from metadata if available + meta_policy_name = record.get("policy_name") + if policy_name and meta_policy_name and meta_policy_name != policy_name: + continue + + try: + # Fetch the actual PodSnapshot resource to verify status and details + snapshot = self.custom_objects_api.get_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=uid, + ) + except ApiException as e: + # If 404, it means it's in metadata but deleted from K8s. Skip it. + if e.status != 404: + logging.warning(f"Failed to fetch PodSnapshot '{uid}': {e}") + else: + # delete from the local metadata + self.persistence_manager.delete_snapshot_metadata(uid) + continue + + spec = snapshot.get("spec", {}) + status = snapshot.get("status", {}) + conditions = status.get("conditions", []) + metadata = snapshot.get("metadata", {}) + + # Filter by policy_name if provided and not already checked + crd_policy_name = spec.get("policyName") + if policy_name and crd_policy_name != policy_name: + continue + + # Update metadata if policy_name was missing + if not meta_policy_name and crd_policy_name: + record["policy_name"] = crd_policy_name + self.persistence_manager.save_snapshot_metadata(record) + + # Check for Ready=True + is_ready = False + for cond in conditions: + if cond.get("type") == "Ready" and cond.get("status") == "True": + is_ready = True + break + + # Skip if only ready snapshots are requested + if ready_only and not is_ready: + continue + + valid_snapshots.append( + { + "snapshot_id": metadata.get("name"), + "source_pod": metadata.get("labels", {}).get( + "podsnapshot.gke.io/pod-name", "Unknown" + ), + "uid": metadata.get("uid"), + "creationTimestamp": metadata.get("creationTimestamp", ""), + "status": "Ready" if is_ready else "NotReady", + "policy_name": spec.get("policyName"), + } + ) + + if not valid_snapshots: + logging.info( + "No Ready snapshots found matching criteria in local metadata." + ) + return None + + # Sort snapshots by creation timestamp descending + valid_snapshots.sort(key=lambda x: x["creationTimestamp"], reverse=True) + logging.info( + f"Found {len(valid_snapshots)} ready snapshots from local metadata." + ) + return valid_snapshots + + def delete_snapshots( + self, snapshot_uid: str | None = None, policy_name: str | None = None + ) -> int: + """ + Deletes snapshots. + - If snapshot_uid is provided, deletes that specific snapshot. + - If policy_name is provided, deletes all snapshots matching that policy. + - If neither is provided, deletes ALL snapshots in local metadata. + Returns the count of successfully deleted snapshots. + """ + local_meta = self.persistence_manager._load_metadata() + + snapshots_to_delete = [] # List of uids + + if snapshot_uid: + if snapshot_uid in local_meta: + snapshots_to_delete.append(snapshot_uid) + else: + logging.warning( + f"Snapshot '{snapshot_uid}' not found in local metadata. Checking other filters or skipping." + ) + # We could try to delete from K8s even if not in metadata, but for now strict consistency + + if policy_name: + for uid, record in local_meta.items(): + if record.get("policy_name") == policy_name: + if uid not in snapshots_to_delete: + snapshots_to_delete.append(uid) + + if not snapshot_uid and not policy_name: + logging.info( + "No snapshot_uid or policy_name provided. Deleting ALL snapshots found in local metadata." + ) + snapshots_to_delete = list(local_meta.keys()) + + if not snapshots_to_delete: + logging.info("No snapshots found matching criteria to delete.") + return 0 + + delete_count = 0 + for uid in snapshots_to_delete: + record = local_meta.get(uid) + if not record: + continue + + # Delete PodSnapshot + try: + logging.info(f"Deleting PodSnapshot '{uid}'...") + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOT_PLURAL, + name=uid, + ) + logging.info(f"PodSnapshot '{uid}' deleted.") + except ApiException as e: + if e.status == 404: + logging.info( + f"PodSnapshot '{uid}' not found in K8s (already deleted?)." + ) + else: + logging.error(f"Failed to delete PodSnapshot '{uid}': {e}") + + # Cleanup Local Metadata + self.persistence_manager.delete_snapshot_metadata(uid) + delete_count += 1 + + logging.info( + f"Snapshot deletion process completed. Deleted {delete_count} snapshots." + ) + return delete_count + def __exit__(self, exc_type, exc_val, exc_tb): """ Cleans up the PodSnapshotManualTrigger Resources. Automatically cleans up the Sandbox. - - TODO: Add cleanup for PodSnapshot resources. """ for trigger_name in self.created_manual_triggers: try: diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py index cc1ece767..2e39f6219 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py @@ -19,6 +19,7 @@ from datetime import datetime from k8s_agent_sandbox.gke_extensions.podsnapshot_client import ( PodSnapshotSandboxClient, + PolicyMetadata, ) from k8s_agent_sandbox.constants import * from kubernetes.client import ApiException @@ -52,7 +53,10 @@ def load_kubernetes_config(): class TestPodSnapshotSandboxClient(unittest.TestCase): @patch("kubernetes.config") - def setUp(self, mock_config): + @patch( + "k8s_agent_sandbox.gke_extensions.podsnapshot_client.SnapshotPersistenceManager" + ) + def setUp(self, mock_persistence_cls, mock_config): logging.info("Setting up TestPodSnapshotSandboxClient...") # Mock kubernetes config loading mock_config.load_incluster_config.side_effect = config.ConfigException( @@ -60,12 +64,17 @@ def setUp(self, mock_config): ) mock_config.load_kube_config.return_value = None + self.mock_persistence_manager = mock_persistence_cls.return_value + # Create client without patching super, as it's tested separately with patch.object( PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True ): self.client = PodSnapshotSandboxClient("test-template") + # Verify persistence manager is mocked + self.client.persistence_manager = self.mock_persistence_manager + # Mock the kubernetes APIs on the client instance self.client.custom_objects_api = MagicMock() self.client.core_v1_api = MagicMock() @@ -217,7 +226,14 @@ def test_snapshot_success(self, mock_watch_class): mock_created_obj ) - result = self.client.snapshot("test-trigger") + # Mock _get_policy_info to return valid data + policy_metadata = PolicyMetadata( + policy_name="test-policy", policy_labels={"app": "foo"} + ) + with patch.object( + self.client, "_get_policy_info", return_value=policy_metadata + ): + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 0) self.assertTrue(result.success) @@ -229,6 +245,14 @@ def test_snapshot_success(self, mock_watch_class): mock_watch.stream.assert_called_once() _, kwargs = mock_watch.stream.call_args self.assertEqual(kwargs.get("resource_version"), "123") + + # Verify metadata saved + self.mock_persistence_manager.save_snapshot_metadata.assert_called_once() + args, _ = self.mock_persistence_manager.save_snapshot_metadata.call_args + record = args[0] + self.assertEqual(record["snapshot_uid"], "snapshot-uid") + self.assertEqual(record["policy_name"], "test-policy") + logging.info("Finished test_snapshot_success.") def test_snapshot_controller_not_ready(self): @@ -430,6 +454,47 @@ def test_is_restored_from_snapshot_empty_uid(self): self.assertIn("Snapshot UID cannot be empty", result.error_reason) logging.info("Finished test_is_restored_from_snapshot_empty_uid.") + def test_delete_snapshots_by_uid(self): + """Test delete_snapshots by specific UID.""" + logging.info("Starting test_delete_snapshots_by_uid...") + self.mock_persistence_manager._load_metadata.return_value = { + "uid-1": {"uid": "uid-1", "policy_name": "p1"}, + "uid-2": {"uid": "uid-2", "policy_name": "p2"}, + } + + count = self.client.delete_snapshots(snapshot_uid="uid-1") + + self.assertEqual(count, 1) + # Verify deletions + self.mock_persistence_manager.delete_snapshot_metadata.assert_called_with( + "uid-1" + ) + self.client.custom_objects_api.delete_namespaced_custom_object.assert_any_call( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.client.namespace, + plural=PODSNAPSHOT_PLURAL, + name="uid-1", + ) + logging.info("Finished test_delete_snapshots_by_uid.") + + def test_delete_snapshots_by_policy(self): + """Test delete_snapshots by policy name.""" + logging.info("Starting test_delete_snapshots_by_policy...") + self.mock_persistence_manager._load_metadata.return_value = { + "uid-1": {"uid": "uid-1", "policy_name": "p1"}, + "uid-2": {"uid": "uid-2", "policy_name": "p1"}, # Same policy + "uid-3": {"uid": "uid-3", "policy_name": "p2"}, + } + + count = self.client.delete_snapshots(policy_name="p1") + + self.assertEqual(count, 2) + # Verify deletions for both + self.mock_persistence_manager.delete_snapshot_metadata.assert_any_call("uid-1") + self.mock_persistence_manager.delete_snapshot_metadata.assert_any_call("uid-2") + logging.info("Finished test_delete_snapshots_by_policy.") + if __name__ == "__main__": unittest.main() diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 2e5b406cb..38a73a139 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -66,6 +66,7 @@ async def main( wait_time = 10 first_snapshot_name = "test-snapshot-10" second_snapshot_name = "test-snapshot-20" + policy_name = "example-psp-workload" # PodSnapshotPolicy defined in the cluster. try: print("\n***** Phase 1: Starting Counter *****") @@ -85,6 +86,7 @@ async def main( ) snapshot_response = sandbox.snapshot(first_snapshot_name) test_snapshot_response(snapshot_response, first_snapshot_name) + first_snapshot_uid = snapshot_response.snapshot_uid time.sleep(wait_time) @@ -96,10 +98,14 @@ async def main( recent_snapshot_uid = snapshot_response.snapshot_uid print(f"Recent snapshot UID: {recent_snapshot_uid}") - print("\n***** List all existing ready snapshots with the policy name. *****") + print( + "\n***** List all existing ready snapshots with the policy name. *****" + ) snapshots = sandbox.list_snapshots(policy_name=policy_name) for snap in snapshots: - print(f"Snapshot ID: {snap['snapshot_id']}, Triggered By: {snap['trigger_name']}, Source Pod: {snap['source_pod']}, Creation Time: {snap['creationTimestamp']}, Policy Name: {snap['policy_name']}") + print( + f"Snapshot ID: {snap['snapshot_id']}, Source Pod: {snap['source_pod']}, Creation Time: {snap['creationTimestamp']}, Policy Name: {snap['policy_name']}" + ) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient(