From b70d4ad2025c44984b03ddf44e0b8f4c6078949f Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Wed, 21 Jan 2026 05:21:40 +0000 Subject: [PATCH 01/10] PodsnapshotSandboxclient base methods --- .../agentic_sandbox/constants.py | 38 +++ .../agentic_sandbox/extensions/__init__.py | 2 + .../agentic_sandbox/extensions/podsnapshot.py | 219 ++++++++++++++++++ .../agentic_sandbox/podsnapshot.md | 70 ++++++ .../agentic_sandbox/sandbox_client.py | 15 +- .../test_podsnapshot_extension.py | 162 +++++++++++++ 6 files changed, 492 insertions(+), 14 deletions(-) create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/constants.py create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md create mode 100644 clients/python/agentic-sandbox-client/test_podsnapshot_extension.py diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py new file mode 100644 index 000000000..19b399724 --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -0,0 +1,38 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Constants for API Groups and Resources +GATEWAY_API_GROUP = "gateway.networking.k8s.io" +GATEWAY_API_VERSION = "v1" +GATEWAY_PLURAL = "gateways" + +CLAIM_API_GROUP = "extensions.agents.x-k8s.io" +CLAIM_API_VERSION = "v1alpha1" +CLAIM_PLURAL_NAME = "sandboxclaims" + +SANDBOX_API_GROUP = "agents.x-k8s.io" +SANDBOX_API_VERSION = "v1alpha1" +SANDBOX_PLURAL_NAME = "sandboxes" + +POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" + +PODSNAPSHOT_API_GROUP = "podsnapshot.gke.io" +PODSNAPSHOT_API_VERSION = "v1alpha1" +PODSNAPSHOT_PLURAL = "podsnapshots" +PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" + +SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" +SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" +SNAPSHOT_CONTROLLER_NAME = "pod-snapshot-controller" +SNAPSHOT_AGENT = "pod-snapshot-agent" \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 1a72134ef..0a5b29227 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .computer_use import ComputerUseSandbox +from .podsnapshot import PodSnapshotSandboxClient diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py new file mode 100644 index 000000000..401867f64 --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py @@ -0,0 +1,219 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys +from kubernetes import client, config, watch +from kubernetes.client import ApiException +from ..sandbox_client import SandboxClient, ExecutionResult +from ..constants import * + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + stream=sys.stdout, +) + + +class PodSnapshotSandboxClient(SandboxClient): + """ + A specialized Sandbox client for interacting with the snapshot controller. + Handles the case only when triggerConfig is type manual. + """ + + def __init__( + self, + template_name: str, + podsnapshot_timeout: int = 180, + server_port: int = 8080, + **kwargs, + ): + + self.controller_ready = False + self.podsnapshot_timeout = podsnapshot_timeout + self.created_snapshots = [] + self.controller_ready = self.snapshot_controller_ready() + + super().__init__( + template_name, server_port=server_port, **kwargs + ) + + def _wait_for_snapshot_processed(self, trigger_name: str): + """ + Waits for the PodSnapshotManualTrigger to be processed and a snapshot created. + """ + w = watch.Watch() + logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") + + try: + for event in w.stream( + func=self.custom_objects_api.list_namespaced_custom_object, + namespace=self.namespace, + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + field_selector=f"metadata.name={trigger_name}", + timeout_seconds=self.podsnapshot_timeout + ): + if event["type"] in ["ADDED", "MODIFIED"]: + obj = event["object"] + status = obj.get("status", {}) + conditions = status.get("conditions", []) + + for condition in conditions: + if ( + condition.get("type") == "Triggered" + and condition.get("status") == "True" + and condition.get("reason") == "Complete" + ): + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {status.get('snapshotCreated', {}).get('name')}") + w.stop() + return + except Exception as e: + logging.error(f"Error watching snapshot: {e}") + raise + + raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") + + def snapshot_controller_ready(self) -> bool: + """ + Checks if the snapshot controller and agent pods are running. + Checks both self-installed (gps-system) and GKE-managed pod snapshot systems. + """ + + if self.controller_ready: + return True + + v1 = client.CoreV1Api() + + def check_namespace(namespace: str, required_components: list[str]) -> bool: + try: + pods = v1.list_namespaced_pod(namespace) + found_components = {component: False for component in required_components} + + for pod in pods.items: + if pod.status.phase == "Running": + name = pod.metadata.name + for component in required_components: + if component in name: + found_components[component] = True + + return all(found_components.values()) + except ApiException: + return False + + # Check self-installed: requires both controller and agent in gps-system + if check_namespace(SNAPSHOT_NAMESPACE_SELF_INSTALLED, [SNAPSHOT_CONTROLLER_NAME, SNAPSHOT_AGENT]): + self.controller_ready = True + return True + + # Check managed: requires only agent in gke-managed-pod-snapshots + if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): + self.controller_ready = True + return True + + self.controller_ready = False + return self.controller_ready + + def checkpoint(self, trigger_name: str) -> ExecutionResult: + """ + Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. + """ + if not self.controller_ready: + return ExecutionResult( + stdout="", + stderr="Snapshot controller is not ready. Ensure it is installed and running.", + exit_code=1 + ) + if not self.pod_name: + return ExecutionResult( + stdout="", + stderr="Sandbox pod name not found. Ensure sandbox is created.", + exit_code=1 + ) + + manifest = { + "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", + "kind": "PodSnapshotManualTrigger", + "metadata": { + "name": trigger_name, + "namespace": self.namespace + }, + "spec": { + "targetPod": self.pod_name + } + } + + try: + self.custom_objects_api.create_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + body=manifest + ) + self.created_snapshots.append(trigger_name) + self._wait_for_snapshot_processed(trigger_name) + return ExecutionResult( + stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", + stderr="", + exit_code=0 + ) + except ApiException as e: + return ExecutionResult( + stdout="", + stderr=f"Failed to create PodSnapshotManualTrigger: {e}", + exit_code=1 + ) + except TimeoutError as e: + return ExecutionResult( + stdout="", + stderr=f"Snapshot creation timed out: {e}", + exit_code=1 + ) + + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: + """ + Checks for existing snapshots matching the label selector and optional policy name. + Returns a list of valid snapshots sorted by creation timestamp (newest first). + policy_name: Filters snapshots by their spec.policyName. + ready_only: If True, filters out snapshots that are only in 'Ready' state. + """ + pass + + def delete_snapshots(self, **filters) -> int: + """ + Deletes snapshots matching the provided filters. + Returns the count of successfully deleted snapshots. + """ + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Automatically cleans up the Sandbox and the PSMT Trigger Requests. + """ + for trigger_name in self.created_snapshots: + try: + logging.info(f"Cleaning up Trigger request: {trigger_name}") + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name=trigger_name + ) + except ApiException as e: + if e.status != 404: + logging.warning(f"Failed to cleanup trigger '{trigger_name}': {e}") + super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md new file mode 100644 index 000000000..452b21a2b --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md @@ -0,0 +1,70 @@ +# Agentic Sandbox Pod Snapshot Extension + +This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. + +## `podsnapshot.py` + +This file defines the `PodSnapshotSandboxClient` class, which extends the base `SandboxClient` to provide snapshot capabilities. + +### Key Features: + +* **`PodSnapshotSandboxClient(template_name: str, ...)`**: + * Initializes the client. + * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. +* **`snapshot_controller_ready(self) -> bool`**: + * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. +* **`checkpoint(self, trigger_name: str) -> ExecutionResult`**: + * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. + * Waits for the snapshot to be processed. + * The pod snapshot controller creates a `PodSnapshot` resource automatically. +* **`list_snapshots(self, policy_name: str, ready_only: bool) -> list`**: + * TBD +* **`delete_snapshots(self, **filters) -> int`**: + * TBD +* **Automatic Cleanup**: + * The `__exit__` method attempts to clean up triggers `PodSnapshotManualTrigger` created during the session, keeping the `PodSnapshot` resources alive after session exit. + +## `test_podsnapshot_extension.py` + +This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the checkpoint and restore functionality. + +### Test Phases: + +1. **Phase 1: Starting Counter & Checkpointing**: + * Starts a sandbox with a counter application. + * Takes a snapshot (`test-snapshot-10`) after ~10 seconds. + * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. +2. **Phase 2: Restoring from Recent Snapshot**: + * Restores a sandbox from the second snapshot. + * Verifies that the counter continues from where it left off (>= 20), proving the state was preserved. + +### Prerequisites + +1. **Python Virtual Environment**: + ```bash + python3 -m venv .venv + source .venv/bin/activate + ``` + +2. **Install Dependencies**: + ```bash + pip install kubernetes + pip install -e clients/python/agentic-sandbox-client/ + ``` + +3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in the standard cluster running inside gVisor(Userguide). The GCS bucket to store the pod snapshot states and respective permissions must be applied. +4. **CRDs**: `PodSnapshotStorageConfig`, `PodSnapshotPolicy` CRDs must be applied. `PodSnapshotPolicy` should specify the selector match labels. +5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. + +### Running Tests: + +To run the integration test, execute the script with the appropriate arguments: + +```bash +python3 clients/python/agentic-sandbox-client/test_podsnapshot_extension.py \ + --labels app=agent-sandbox-workload \ + --template-name python-counter-template \ + --namespace sandbox-test +``` + +Adjust the `--namespace`, `--template-name`, and `--labels` as needed for your environment. \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py index 6fc3260ff..e3aabeee8 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py @@ -36,20 +36,7 @@ initialize_tracer, TracerManager, trace_span, trace, OPENTELEMETRY_AVAILABLE ) -# Constants for API Groups and Resources -GATEWAY_API_GROUP = "gateway.networking.k8s.io" -GATEWAY_API_VERSION = "v1" -GATEWAY_PLURAL = "gateways" - -CLAIM_API_GROUP = "extensions.agents.x-k8s.io" -CLAIM_API_VERSION = "v1alpha1" -CLAIM_PLURAL_NAME = "sandboxclaims" - -SANDBOX_API_GROUP = "agents.x-k8s.io" -SANDBOX_API_VERSION = "v1alpha1" -SANDBOX_PLURAL_NAME = "sandboxes" - -POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" +from .constants import * logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py new file mode 100644 index 000000000..7781d8c74 --- /dev/null +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -0,0 +1,162 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +import time +from kubernetes import client, config +import re +from agentic_sandbox.extensions import PodSnapshotSandboxClient + +POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" + + +async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): + """ + Tests the Sandbox client by creating a sandbox, running a command, + and then cleaning up. + """ + + print( + f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---") + + # Load kube config + try: + config.load_incluster_config() + except config.ConfigException: + config.load_kube_config() + + wait_time = 10 + first_snapshot_name = "test-snapshot-10" + second_snapshot_name = "test-snapshot-20" + v1 = client.CoreV1Api() + + try: + print("\n***** Phase 1: Starting Counter *****") + + with PodSnapshotSandboxClient( + template_name=template_name, + namespace=namespace, + api_url=api_url, + server_port=server_port + ) as sandbox: + print("\n======= Testing Pod Snapshot Extension =======") + assert sandbox.controller_ready == True, "Sandbox controller is not ready." + + time.sleep(wait_time) + print(f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds...") + snapshot_result = sandbox.checkpoint(first_snapshot_name) + print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") + + assert snapshot_result.exit_code == 0 + + time.sleep(wait_time) + + print(f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds...") + snapshot_result = sandbox.checkpoint(second_snapshot_name) + print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") + + assert snapshot_result.exit_code == 0 + + + print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") + with PodSnapshotSandboxClient( + template_name=template_name, + namespace=namespace, + api_url=api_url, + server_port=server_port + ) as sandbox_restored: # restores from second_snapshot_name by default + + print("\nWaiting 5 seconds for restored pod to resume printing...") + time.sleep(5) + + # Fetch logs using the Kubernetes API + logs = v1.read_namespaced_pod_log( + name=sandbox_restored.pod_name, + namespace=sandbox_restored.namespace + ) + + # logs must not be empty + counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] + assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." + + # The first number printed by the restored pod must be >= 20. + # If it restarted, it would be 0 or 1. + first_count = counts[0] + print("this is the first_count:", first_count) + assert first_count >= wait_time * 2, ( + f"State Mismatch! Expected counter to start >= {wait_time*2}, " + f"but got {first_count}. The pod likely restarted from scratch." + ) + + print("--- Pod Snapshot Test Passed! ---") + + except Exception as e: + print(f"\n--- An error occurred during the test: {e} ---") + # The __exit__ method of the Sandbox class will handle cleanup. + finally: + print("\n--- Sandbox Client Test Finished ---") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test the Sandbox client.") + parser.add_argument( + "--template-name", + default="python-sandbox-template", + help="The name of the sandbox template to use for the test." + ) + + # Default is None to allow testing the Port-Forward fallback + parser.add_argument( + "--gateway-name", + default=None, + help="The name of the Gateway resource. If omitted, defaults to local port-forward mode." + ) + + parser.add_argument( + "--gateway-namespace", + default=None, + help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode." + ) + + parser.add_argument( + "--api-url", help="Direct URL to router (e.g. http://localhost:8080)", default=None) + parser.add_argument("--namespace", default="default", + help="Namespace to create sandbox in") + parser.add_argument("--server-port", type=int, default=8888, + help="Port the sandbox container listens on") + parser.add_argument("--labels", nargs="+", default=["app=sandbox-test"], + help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)") + + args = parser.parse_args() + + labels_dict = {} + for l in args.labels: + if "=" in l: + k, v = l.split("=", 1) + labels_dict[k] = v + else: + print(f"Warning: Ignoring invalid label format '{l}'. Use key=value.") + + asyncio.run(main( + template_name=args.template_name, + api_url=args.api_url, + namespace=args.namespace, + server_port=args.server_port, + labels=labels_dict + )) +# python3 test_podsnapshot_extension.py --labels app=agent-sandbox-workload --template-name python-counter-template --namespace sandbox-test From 2aa44f494fba1cb59cdc3edfa36824644132f52a Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 5 Feb 2026 16:26:08 +0000 Subject: [PATCH 02/10] Update pss client --- .../agentic_sandbox/extensions/__init__.py | 3 +- .../gke_extensions/__init__.py | 15 +++ .../{ => gke_extensions}/podsnapshot.md | 28 +++-- .../podsnapshot_client.py} | 102 +++++++++++------- .../python-counter-template.yaml | 29 +++++ .../test_podsnapshot_extension.py | 19 ++-- 6 files changed, 139 insertions(+), 57 deletions(-) create mode 100644 clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py rename clients/python/agentic-sandbox-client/agentic_sandbox/{ => gke_extensions}/podsnapshot.md (72%) rename clients/python/agentic-sandbox-client/agentic_sandbox/{extensions/podsnapshot.py => gke_extensions/podsnapshot_client.py} (75%) create mode 100644 clients/python/agentic-sandbox-client/python-counter-template.yaml diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 0a5b29227..9fbc1daf2 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .computer_use import ComputerUseSandbox -from .podsnapshot import PodSnapshotSandboxClient +from .computer_use import ComputerUseSandbox \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py new file mode 100644 index 000000000..9aad1555a --- /dev/null +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .podsnapshot_client import PodSnapshotSandboxClient \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md similarity index 72% rename from clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 452b21a2b..7597441ba 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -2,27 +2,37 @@ This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. -## `podsnapshot.py` +## `podsnapshot_client.py` -This file defines the `PodSnapshotSandboxClient` class, which extends the base `SandboxClient` to provide snapshot capabilities. +This file defines the `SnapshotPersistenceManager` and `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. + +### `SnapshotPersistenceManager` + +A utility class for managing local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by `trigger_name`. + +### `PodSnapshotSandboxClient` + +A specialized Sandbox client for interacting with the gke pod snapshot controller. ### Key Features: -* **`PodSnapshotSandboxClient(template_name: str, ...)`**: - * Initializes the client. +* **`PodSnapshotSandboxClient(template_name: str, podsnapshot_timeout: int = 180, server_port: int = 8080, ...)`**: + * Initializes the client with optional podsnapshot timeout and server port. * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. -* **`checkpoint(self, trigger_name: str) -> ExecutionResult`**: +* **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. + * The trigger_name is suffixed with the current datetime. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. -* **`list_snapshots(self, policy_name: str, ready_only: bool) -> list`**: - * TBD -* **`delete_snapshots(self, **filters) -> int`**: + * Returns a tuple of ExecutionResult and the final trigger name. +* **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: * TBD +* **`delete_snapshots(self, trigger_name: str) -> int`**: + * TBD * **Automatic Cleanup**: - * The `__exit__` method attempts to clean up triggers `PodSnapshotManualTrigger` created during the session, keeping the `PodSnapshot` resources alive after session exit. + * The `__exit__` method cleans up the `SandboxClaim` resources. ## `test_podsnapshot_extension.py` diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py similarity index 75% rename from clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 401867f64..2fce99b72 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/podsnapshot.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Kubernetes Authors. +# Copyright 2026 The Kubernetes Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ import logging import sys -from kubernetes import client, config, watch +from datetime import datetime +from typing import Any +from kubernetes import client, watch from kubernetes.client import ApiException from ..sandbox_client import SandboxClient, ExecutionResult from ..constants import * @@ -26,9 +28,35 @@ ) +class SnapshotPersistenceManager: + """ + Manages local persistence of snapshot metadata in a secure directory. + Stores metadata as a dictionary keyed by trigger_name. + """ + def __init__(self): + """Initializes the persistence manager and ensures the secure directory exists.""" + pass + + def _ensure_secure_dir(self): + """Ensures the directory exists with 700 permissions.""" + pass + + def _load_metadata(self) -> dict[str, Any]: + """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" + pass + + def save_snapshot_metadata(self, record: dict[str, Any]): + """Saves a snapshot record to the local registry.""" + pass + + def delete_snapshot_metadata(self, trigger_name: str): + """Deletes a snapshot record from the local registry.""" + pass + + class PodSnapshotSandboxClient(SandboxClient): """ - A specialized Sandbox client for interacting with the snapshot controller. + A specialized Sandbox client for interacting with the gke pod snapshot controller. Handles the case only when triggerConfig is type manual. """ @@ -39,23 +67,22 @@ def __init__( server_port: int = 8080, **kwargs, ): + super().__init__( + template_name, server_port=server_port, **kwargs + ) self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - self.created_snapshots = [] self.controller_ready = self.snapshot_controller_ready() - super().__init__( - template_name, server_port=server_port, **kwargs - ) - def _wait_for_snapshot_processed(self, trigger_name: str): + def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: """ - Waits for the PodSnapshotManualTrigger to be processed and a snapshot created. + Waits for the PodSnapshotManualTrigger to be processed and returns (snapshot_uid, timestamp). """ w = watch.Watch() logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") - + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -70,22 +97,25 @@ def _wait_for_snapshot_processed(self, trigger_name: str): obj = event["object"] status = obj.get("status", {}) conditions = status.get("conditions", []) - + for condition in conditions: if ( condition.get("type") == "Triggered" and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {status.get('snapshotCreated', {}).get('name')}") + uid = status.get('snapshotCreated', {}).get('name') + timestamp = condition.get('lastTransitionTime') + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {uid}") w.stop() - return + return uid, timestamp except Exception as e: logging.error(f"Error watching snapshot: {e}") raise raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") + def snapshot_controller_ready(self) -> bool: """ Checks if the snapshot controller and agent pods are running. @@ -126,22 +156,28 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> ExecutionResult: + + def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. + The trigger_name will be suffixed with the current datetime. + Returns: + tuple[ExecutionResult, str]: The result of the operation and the final trigger name (with suffix). """ + trigger_name = f"{trigger_name}-{datetime.now().strftime('%Y%m%d%H%M%S')}" + if not self.controller_ready: return ExecutionResult( stdout="", stderr="Snapshot controller is not ready. Ensure it is installed and running.", exit_code=1 - ) + ), trigger_name if not self.pod_name: return ExecutionResult( stdout="", stderr="Sandbox pod name not found. Ensure sandbox is created.", exit_code=1 - ) + ), trigger_name manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", @@ -163,26 +199,29 @@ def checkpoint(self, trigger_name: str) -> ExecutionResult: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest ) - self.created_snapshots.append(trigger_name) - self._wait_for_snapshot_processed(trigger_name) + snapshot_uid, timestamp = self._wait_for_snapshot_processed(trigger_name) + + # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager + return ExecutionResult( stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", stderr="", exit_code=0 - ) + ), trigger_name except ApiException as e: return ExecutionResult( stdout="", stderr=f"Failed to create PodSnapshotManualTrigger: {e}", exit_code=1 - ) + ), trigger_name except TimeoutError as e: return ExecutionResult( stdout="", stderr=f"Snapshot creation timed out: {e}", exit_code=1 - ) + ), trigger_name + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ Checks for existing snapshots matching the label selector and optional policy name. @@ -192,28 +231,17 @@ def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | No """ pass - def delete_snapshots(self, **filters) -> int: + + def delete_snapshots(self, trigger_name: str) -> int: """ - Deletes snapshots matching the provided filters. + Deletes snapshots matching the provided trigger name and the PSMT resources. Returns the count of successfully deleted snapshots. """ pass + def __exit__(self, exc_type, exc_val, exc_tb): """ - Automatically cleans up the Sandbox and the PSMT Trigger Requests. + Automatically cleans up the Sandbox. """ - for trigger_name in self.created_snapshots: - try: - logging.info(f"Cleaning up Trigger request: {trigger_name}") - self.custom_objects_api.delete_namespaced_custom_object( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace=self.namespace, - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - name=trigger_name - ) - except ApiException as e: - if e.status != 404: - logging.warning(f"Failed to cleanup trigger '{trigger_name}': {e}") super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/python-counter-template.yaml b/clients/python/agentic-sandbox-client/python-counter-template.yaml new file mode 100644 index 000000000..f4f288933 --- /dev/null +++ b/clients/python/agentic-sandbox-client/python-counter-template.yaml @@ -0,0 +1,29 @@ +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxTemplate +metadata: + name: python-counter-template + namespace: default + labels: + language: python +spec: + #enableDisruptionControl: true + #shutdownTime: "2025-12-31T23:59:59Z" + podTemplate: + metadata: + labels: + app: agent-sandbox-workload + spec: + serviceAccountName: default + runtimeClassName: gvisor + containers: + - name: my-container1 + image: python:3.10-slim + command: ["python3", "-c"] + args: + - | + import time + i = 0 + while True: + print(f"Count: {i}", flush=True) + i += 1 + time.sleep(1) diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 7781d8c74..d687386b2 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -15,9 +15,10 @@ import argparse import asyncio import time +import sys from kubernetes import client, config import re -from agentic_sandbox.extensions import PodSnapshotSandboxClient +from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" @@ -38,8 +39,8 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p config.load_kube_config() wait_time = 10 - first_snapshot_name = "test-snapshot-10" - second_snapshot_name = "test-snapshot-20" + first_checkpoint_name = "test-snapshot-10" + second_checkpoint_name = "test-snapshot-20" v1 = client.CoreV1Api() try: @@ -55,9 +56,9 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p assert sandbox.controller_ready == True, "Sandbox controller is not ready." time.sleep(wait_time) - print(f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds...") - snapshot_result = sandbox.checkpoint(first_snapshot_name) - print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") + snapshot_result, trigger_name = sandbox.checkpoint(first_checkpoint_name) + print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") @@ -65,9 +66,9 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p time.sleep(wait_time) - print(f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds...") - snapshot_result = sandbox.checkpoint(second_snapshot_name) - print(f"Trigger Snapshot Command Stdout: {snapshot_result.stdout.strip()}") + print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") + snapshot_result, trigger_name = sandbox.checkpoint(second_checkpoint_name) + print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") From 17b5a9f466fef255c664e064f159ce02af453e5f Mon Sep 17 00:00:00 2001 From: shrutiyam-glitch Date: Thu, 5 Feb 2026 08:28:33 -0800 Subject: [PATCH 03/10] Remove import statement for ComputerUseSandbox --- .../agentic_sandbox/extensions/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 9fbc1daf2..1002b1472 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .computer_use import ComputerUseSandbox \ No newline at end of file From 32cbcef1df2b1c9b0494efb097fe063f68023ae6 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 7 Feb 2026 00:32:50 +0000 Subject: [PATCH 04/10] Add unit tests and address comments --- .../agentic_sandbox/constants.py | 1 + .../gke_extensions/podsnapshot_client.py | 86 +++--- .../test_podsnapshot_extension.py | 57 ++-- .../test_podsnapshot_client.py | 253 ++++++++++++++++++ 4 files changed, 343 insertions(+), 54 deletions(-) rename clients/python/agentic-sandbox-client/{ => agentic_sandbox/gke_extensions}/test_podsnapshot_extension.py (70%) create mode 100644 clients/python/agentic-sandbox-client/test_podsnapshot_client.py diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py index 19b399724..8c51d4c31 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -31,6 +31,7 @@ PODSNAPSHOT_API_VERSION = "v1alpha1" PODSNAPSHOT_PLURAL = "podsnapshots" PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" +PODSNAPSHOT_API_KIND = "PodSnapshotManualTrigger" SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 2fce99b72..d76af3188 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -14,8 +14,9 @@ import logging import sys -from datetime import datetime +import os from typing import Any +from dataclasses import dataclass from kubernetes import client, watch from kubernetes.client import ApiException from ..sandbox_client import SandboxClient, ExecutionResult @@ -28,6 +29,18 @@ ) +@dataclass +class SnapshotResult: + """Result of a snapshot processing operation.""" + snapshot_uid: str + snapshot_timestamp: str + +@dataclass +class CheckpointResponse: + """Structured response for checkpoint operations.""" + execution_result: ExecutionResult + trigger_name: str + class SnapshotPersistenceManager: """ Manages local persistence of snapshot metadata in a secure directory. @@ -73,12 +86,15 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - self.controller_ready = self.snapshot_controller_ready() + def __enter__(self) -> 'PodSnapshotSandboxClient': + self.controller_ready = self.snapshot_controller_ready() + super().__enter__() + return self - def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: + def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: """ - Waits for the PodSnapshotManualTrigger to be processed and returns (snapshot_uid, timestamp). + Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") @@ -104,11 +120,11 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> tuple[str, str]: and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - uid = status.get('snapshotCreated', {}).get('name') - timestamp = condition.get('lastTransitionTime') - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {uid}") + snapshot_uid = status.get('snapshotCreated', {}).get('name') + snapshot_timestamp = condition.get('lastTransitionTime') + logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}") w.stop() - return uid, timestamp + return SnapshotResult(snapshot_uid=snapshot_uid, snapshot_timestamp=snapshot_timestamp) except Exception as e: logging.error(f"Error watching snapshot: {e}") raise @@ -125,11 +141,11 @@ def snapshot_controller_ready(self) -> bool: if self.controller_ready: return True - v1 = client.CoreV1Api() + core_v1_api = client.CoreV1Api() def check_namespace(namespace: str, required_components: list[str]) -> bool: try: - pods = v1.list_namespaced_pod(namespace) + pods = core_v1_api.list_namespaced_pod(namespace) found_components = {component: False for component in required_components} for pod in pods.items: @@ -157,14 +173,14 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: return self.controller_ready - def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: + def checkpoint(self, trigger_name: str) -> CheckpointResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. The trigger_name will be suffixed with the current datetime. Returns: tuple[ExecutionResult, str]: The result of the operation and the final trigger name (with suffix). """ - trigger_name = f"{trigger_name}-{datetime.now().strftime('%Y%m%d%H%M%S')}" + trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: return ExecutionResult( @@ -181,7 +197,7 @@ def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", - "kind": "PodSnapshotManualTrigger", + "kind": f"{PODSNAPSHOT_API_KIND}", "metadata": { "name": trigger_name, "namespace": self.namespace @@ -199,28 +215,38 @@ def checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest ) - snapshot_uid, timestamp = self._wait_for_snapshot_processed(trigger_name) + snapshot_result = self._wait_for_snapshot_processed(trigger_name) # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return ExecutionResult( - stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully.", - stderr="", - exit_code=0 - ), trigger_name + return CheckpointResponse( + execution_result=ExecutionResult( + stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully. Snapshot UID: {snapshot_result.snapshot_uid}", + stderr="", + exit_code=0 + ), + trigger_name=trigger_name + ) except ApiException as e: - return ExecutionResult( - stdout="", - stderr=f"Failed to create PodSnapshotManualTrigger: {e}", - exit_code=1 - ), trigger_name + logging.exception(f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}") + return CheckpointResponse( + execution_result=ExecutionResult( + stdout="", + stderr=f"Failed to create PodSnapshotManualTrigger: {e}", + exit_code=1 + ), + trigger_name=trigger_name + ) except TimeoutError as e: - return ExecutionResult( - stdout="", - stderr=f"Snapshot creation timed out: {e}", - exit_code=1 - ), trigger_name - + logging.exception(f"Snapshot creation timed out for trigger '{trigger_name}': {e}") + return CheckpointResponse( + execution_result=ExecutionResult( + stdout="", + stderr=f"Snapshot creation timed out: {e}", + exit_code=1 + ), + trigger_name=trigger_name + ) def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py similarity index 70% rename from clients/python/agentic-sandbox-client/test_podsnapshot_extension.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py index d687386b2..06e1665a7 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Kubernetes Authors. +# Copyright 2026 The Kubernetes Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,6 +22,21 @@ POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" +def test_checkpoint_respone(checkpoint_response, checkpoint_name): + assert hasattr(checkpoint_response, "execution_result"), "Checkpoint response missing 'execution_result' attribute" + assert hasattr(checkpoint_response, "trigger_name"), "Checkpoint response missing 'trigger_name' attribute" + + execution_result = checkpoint_response.execution_result + trigger_name = checkpoint_response.trigger_name + + print(f"Trigger Command Stdout: {execution_result.stdout.strip()}") + print(f"Trigger Command Stderr: {execution_result.stderr.strip()}") + print(f"Trigger Command Exit Code: {execution_result.exit_code}") + + assert trigger_name.startswith(checkpoint_name), f"Expected trigger name prefix '{checkpoint_name}', but got '{trigger_name}'" + assert execution_result.stderr == "", f"Expected no error when creating checkpoint '{checkpoint_name}', but got: {execution_result.stderr.strip()}" + assert execution_result.exit_code == 0 + async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): """ @@ -41,7 +56,7 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p wait_time = 10 first_checkpoint_name = "test-snapshot-10" second_checkpoint_name = "test-snapshot-20" - v1 = client.CoreV1Api() + core_v1_api = client.CoreV1Api() try: print("\n***** Phase 1: Starting Counter *****") @@ -57,22 +72,15 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p time.sleep(wait_time) print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") - snapshot_result, trigger_name = sandbox.checkpoint(first_checkpoint_name) - print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") - print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") - - assert snapshot_result.exit_code == 0 + checkpoint_response = sandbox.checkpoint(first_checkpoint_name) + test_checkpoint_respone(checkpoint_response, first_checkpoint_name) + time.sleep(wait_time) print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") - snapshot_result, trigger_name = sandbox.checkpoint(second_checkpoint_name) - print(f"Trigger Command Stdout: {snapshot_result.stdout.strip()}") - print(f"Trigger Command Stderr: {snapshot_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {snapshot_result.exit_code}") - - assert snapshot_result.exit_code == 0 + checkpoint_response = sandbox.checkpoint(second_checkpoint_name) + test_checkpoint_respone(checkpoint_response, second_checkpoint_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") @@ -81,28 +89,29 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p namespace=namespace, api_url=api_url, server_port=server_port - ) as sandbox_restored: # restores from second_snapshot_name by default + ) as sandbox_restored: # restores from second_checkpoint_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) # Fetch logs using the Kubernetes API - logs = v1.read_namespaced_pod_log( + logs = core_v1_api.read_namespaced_pod_log( name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace ) - # logs must not be empty + # Extract the sequence of 'Count:' values from the pod logs counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." - # The first number printed by the restored pod must be >= 20. - # If it restarted, it would be 0 or 1. - first_count = counts[0] - print("this is the first_count:", first_count) - assert first_count >= wait_time * 2, ( - f"State Mismatch! Expected counter to start >= {wait_time*2}, " - f"but got {first_count}. The pod likely restarted from scratch." + # Verify the counter resumed from the correct checkpoint state. + # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). + min_expected_count_at_restore = wait_time * 2 + first_count_after_restore = counts[0] + + assert first_count_after_restore >= min_expected_count_at_restore, ( + f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " + f"but got {first_count_after_restore}. The pod likely restarted from scratch." ) print("--- Pod Snapshot Test Passed! ---") diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/test_podsnapshot_client.py new file mode 100644 index 000000000..a8d1f567b --- /dev/null +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_client.py @@ -0,0 +1,253 @@ +# Copyright 2026 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import logging +from unittest.mock import MagicMock, patch, call +from datetime import datetime +from agentic_sandbox.gke_extensions.podsnapshot_client import PodSnapshotSandboxClient, SnapshotPersistenceManager +from agentic_sandbox.constants import * +from kubernetes import config + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def load_kubernetes_config(): + """Loads Kubernetes configuration, prioritizing kubeconfig and falling back to an environment variable.""" + try: + config.load_kube_config() + logging.info("Kubernetes config loaded from kubeconfig file.") + except config.ConfigException: + logging.info("Kubeconfig file not found, attempting to load from environment variable.") + try: + config.load_kube_config(config_file=os.getenv("KUBECONFIG_FILE")) + logging.info("Kubernetes config loaded from KUBECONFIG_FILE environment variable.") + except Exception as e: + logging.error(f"Could not load Kubernetes config: {e}", exc_info=True) + raise + + +class TestPodSnapshotSandboxClient(unittest.TestCase): + + def setUp(self): + logging.info("Setting up TestPodSnapshotSandboxClient...") + # Create client without patching super, as it's tested separately + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + self.client = PodSnapshotSandboxClient('test-template') + logging.info("Finished setting up TestPodSnapshotSandboxClient.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient') + def test_init(self, mock_super): + """Test initialization of PodSnapshotSandboxClient.""" + logging.info("Starting test_init...") + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + client = PodSnapshotSandboxClient('test-template') + mock_super.assert_called_once_with( + 'test-template', + podsnapshot_timeout=180, + server_port=8080 + ) + self.assertTrue(client.controller_ready) + self.assertEqual(client.podsnapshot_timeout, 180) + logging.info("Finished test_init.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_self_installed(self, mock_v1_class): + """Test snapshot_controller_ready for self-installed scenario.""" + logging.info("Starting test_snapshot_controller_ready_self_installed...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + # Mock pods in gps-system + mock_pod_controller = MagicMock() + mock_pod_controller.metadata.name = 'pod-snapshot-controller' + mock_pod_controller.status.phase = 'Running' + + mock_pod_agent = MagicMock() + mock_pod_agent.metadata.name = 'pod-snapshot-agent' + mock_pod_agent.status.phase = 'Running' + + mock_pods = MagicMock() + mock_pods.items = [mock_pod_controller, mock_pod_agent] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_SELF_INSTALLED) + logging.info("Finished test_snapshot_controller_ready_self_installed.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_managed(self, mock_v1_class): + """Test snapshot_controller_ready for managed scenario.""" + logging.info("Starting test_snapshot_controller_ready_managed...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + # Mock pods in gke-managed-pod-snapshots + mock_pod_agent = MagicMock() + mock_pod_agent.metadata.name = 'pod-snapshot-agent' + mock_pod_agent.status.phase = 'Running' + + mock_pods = MagicMock() + mock_pods.items = [mock_pod_agent] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) + logging.info("Finished test_snapshot_controller_ready_managed.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + def test_snapshot_controller_ready_not_ready(self, mock_v1_class): + """Test snapshot_controller_ready when not ready.""" + logging.info("Starting test_snapshot_controller_ready_not_ready...") + mock_v1 = MagicMock() + mock_v1_class.return_value = mock_v1 + + mock_pods = MagicMock() + mock_pods.items = [] + mock_v1.list_namespaced_pod.return_value = mock_pods + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_not_ready.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + def test_checkpoint_success(self, mock_custom_class, mock_watch_class): + """Test successful checkpoint creation.""" + logging.info("Starting test_checkpoint_success...") + mock_custom = MagicMock() + mock_custom_class.return_value = mock_custom + + mock_watch = MagicMock() + mock_watch_class.return_value = mock_watch + + self.client.pod_name = 'test-pod' + self.client.controller_ready = True + self.client.namespace = 'test-ns' + + # Mock the watch stream + mock_event = { + 'type': 'MODIFIED', + 'object': { + 'status': { + 'conditions': [{ + 'type': 'Triggered', + 'status': 'True', + 'reason': 'Complete', + 'lastTransitionTime': '2023-01-01T00:00:00Z' + }], + 'snapshotCreated': {'name': 'snapshot-uid'} + } + } + } + mock_watch.stream.return_value = [mock_event] + + trigger_name = f"test-trigger" + result = self.client.checkpoint(trigger_name) + + self.assertEqual(result.execution_result.exit_code, 0) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn(datetime.now().strftime('%Y%m%d'), result.trigger_name) + + # Verify create call + expected_manifest = { + 'apiVersion': f'{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}', + 'kind': f'{PODSNAPSHOT_API_KIND}', + 'metadata': { + 'name': trigger_name, + 'namespace': 'test-ns' + }, + 'spec': { + 'targetPod': 'test-pod' + } + } + mock_custom.create_namespaced_custom_object.assert_called_once_with( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace='test-ns', + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + body=expected_manifest + ) + logging.info("Finished test_checkpoint_success.") + + def test_checkpoint_controller_not_ready(self): + """Test checkpoint when controller is not ready.""" + logging.info("Starting test_checkpoint_controller_not_ready...") + self.client.controller_ready = False + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn('Snapshot controller is not ready', result.execution_result.stderr) + logging.info("Finished test_checkpoint_controller_not_ready.") + + def test_checkpoint_no_pod_name(self): + """Test checkpoint when pod name is not set.""" + logging.info("Starting test_checkpoint_no_pod_name...") + self.client.controller_ready = True + self.client.pod_name = None + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('test-trigger', result.trigger_name) + self.assertIn('Sandbox pod name not found', result.execution_result.stderr) + logging.info("Finished test_checkpoint_no_pod_name.") + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): + """Test checkpoint timeout scenario.""" + logging.info("Starting test_checkpoint_timeout...") + mock_custom = MagicMock() + mock_custom_class.return_value = mock_custom + + mock_watch = MagicMock() + mock_watch_class.return_value = mock_watch + + self.client.pod_name = 'test-pod' + self.client.controller_ready = True + self.client.podsnapshot_timeout = 1 + + # Mock empty stream (timeout) + mock_watch.stream.return_value = [] + + result = self.client.checkpoint('test-trigger') + + self.assertEqual(result.execution_result.exit_code, 1) + self.assertIn('timed out', result.execution_result.stderr) + logging.info("Finished test_checkpoint_timeout.") + + + @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__') + def test_exit(self, mock_super_exit): + """Test __exit__ method.""" + logging.info("Starting test_exit...") + self.client.__exit__(None, None, None) + mock_super_exit.assert_called_once_with(None, None, None) + logging.info("Finished test_exit.") + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From dde9dd27bbc450ac6bd09ed018fef03ea3dd90e1 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 7 Feb 2026 00:58:02 +0000 Subject: [PATCH 05/10] Update test --- .../test_podsnapshot_client.py | 67 ++++++++----------- .../test_podsnapshot_extension.py | 0 2 files changed, 28 insertions(+), 39 deletions(-) rename clients/python/agentic-sandbox-client/{ => agentic_sandbox/gke_extensions}/test_podsnapshot_client.py (84%) rename clients/python/agentic-sandbox-client/{agentic_sandbox/gke_extensions => }/test_podsnapshot_extension.py (100%) diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py similarity index 84% rename from clients/python/agentic-sandbox-client/test_podsnapshot_client.py rename to clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index a8d1f567b..ae0a16a31 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -40,25 +40,34 @@ def load_kubernetes_config(): class TestPodSnapshotSandboxClient(unittest.TestCase): - def setUp(self): + @patch('kubernetes.config') + def setUp(self, mock_config): logging.info("Setting up TestPodSnapshotSandboxClient...") + # Mock kubernetes config loading + mock_config.load_incluster_config.side_effect = config.ConfigException("Not in cluster") + mock_config.load_kube_config.return_value = None + # Create client without patching super, as it's tested separately with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): self.client = PodSnapshotSandboxClient('test-template') + + # Mock the kubernetes APIs on the client instance + self.client.custom_objects_api = MagicMock() + self.client.core_v1_api = MagicMock() + logging.info("Finished setting up TestPodSnapshotSandboxClient.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient') - def test_init(self, mock_super): + def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - client = PodSnapshotSandboxClient('test-template') - mock_super.assert_called_once_with( - 'test-template', - podsnapshot_timeout=180, - server_port=8080 - ) - self.assertTrue(client.controller_ready) + with patch('agentic_sandbox.sandbox_client.SandboxClient.__init__', return_value=None) as mock_super: + with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): + client = PodSnapshotSandboxClient('test-template') + mock_super.assert_called_once_with( + 'test-template', + server_port=8080 + ) + self.assertFalse(client.controller_ready) self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") @@ -133,13 +142,11 @@ def test_snapshot_controller_ready_not_ready(self, mock_v1_class): logging.info("Finished test_snapshot_controller_ready_not_ready.") @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') - def test_checkpoint_success(self, mock_custom_class, mock_watch_class): + def test_checkpoint_success(self, mock_watch_class): """Test successful checkpoint creation.""" logging.info("Starting test_checkpoint_success...") - mock_custom = MagicMock() - mock_custom_class.return_value = mock_custom - + + # Mock the watch mock_watch = MagicMock() mock_watch_class.return_value = mock_watch @@ -164,32 +171,14 @@ def test_checkpoint_success(self, mock_custom_class, mock_watch_class): } mock_watch.stream.return_value = [mock_event] - trigger_name = f"test-trigger" - result = self.client.checkpoint(trigger_name) + result = self.client.checkpoint('test-trigger') self.assertEqual(result.execution_result.exit_code, 0) self.assertIn('test-trigger', result.trigger_name) - self.assertIn(datetime.now().strftime('%Y%m%d'), result.trigger_name) - - # Verify create call - expected_manifest = { - 'apiVersion': f'{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}', - 'kind': f'{PODSNAPSHOT_API_KIND}', - 'metadata': { - 'name': trigger_name, - 'namespace': 'test-ns' - }, - 'spec': { - 'targetPod': 'test-pod' - } - } - mock_custom.create_namespaced_custom_object.assert_called_once_with( - group=PODSNAPSHOT_API_GROUP, - version=PODSNAPSHOT_API_VERSION, - namespace='test-ns', - plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - body=expected_manifest - ) + self.assertIn('snapshot-uid', result.execution_result.stdout) + + # Verify create call was made + self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() logging.info("Finished test_checkpoint_success.") def test_checkpoint_controller_not_ready(self): diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py similarity index 100% rename from clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_extension.py rename to clients/python/agentic-sandbox-client/test_podsnapshot_extension.py From 1a361437846f14000860c36f98eb210871d4c8c0 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 12 Feb 2026 17:15:42 +0000 Subject: [PATCH 06/10] Address comments --- .../agentic_sandbox/constants.py | 2 - .../agentic_sandbox/extensions/__init__.py | 1 + .../gke_extensions/podsnapshot.md | 14 +- .../gke_extensions/podsnapshot_client.py | 136 +++++++------- .../gke_extensions/test_podsnapshot_client.py | 169 +++++++++--------- .../python-counter-template.yaml | 4 +- .../test_podsnapshot_extension.py | 130 ++++++++------ 7 files changed, 238 insertions(+), 218 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py index 8c51d4c31..79959468a 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/constants.py @@ -33,7 +33,5 @@ PODSNAPSHOTMANUALTRIGGER_PLURAL = "podsnapshotmanualtriggers" PODSNAPSHOT_API_KIND = "PodSnapshotManualTrigger" -SNAPSHOT_NAMESPACE_SELF_INSTALLED = "gps-system" SNAPSHOT_NAMESPACE_MANAGED = "gke-managed-pod-snapshots" -SNAPSHOT_CONTROLLER_NAME = "pod-snapshot-controller" SNAPSHOT_AGENT = "pod-snapshot-agent" \ No newline at end of file diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py index 1002b1472..1a72134ef 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/extensions/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 7597441ba..9f3ff5efb 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -23,12 +23,12 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. * **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. - * The trigger_name is suffixed with the current datetime. + * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns a tuple of ExecutionResult and the final trigger name. + * Returns the CheckpointResponse object(success, error_code, error_reason, trigger_name). * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: - * TBD + * TBD * **`delete_snapshots(self, trigger_name: str) -> int`**: * TBD * **Automatic Cleanup**: @@ -62,9 +62,13 @@ This file, located in the parent directory (`clients/python/agentic-sandbox-clie pip install -e clients/python/agentic-sandbox-client/ ``` -3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in the standard cluster running inside gVisor(Userguide). The GCS bucket to store the pod snapshot states and respective permissions must be applied. +3. **Pod Snapshot Controller**: The Pod Snapshot controller must be installed in a **GKE standard cluster** running with **gVisor**. + * For detailed setup instructions, refer to the [GKE Pod Snapshots public documentation](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/pod-snapshots). + * Ensure a GCS bucket is configured to store the pod snapshot states and that the necessary IAM permissions are applied. + 4. **CRDs**: `PodSnapshotStorageConfig`, `PodSnapshotPolicy` CRDs must be applied. `PodSnapshotPolicy` should specify the selector match labels. -5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. + +5. **Sandbox Template**: A `SandboxTemplate` (e.g., `python-counter-template`) with runtime gVisor, appropriate KSA and label that matches that selector label in `PodSnapshotPolicy` must be available in the cluster. ### Running Tests: diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index d76af3188..912493cf2 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -32,20 +32,27 @@ @dataclass class SnapshotResult: """Result of a snapshot processing operation.""" + snapshot_uid: str snapshot_timestamp: str + @dataclass class CheckpointResponse: """Structured response for checkpoint operations.""" - execution_result: ExecutionResult + + success: bool trigger_name: str + error_reason: str + error_code: int + class SnapshotPersistenceManager: """ Manages local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by trigger_name. """ + def __init__(self): """Initializes the persistence manager and ensures the secure directory exists.""" pass @@ -61,7 +68,7 @@ def _load_metadata(self) -> dict[str, Any]: def save_snapshot_metadata(self, record: dict[str, Any]): """Saves a snapshot record to the local registry.""" pass - + def delete_snapshot_metadata(self, trigger_name: str): """Deletes a snapshot record from the local registry.""" pass @@ -70,7 +77,7 @@ def delete_snapshot_metadata(self, trigger_name: str): class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. - Handles the case only when triggerConfig is type manual. + Currently supports manual triggering via PodSnapshotManualTrigger. """ def __init__( @@ -80,14 +87,12 @@ def __init__( server_port: int = 8080, **kwargs, ): - super().__init__( - template_name, server_port=server_port, **kwargs - ) + super().__init__(template_name, server_port=server_port, **kwargs) self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout - def __enter__(self) -> 'PodSnapshotSandboxClient': + def __enter__(self) -> "PodSnapshotSandboxClient": self.controller_ready = self.snapshot_controller_ready() super().__enter__() return self @@ -97,8 +102,10 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() - logging.info(f"Waiting for snapshot manual trigger '{trigger_name}' to be processed...") - + logging.info( + f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." + ) + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -107,35 +114,40 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: version=PODSNAPSHOT_API_VERSION, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, field_selector=f"metadata.name={trigger_name}", - timeout_seconds=self.podsnapshot_timeout + timeout_seconds=self.podsnapshot_timeout, ): if event["type"] in ["ADDED", "MODIFIED"]: obj = event["object"] status = obj.get("status", {}) conditions = status.get("conditions", []) - + for condition in conditions: if ( condition.get("type") == "Triggered" and condition.get("status") == "True" and condition.get("reason") == "Complete" ): - snapshot_uid = status.get('snapshotCreated', {}).get('name') - snapshot_timestamp = condition.get('lastTransitionTime') - logging.info(f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}") + snapshot_uid = status.get("snapshotCreated", {}).get("name") + snapshot_timestamp = condition.get("lastTransitionTime") + logging.info( + f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}" + ) w.stop() - return SnapshotResult(snapshot_uid=snapshot_uid, snapshot_timestamp=snapshot_timestamp) + return SnapshotResult( + snapshot_uid=snapshot_uid, + snapshot_timestamp=snapshot_timestamp, + ) except Exception as e: logging.error(f"Error watching snapshot: {e}") raise - raise TimeoutError(f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds.") - + raise TimeoutError( + f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds." + ) def snapshot_controller_ready(self) -> bool: """ - Checks if the snapshot controller and agent pods are running. - Checks both self-installed (gps-system) and GKE-managed pod snapshot systems. + Checks if the snapshot agent pods are running in a GKE-managed pod snapshot cluster. """ if self.controller_ready: @@ -146,7 +158,9 @@ def snapshot_controller_ready(self) -> bool: def check_namespace(namespace: str, required_components: list[str]) -> bool: try: pods = core_v1_api.list_namespaced_pod(namespace) - found_components = {component: False for component in required_components} + found_components = { + component: False for component in required_components + } for pod in pods.items: if pod.status.phase == "Running": @@ -159,11 +173,6 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: except ApiException: return False - # Check self-installed: requires both controller and agent in gps-system - if check_namespace(SNAPSHOT_NAMESPACE_SELF_INSTALLED, [SNAPSHOT_CONTROLLER_NAME, SNAPSHOT_AGENT]): - self.controller_ready = True - return True - # Check managed: requires only agent in gke-managed-pod-snapshots if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): self.controller_ready = True @@ -172,7 +181,6 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> CheckpointResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. @@ -183,28 +191,25 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: - return ExecutionResult( - stdout="", - stderr="Snapshot controller is not ready. Ensure it is installed and running.", - exit_code=1 - ), trigger_name + return CheckpointResponse( + success=False, + trigger_name=trigger_name, + error_reason="Snapshot controller is not ready. Ensure it is installed and running.", + error_code=1, + ) if not self.pod_name: - return ExecutionResult( - stdout="", - stderr="Sandbox pod name not found. Ensure sandbox is created.", - exit_code=1 - ), trigger_name + return CheckpointResponse( + success=False, + trigger_name=trigger_name, + error_reason="Sandbox pod name not found. Ensure sandbox is created.", + error_code=1, + ) manifest = { "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", "kind": f"{PODSNAPSHOT_API_KIND}", - "metadata": { - "name": trigger_name, - "namespace": self.namespace - }, - "spec": { - "targetPod": self.pod_name - } + "metadata": {"name": trigger_name, "namespace": self.namespace}, + "spec": {"targetPod": self.pod_name}, } try: @@ -213,41 +218,36 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: version=PODSNAPSHOT_API_VERSION, namespace=self.namespace, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, - body=manifest + body=manifest, ) snapshot_result = self._wait_for_snapshot_processed(trigger_name) - + # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager return CheckpointResponse( - execution_result=ExecutionResult( - stdout=f"PodSnapshotManualTrigger '{trigger_name}' created successfully. Snapshot UID: {snapshot_result.snapshot_uid}", - stderr="", - exit_code=0 - ), - trigger_name=trigger_name + success=True, trigger_name=trigger_name, error_reason="", error_code=0 ) except ApiException as e: - logging.exception(f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}") + logging.exception( + f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" + ) return CheckpointResponse( - execution_result=ExecutionResult( - stdout="", - stderr=f"Failed to create PodSnapshotManualTrigger: {e}", - exit_code=1 - ), - trigger_name=trigger_name + success=False, + trigger_name=trigger_name, + error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", + error_code=1, ) except TimeoutError as e: - logging.exception(f"Snapshot creation timed out for trigger '{trigger_name}': {e}") + logging.exception( + f"Snapshot creation timed out for trigger '{trigger_name}': {e}" + ) return CheckpointResponse( - execution_result=ExecutionResult( - stdout="", - stderr=f"Snapshot creation timed out: {e}", - exit_code=1 - ), - trigger_name=trigger_name + success=False, + trigger_name=trigger_name, + error_reason=f"Snapshot creation timed out: {e}", + error_code=1, ) - + def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: """ Checks for existing snapshots matching the label selector and optional policy name. @@ -257,7 +257,6 @@ def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | No """ pass - def delete_snapshots(self, trigger_name: str) -> int: """ Deletes snapshots matching the provided trigger name and the PSMT resources. @@ -265,9 +264,8 @@ def delete_snapshots(self, trigger_name: str) -> int: """ pass - def __exit__(self, exc_type, exc_val, exc_tb): """ Automatically cleans up the Sandbox. """ - super().__exit__(exc_type, exc_val, exc_tb) \ No newline at end of file + super().__exit__(exc_type, exc_val, exc_tb) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index ae0a16a31..9c7e74ccd 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -17,11 +17,17 @@ import logging from unittest.mock import MagicMock, patch, call from datetime import datetime -from agentic_sandbox.gke_extensions.podsnapshot_client import PodSnapshotSandboxClient, SnapshotPersistenceManager +from agentic_sandbox.gke_extensions.podsnapshot_client import ( + PodSnapshotSandboxClient, + SnapshotPersistenceManager, +) from agentic_sandbox.constants import * from kubernetes import config -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + def load_kubernetes_config(): """Loads Kubernetes configuration, prioritizing kubeconfig and falling back to an environment variable.""" @@ -29,10 +35,14 @@ def load_kubernetes_config(): config.load_kube_config() logging.info("Kubernetes config loaded from kubeconfig file.") except config.ConfigException: - logging.info("Kubeconfig file not found, attempting to load from environment variable.") + logging.info( + "Kubeconfig file not found, attempting to load from environment variable." + ) try: config.load_kube_config(config_file=os.getenv("KUBECONFIG_FILE")) - logging.info("Kubernetes config loaded from KUBECONFIG_FILE environment variable.") + logging.info( + "Kubernetes config loaded from KUBECONFIG_FILE environment variable." + ) except Exception as e: logging.error(f"Could not load Kubernetes config: {e}", exc_info=True) raise @@ -40,66 +50,43 @@ def load_kubernetes_config(): class TestPodSnapshotSandboxClient(unittest.TestCase): - @patch('kubernetes.config') + @patch("kubernetes.config") def setUp(self, mock_config): logging.info("Setting up TestPodSnapshotSandboxClient...") # Mock kubernetes config loading - mock_config.load_incluster_config.side_effect = config.ConfigException("Not in cluster") + mock_config.load_incluster_config.side_effect = config.ConfigException( + "Not in cluster" + ) mock_config.load_kube_config.return_value = None - + # Create client without patching super, as it's tested separately - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - self.client = PodSnapshotSandboxClient('test-template') - + with patch.object( + PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True + ): + self.client = PodSnapshotSandboxClient("test-template") + # Mock the kubernetes APIs on the client instance self.client.custom_objects_api = MagicMock() self.client.core_v1_api = MagicMock() - + logging.info("Finished setting up TestPodSnapshotSandboxClient.") def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") - with patch('agentic_sandbox.sandbox_client.SandboxClient.__init__', return_value=None) as mock_super: - with patch.object(PodSnapshotSandboxClient, 'snapshot_controller_ready', return_value=True): - client = PodSnapshotSandboxClient('test-template') - mock_super.assert_called_once_with( - 'test-template', - server_port=8080 - ) + with patch( + "agentic_sandbox.sandbox_client.SandboxClient.__init__", return_value=None + ) as mock_super: + with patch.object( + PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True + ): + client = PodSnapshotSandboxClient("test-template") + mock_super.assert_called_once_with("test-template", server_port=8080) self.assertFalse(client.controller_ready) self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') - def test_snapshot_controller_ready_self_installed(self, mock_v1_class): - """Test snapshot_controller_ready for self-installed scenario.""" - logging.info("Starting test_snapshot_controller_ready_self_installed...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 - - # Mock pods in gps-system - mock_pod_controller = MagicMock() - mock_pod_controller.metadata.name = 'pod-snapshot-controller' - mock_pod_controller.status.phase = 'Running' - - mock_pod_agent = MagicMock() - mock_pod_agent.metadata.name = 'pod-snapshot-agent' - mock_pod_agent.status.phase = 'Running' - - mock_pods = MagicMock() - mock_pods.items = [mock_pod_controller, mock_pod_agent] - mock_v1.list_namespaced_pod.return_value = mock_pods - - self.client.controller_ready = False - result = self.client.snapshot_controller_ready() - - self.assertTrue(result) - self.assertTrue(self.client.controller_ready) - mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_SELF_INSTALLED) - logging.info("Finished test_snapshot_controller_ready_self_installed.") - - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") def test_snapshot_controller_ready_managed(self, mock_v1_class): """Test snapshot_controller_ready for managed scenario.""" logging.info("Starting test_snapshot_controller_ready_managed...") @@ -108,8 +95,8 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): # Mock pods in gke-managed-pod-snapshots mock_pod_agent = MagicMock() - mock_pod_agent.metadata.name = 'pod-snapshot-agent' - mock_pod_agent.status.phase = 'Running' + mock_pod_agent.metadata.name = "pod-snapshot-agent" + mock_pod_agent.status.phase = "Running" mock_pods = MagicMock() mock_pods.items = [mock_pod_agent] @@ -123,10 +110,10 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) logging.info("Finished test_snapshot_controller_ready_managed.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api') - def test_snapshot_controller_ready_not_ready(self, mock_v1_class): + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") + def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): """Test snapshot_controller_ready when not ready.""" - logging.info("Starting test_snapshot_controller_ready_not_ready...") + logging.info("Starting test_snapshot_controller_ready_status_not_ready...") mock_v1 = MagicMock() mock_v1_class.return_value = mock_v1 @@ -139,43 +126,45 @@ def test_snapshot_controller_ready_not_ready(self, mock_v1_class): self.assertFalse(result) self.assertFalse(self.client.controller_ready) - logging.info("Finished test_snapshot_controller_ready_not_ready.") + logging.info("Finished test_snapshot_controller_ready_status_not_ready.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") def test_checkpoint_success(self, mock_watch_class): """Test successful checkpoint creation.""" logging.info("Starting test_checkpoint_success...") - + # Mock the watch mock_watch = MagicMock() mock_watch_class.return_value = mock_watch - self.client.pod_name = 'test-pod' + self.client.pod_name = "test-pod" self.client.controller_ready = True - self.client.namespace = 'test-ns' + self.client.namespace = "test-ns" # Mock the watch stream mock_event = { - 'type': 'MODIFIED', - 'object': { - 'status': { - 'conditions': [{ - 'type': 'Triggered', - 'status': 'True', - 'reason': 'Complete', - 'lastTransitionTime': '2023-01-01T00:00:00Z' - }], - 'snapshotCreated': {'name': 'snapshot-uid'} + "type": "MODIFIED", + "object": { + "status": { + "conditions": [ + { + "type": "Triggered", + "status": "True", + "reason": "Complete", + "lastTransitionTime": "2023-01-01T00:00:00Z", + } + ], + "snapshotCreated": {"name": "snapshot-uid"}, } - } + }, } mock_watch.stream.return_value = [mock_event] - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 0) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('snapshot-uid', result.execution_result.stdout) + self.assertEqual(result.error_code, 0) + self.assertTrue(result.success) + self.assertIn("test-trigger", result.trigger_name) # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() @@ -185,11 +174,12 @@ def test_checkpoint_controller_not_ready(self): """Test checkpoint when controller is not ready.""" logging.info("Starting test_checkpoint_controller_not_ready...") self.client.controller_ready = False - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('Snapshot controller is not ready', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("test-trigger", result.trigger_name) + self.assertIn("Snapshot controller is not ready", result.error_reason) logging.info("Finished test_checkpoint_controller_not_ready.") def test_checkpoint_no_pod_name(self): @@ -197,15 +187,16 @@ def test_checkpoint_no_pod_name(self): logging.info("Starting test_checkpoint_no_pod_name...") self.client.controller_ready = True self.client.pod_name = None - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('test-trigger', result.trigger_name) - self.assertIn('Sandbox pod name not found', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("test-trigger", result.trigger_name) + self.assertIn("Sandbox pod name not found", result.error_reason) logging.info("Finished test_checkpoint_no_pod_name.") - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch') - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): """Test checkpoint timeout scenario.""" logging.info("Starting test_checkpoint_timeout...") @@ -215,21 +206,21 @@ def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): mock_watch = MagicMock() mock_watch_class.return_value = mock_watch - self.client.pod_name = 'test-pod' + self.client.pod_name = "test-pod" self.client.controller_ready = True self.client.podsnapshot_timeout = 1 # Mock empty stream (timeout) mock_watch.stream.return_value = [] - result = self.client.checkpoint('test-trigger') + result = self.client.checkpoint("test-trigger") - self.assertEqual(result.execution_result.exit_code, 1) - self.assertIn('timed out', result.execution_result.stderr) + self.assertEqual(result.error_code, 1) + self.assertFalse(result.success) + self.assertIn("timed out", result.error_reason) logging.info("Finished test_checkpoint_timeout.") - - @patch('agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__') + @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") def test_exit(self, mock_super_exit): """Test __exit__ method.""" logging.info("Starting test_exit...") @@ -239,4 +230,4 @@ def test_exit(self, mock_super_exit): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/clients/python/agentic-sandbox-client/python-counter-template.yaml b/clients/python/agentic-sandbox-client/python-counter-template.yaml index f4f288933..19407c1b2 100644 --- a/clients/python/agentic-sandbox-client/python-counter-template.yaml +++ b/clients/python/agentic-sandbox-client/python-counter-template.yaml @@ -2,7 +2,7 @@ apiVersion: extensions.agents.x-k8s.io/v1alpha1 kind: SandboxTemplate metadata: name: python-counter-template - namespace: default + namespace: sandbox-test labels: language: python spec: @@ -13,7 +13,7 @@ spec: labels: app: agent-sandbox-workload spec: - serviceAccountName: default + serviceAccountName: sandbox-test runtimeClassName: gvisor containers: - name: my-container1 diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 06e1665a7..45b817745 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -20,32 +20,41 @@ import re from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient -POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name" -def test_checkpoint_respone(checkpoint_response, checkpoint_name): - assert hasattr(checkpoint_response, "execution_result"), "Checkpoint response missing 'execution_result' attribute" - assert hasattr(checkpoint_response, "trigger_name"), "Checkpoint response missing 'trigger_name' attribute" - - execution_result = checkpoint_response.execution_result - trigger_name = checkpoint_response.trigger_name - - print(f"Trigger Command Stdout: {execution_result.stdout.strip()}") - print(f"Trigger Command Stderr: {execution_result.stderr.strip()}") - print(f"Trigger Command Exit Code: {execution_result.exit_code}") - - assert trigger_name.startswith(checkpoint_name), f"Expected trigger name prefix '{checkpoint_name}', but got '{trigger_name}'" - assert execution_result.stderr == "", f"Expected no error when creating checkpoint '{checkpoint_name}', but got: {execution_result.stderr.strip()}" - assert execution_result.exit_code == 0 - - -async def main(template_name: str, api_url: str | None, namespace: str, server_port: int, labels: dict[str, str]): +def test_checkpoint_response(checkpoint_response, checkpoint_name): + assert hasattr( + checkpoint_response, "trigger_name" + ), "Checkpoint response missing 'trigger_name' attribute" + + print(f"Trigger Name: {checkpoint_response.trigger_name}") + print(f"Success: {checkpoint_response.success}") + print(f"Error Code: {checkpoint_response.error_code}") + print(f"Error Reason: {checkpoint_response.error_reason}") + + assert checkpoint_response.trigger_name.startswith( + checkpoint_name + ), f"Expected trigger name prefix '{checkpoint_name}', but got '{checkpoint_response.trigger_name}'" + assert ( + checkpoint_response.success + ), f"Expected success=True, but got False. Reason: {checkpoint_response.error_reason}" + assert checkpoint_response.error_code == 0 + + +async def main( + template_name: str, + api_url: str | None, + namespace: str, + server_port: int, + labels: dict[str, str], +): """ Tests the Sandbox client by creating a sandbox, running a command, and then cleaning up. """ print( - f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---") + f"--- Starting Sandbox Client Test (Namespace: {namespace}, Port: {server_port}) ---" + ) # Load kube config try: @@ -65,50 +74,55 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p template_name=template_name, namespace=namespace, api_url=api_url, - server_port=server_port + server_port=server_port, ) as sandbox: print("\n======= Testing Pod Snapshot Extension =======") assert sandbox.controller_ready == True, "Sandbox controller is not ready." time.sleep(wait_time) - print(f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds...") + print( + f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds..." + ) checkpoint_response = sandbox.checkpoint(first_checkpoint_name) - test_checkpoint_respone(checkpoint_response, first_checkpoint_name) - + test_checkpoint_response(checkpoint_response, first_checkpoint_name) time.sleep(wait_time) - print(f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds...") + print( + f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds..." + ) checkpoint_response = sandbox.checkpoint(second_checkpoint_name) - test_checkpoint_respone(checkpoint_response, second_checkpoint_name) - + test_checkpoint_response(checkpoint_response, second_checkpoint_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( template_name=template_name, namespace=namespace, api_url=api_url, - server_port=server_port - ) as sandbox_restored: # restores from second_checkpoint_name by default + server_port=server_port, + ) as sandbox_restored: # restores from second_checkpoint_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - # Fetch logs using the Kubernetes API + # Fetch logs using the Kubernetes API logs = core_v1_api.read_namespaced_pod_log( - name=sandbox_restored.pod_name, - namespace=sandbox_restored.namespace + name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace ) # Extract the sequence of 'Count:' values from the pod logs counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] - assert len(counts) > 0, "Failed to retrieve any 'Count:' logs from restored pod." + assert ( + len(counts) > 0 + ), "Failed to retrieve any 'Count:' logs from restored pod." # Verify the counter resumed from the correct checkpoint state. # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). - min_expected_count_at_restore = wait_time * 2 + min_expected_count_at_restore = wait_time * 2 first_count_after_restore = counts[0] + print(f"First count after restore: {first_count_after_restore}") + assert first_count_after_restore >= min_expected_count_at_restore, ( f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " f"but got {first_count_after_restore}. The pod likely restarted from scratch." @@ -122,35 +136,48 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p finally: print("\n--- Sandbox Client Test Finished ---") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test the Sandbox client.") parser.add_argument( "--template-name", default="python-sandbox-template", - help="The name of the sandbox template to use for the test." + help="The name of the sandbox template to use for the test.", ) # Default is None to allow testing the Port-Forward fallback parser.add_argument( "--gateway-name", default=None, - help="The name of the Gateway resource. If omitted, defaults to local port-forward mode." + help="The name of the Gateway resource. If omitted, defaults to local port-forward mode.", ) parser.add_argument( "--gateway-namespace", default=None, - help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode." + help="The namespace of the Gateway resource. If omitted, defaults to local port-forward mode.", ) parser.add_argument( - "--api-url", help="Direct URL to router (e.g. http://localhost:8080)", default=None) - parser.add_argument("--namespace", default="default", - help="Namespace to create sandbox in") - parser.add_argument("--server-port", type=int, default=8888, - help="Port the sandbox container listens on") - parser.add_argument("--labels", nargs="+", default=["app=sandbox-test"], - help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)") + "--api-url", + help="Direct URL to router (e.g. http://localhost:8080)", + default=None, + ) + parser.add_argument( + "--namespace", default="default", help="Namespace to create sandbox in" + ) + parser.add_argument( + "--server-port", + type=int, + default=8888, + help="Port the sandbox container listens on", + ) + parser.add_argument( + "--labels", + nargs="+", + default=["app=sandbox-test"], + help="Labels for the sandbox pod/claim in key=value format (e.g. app=sandbox-test env=dev)", + ) args = parser.parse_args() @@ -162,11 +189,12 @@ async def main(template_name: str, api_url: str | None, namespace: str, server_p else: print(f"Warning: Ignoring invalid label format '{l}'. Use key=value.") - asyncio.run(main( - template_name=args.template_name, - api_url=args.api_url, - namespace=args.namespace, - server_port=args.server_port, - labels=labels_dict - )) -# python3 test_podsnapshot_extension.py --labels app=agent-sandbox-workload --template-name python-counter-template --namespace sandbox-test + asyncio.run( + main( + template_name=args.template_name, + api_url=args.api_url, + namespace=args.namespace, + server_port=args.server_port, + labels=labels_dict, + ) + ) From 1ba1f2ccdc98df22258800db49274e2e811b0730 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 12 Feb 2026 20:37:29 +0000 Subject: [PATCH 07/10] Change in term: Update Checkpoint to Snapshot --- .../gke_extensions/podsnapshot.md | 10 ++-- .../gke_extensions/podsnapshot_client.py | 16 +++---- .../gke_extensions/test_podsnapshot_client.py | 40 ++++++++-------- .../test_podsnapshot_extension.py | 46 +++++++++---------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index 9f3ff5efb..bf5dfd57c 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -1,6 +1,6 @@ # Agentic Sandbox Pod Snapshot Extension -This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger checkpoints (snapshots) of a running sandbox and restore a new sandbox from the recently created snapshot. +This directory contains the Python client extension for interacting with the Agentic Sandbox to manage Pod Snapshots. This extension allows you to trigger snapshots of a running sandbox and restore a new sandbox from the recently created snapshot. ## `podsnapshot_client.py` @@ -21,12 +21,12 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. -* **`checkpoint(self, trigger_name: str) -> tuple[ExecutionResult, str]`**: +* **`snapshot(self, trigger_name: str) -> SnapshotResponse`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns the CheckpointResponse object(success, error_code, error_reason, trigger_name). + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). * **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: * TBD * **`delete_snapshots(self, trigger_name: str) -> int`**: @@ -36,11 +36,11 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle ## `test_podsnapshot_extension.py` -This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the checkpoint and restore functionality. +This file, located in the parent directory (`clients/python/agentic-sandbox-client/`), contains an integration test script for the `PodSnapshotSandboxClient` extension. It verifies the snapshot and restore functionality. ### Test Phases: -1. **Phase 1: Starting Counter & Checkpointing**: +1. **Phase 1: Starting Counter & Snapshotting**: * Starts a sandbox with a counter application. * Takes a snapshot (`test-snapshot-10`) after ~10 seconds. * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 912493cf2..6d7db73f8 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -38,8 +38,8 @@ class SnapshotResult: @dataclass -class CheckpointResponse: - """Structured response for checkpoint operations.""" +class SnapshotResponse: + """Structured response for snapshot operations.""" success: bool trigger_name: str @@ -181,7 +181,7 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: self.controller_ready = False return self.controller_ready - def checkpoint(self, trigger_name: str) -> CheckpointResponse: + def snapshot(self, trigger_name: str) -> SnapshotResponse: """ Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. The trigger_name will be suffixed with the current datetime. @@ -191,14 +191,14 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: trigger_name = f"{trigger_name}-{os.urandom(4).hex()}" if not self.controller_ready: - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason="Snapshot controller is not ready. Ensure it is installed and running.", error_code=1, ) if not self.pod_name: - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason="Sandbox pod name not found. Ensure sandbox is created.", @@ -224,14 +224,14 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return CheckpointResponse( + return SnapshotResponse( success=True, trigger_name=trigger_name, error_reason="", error_code=0 ) except ApiException as e: logging.exception( f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" ) - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", @@ -241,7 +241,7 @@ def checkpoint(self, trigger_name: str) -> CheckpointResponse: logging.exception( f"Snapshot creation timed out for trigger '{trigger_name}': {e}" ) - return CheckpointResponse( + return SnapshotResponse( success=False, trigger_name=trigger_name, error_reason=f"Snapshot creation timed out: {e}", diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index 9c7e74ccd..4660ee080 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -129,9 +129,9 @@ def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): logging.info("Finished test_snapshot_controller_ready_status_not_ready.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") - def test_checkpoint_success(self, mock_watch_class): - """Test successful checkpoint creation.""" - logging.info("Starting test_checkpoint_success...") + def test_snapshot_success(self, mock_watch_class): + """Test successful snapshot creation.""" + logging.info("Starting test_snapshot_success...") # Mock the watch mock_watch = MagicMock() @@ -160,7 +160,7 @@ def test_checkpoint_success(self, mock_watch_class): } mock_watch.stream.return_value = [mock_event] - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 0) self.assertTrue(result.success) @@ -168,38 +168,38 @@ def test_checkpoint_success(self, mock_watch_class): # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() - logging.info("Finished test_checkpoint_success.") + logging.info("Finished test_snapshot_success.") - def test_checkpoint_controller_not_ready(self): - """Test checkpoint when controller is not ready.""" - logging.info("Starting test_checkpoint_controller_not_ready...") + def test_snapshot_controller_not_ready(self): + """Test snapshot when controller is not ready.""" + logging.info("Starting test_snapshot_controller_not_ready...") self.client.controller_ready = False - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("test-trigger", result.trigger_name) self.assertIn("Snapshot controller is not ready", result.error_reason) - logging.info("Finished test_checkpoint_controller_not_ready.") + logging.info("Finished test_snapshot_controller_not_ready.") - def test_checkpoint_no_pod_name(self): - """Test checkpoint when pod name is not set.""" - logging.info("Starting test_checkpoint_no_pod_name...") + def test_snapshot_no_pod_name(self): + """Test snapshot when pod name is not set.""" + logging.info("Starting test_snapshot_no_pod_name...") self.client.controller_ready = True self.client.pod_name = None - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("test-trigger", result.trigger_name) self.assertIn("Sandbox pod name not found", result.error_reason) - logging.info("Finished test_checkpoint_no_pod_name.") + logging.info("Finished test_snapshot_no_pod_name.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") - def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): - """Test checkpoint timeout scenario.""" - logging.info("Starting test_checkpoint_timeout...") + def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): + """Test snapshot timeout scenario.""" + logging.info("Starting test_snapshot_timeout...") mock_custom = MagicMock() mock_custom_class.return_value = mock_custom @@ -213,12 +213,12 @@ def test_checkpoint_timeout(self, mock_custom_class, mock_watch_class): # Mock empty stream (timeout) mock_watch.stream.return_value = [] - result = self.client.checkpoint("test-trigger") + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 1) self.assertFalse(result.success) self.assertIn("timed out", result.error_reason) - logging.info("Finished test_checkpoint_timeout.") + logging.info("Finished test_snapshot_timeout.") @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") def test_exit(self, mock_super_exit): diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index 45b817745..d7a2bf7f9 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -21,23 +21,23 @@ from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient -def test_checkpoint_response(checkpoint_response, checkpoint_name): +def test_snapshot_response(snapshot_response, snapshot_name): assert hasattr( - checkpoint_response, "trigger_name" - ), "Checkpoint response missing 'trigger_name' attribute" + snapshot_response, "trigger_name" + ), "snapshot response missing 'trigger_name' attribute" - print(f"Trigger Name: {checkpoint_response.trigger_name}") - print(f"Success: {checkpoint_response.success}") - print(f"Error Code: {checkpoint_response.error_code}") - print(f"Error Reason: {checkpoint_response.error_reason}") + print(f"Trigger Name: {snapshot_response.trigger_name}") + print(f"Success: {snapshot_response.success}") + print(f"Error Code: {snapshot_response.error_code}") + print(f"Error Reason: {snapshot_response.error_reason}") - assert checkpoint_response.trigger_name.startswith( - checkpoint_name - ), f"Expected trigger name prefix '{checkpoint_name}', but got '{checkpoint_response.trigger_name}'" + assert snapshot_response.trigger_name.startswith( + snapshot_name + ), f"Expected trigger name prefix '{snapshot_name}', but got '{snapshot_response.trigger_name}'" assert ( - checkpoint_response.success - ), f"Expected success=True, but got False. Reason: {checkpoint_response.error_reason}" - assert checkpoint_response.error_code == 0 + snapshot_response.success + ), f"Expected success=True, but got False. Reason: {snapshot_response.error_reason}" + assert snapshot_response.error_code == 0 async def main( @@ -63,8 +63,8 @@ async def main( config.load_kube_config() wait_time = 10 - first_checkpoint_name = "test-snapshot-10" - second_checkpoint_name = "test-snapshot-20" + first_snapshot_name = "test-snapshot-10" + second_snapshot_name = "test-snapshot-20" core_v1_api = client.CoreV1Api() try: @@ -81,18 +81,18 @@ async def main( time.sleep(wait_time) print( - f"Creating first pod snapshot '{first_checkpoint_name}' after {wait_time} seconds..." + f"Creating first pod snapshot '{first_snapshot_name}' after {wait_time} seconds..." ) - checkpoint_response = sandbox.checkpoint(first_checkpoint_name) - test_checkpoint_response(checkpoint_response, first_checkpoint_name) + snapshot_response = sandbox.snapshot(first_snapshot_name) + test_snapshot_response(snapshot_response, first_snapshot_name) time.sleep(wait_time) print( - f"\nCreating second pod snapshot '{second_checkpoint_name}' after {wait_time} seconds..." + f"\nCreating second pod snapshot '{second_snapshot_name}' after {wait_time} seconds..." ) - checkpoint_response = sandbox.checkpoint(second_checkpoint_name) - test_checkpoint_response(checkpoint_response, second_checkpoint_name) + snapshot_response = sandbox.snapshot(second_snapshot_name) + test_snapshot_response(snapshot_response, second_snapshot_name) print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( @@ -100,7 +100,7 @@ async def main( namespace=namespace, api_url=api_url, server_port=server_port, - ) as sandbox_restored: # restores from second_checkpoint_name by default + ) as sandbox_restored: # restores from second_snapshot_name by default print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) @@ -116,7 +116,7 @@ async def main( len(counts) > 0 ), "Failed to retrieve any 'Count:' logs from restored pod." - # Verify the counter resumed from the correct checkpoint state. + # Verify the counter resumed from the correct snapshot state. # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). min_expected_count_at_restore = wait_time * 2 first_count_after_restore = counts[0] From 35b70964aedcaa1a903c2bbfa16a426cafc3a62f Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 14 Feb 2026 01:21:37 +0000 Subject: [PATCH 08/10] Add is_restored --- .../gke_extensions/podsnapshot.md | 10 +-- .../gke_extensions/podsnapshot_client.py | 81 +++++++++---------- .../gke_extensions/test_podsnapshot_client.py | 1 - .../test_podsnapshot_extension.py | 30 ++----- 4 files changed, 48 insertions(+), 74 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index bf5dfd57c..d0c0bae92 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -4,11 +4,7 @@ This directory contains the Python client extension for interacting with the Age ## `podsnapshot_client.py` -This file defines the `SnapshotPersistenceManager` and `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. - -### `SnapshotPersistenceManager` - -A utility class for managing local persistence of snapshot metadata in a secure directory. Stores metadata as a dictionary keyed by `trigger_name`. +This file defines the `PodSnapshotSandboxClient` class, which extend the base `SandboxClient` to provide snapshot capabilities. ### `PodSnapshotSandboxClient` @@ -27,10 +23,6 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). -* **`list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None`**: - * TBD -* **`delete_snapshots(self, trigger_name: str) -> int`**: - * TBD * **Automatic Cleanup**: * The `__exit__` method cleans up the `SandboxClaim` resources. diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py index 6d7db73f8..ea26999d5 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot_client.py @@ -43,37 +43,11 @@ class SnapshotResponse: success: bool trigger_name: str + snapshot_uid: str error_reason: str error_code: int -class SnapshotPersistenceManager: - """ - Manages local persistence of snapshot metadata in a secure directory. - Stores metadata as a dictionary keyed by trigger_name. - """ - - def __init__(self): - """Initializes the persistence manager and ensures the secure directory exists.""" - pass - - def _ensure_secure_dir(self): - """Ensures the directory exists with 700 permissions.""" - pass - - def _load_metadata(self) -> dict[str, Any]: - """Loads metadata. Returns an empty dict if file doesn't exist or is invalid.""" - pass - - def save_snapshot_metadata(self, record: dict[str, Any]): - """Saves a snapshot record to the local registry.""" - pass - - def delete_snapshot_metadata(self, trigger_name: str): - """Deletes a snapshot record from the local registry.""" - pass - - class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. @@ -222,10 +196,12 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: ) snapshot_result = self._wait_for_snapshot_processed(trigger_name) - # TODO: Add snapshot metadata persistence logic here using SnapshotPersistenceManager - return SnapshotResponse( - success=True, trigger_name=trigger_name, error_reason="", error_code=0 + success=True, + trigger_name=trigger_name, + snapshot_uid=snapshot_result.snapshot_uid, + error_reason="", + error_code=0, ) except ApiException as e: logging.exception( @@ -234,6 +210,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", error_code=1, ) @@ -244,25 +221,45 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason=f"Snapshot creation timed out: {e}", error_code=1, ) - def list_snapshots(self, policy_name: str, ready_only: bool = True) -> list | None: + def is_restored(self) -> tuple[bool, str | None]: """ - Checks for existing snapshots matching the label selector and optional policy name. - Returns a list of valid snapshots sorted by creation timestamp (newest first). - policy_name: Filters snapshots by their spec.policyName. - ready_only: If True, filters out snapshots that are only in 'Ready' state. + Checks if the sandbox pod was restored from a snapshot. + Verifies this by checking for the 'PodRestored' condition in the pod status. + Returns: + tuple[bool, str | None]: (True, snapshot_uuid) if restored, (False, None) otherwise. """ - pass + if not self.pod_name: + logging.warning("Cannot check restore status: pod_name is unknown.") + return False, None - def delete_snapshots(self, trigger_name: str) -> int: - """ - Deletes snapshots matching the provided trigger name and the PSMT resources. - Returns the count of successfully deleted snapshots. - """ - pass + try: + v1 = client.CoreV1Api() + pod = v1.read_namespaced_pod(self.pod_name, self.namespace) + + if not pod.status or not pod.status.conditions: + return False, None + + for condition in pod.status.conditions: + if condition.type == "PodRestored" and condition.status == "True": + # Attempt to extract UUID from the message + # Message format: "pod successfully restored from pod snapshot namespace/uuid" + if condition.message: + parts = condition.message.split("/") + if len(parts) > 1: + return True, parts[-1].split()[0] + + return True, None + + return False, None + + except ApiException as e: + logging.error(f"Failed to check pod restore status: {e}") + return False, None def __exit__(self, exc_type, exc_val, exc_tb): """ diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py index 4660ee080..028538f68 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/test_podsnapshot_client.py @@ -19,7 +19,6 @@ from datetime import datetime from agentic_sandbox.gke_extensions.podsnapshot_client import ( PodSnapshotSandboxClient, - SnapshotPersistenceManager, ) from agentic_sandbox.constants import * from kubernetes import config diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index d7a2bf7f9..fdc59e6fb 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -27,6 +27,7 @@ def test_snapshot_response(snapshot_response, snapshot_name): ), "snapshot response missing 'trigger_name' attribute" print(f"Trigger Name: {snapshot_response.trigger_name}") + print(f"Snapshot UID: {snapshot_response.snapshot_uid}") print(f"Success: {snapshot_response.success}") print(f"Error Code: {snapshot_response.error_code}") print(f"Error Reason: {snapshot_response.error_reason}") @@ -93,6 +94,8 @@ async def main( ) snapshot_response = sandbox.snapshot(second_snapshot_name) test_snapshot_response(snapshot_response, second_snapshot_name) + recent_snapshot_uid = snapshot_response.snapshot_uid + print(f"Recent snapshot UID: {recent_snapshot_uid}") print("\n***** Phase 2: Restoring from most recent snapshot & Verifying *****") with PodSnapshotSandboxClient( @@ -105,29 +108,12 @@ async def main( print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - # Fetch logs using the Kubernetes API - logs = core_v1_api.read_namespaced_pod_log( - name=sandbox_restored.pod_name, namespace=sandbox_restored.namespace - ) - - # Extract the sequence of 'Count:' values from the pod logs - counts = [int(n) for n in re.findall(r"Count: (\d+)", logs)] + is_restored, restored_snapshot_uid = sandbox_restored.is_restored() + assert is_restored, "Pod was not restored from a snapshot." assert ( - len(counts) > 0 - ), "Failed to retrieve any 'Count:' logs from restored pod." - - # Verify the counter resumed from the correct snapshot state. - # The second snapshot was taken after two wait intervals (totaling 20s if wait_time=10). - min_expected_count_at_restore = wait_time * 2 - first_count_after_restore = counts[0] - - print(f"First count after restore: {first_count_after_restore}") - - assert first_count_after_restore >= min_expected_count_at_restore, ( - f"State Mismatch! Expected counter to start >= {min_expected_count_at_restore}, " - f"but got {first_count_after_restore}. The pod likely restarted from scratch." - ) - + restored_snapshot_uid == recent_snapshot_uid + ), "Restored snapshot UID does not match the most recent snapshot UID." + print("Pod was restored from the most recent snapshot.") print("--- Pod Snapshot Test Passed! ---") except Exception as e: From 1c56d9a3315403e675788a476269b94b81a0edc4 Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Sat, 14 Feb 2026 01:25:18 +0000 Subject: [PATCH 09/10] update read me --- .../agentic_sandbox/gke_extensions/podsnapshot.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md index d0c0bae92..121625c45 100644 --- a/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/agentic_sandbox/gke_extensions/podsnapshot.md @@ -22,7 +22,11 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * The trigger_name is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. - * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name). + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). +* **`isrestored(self) -> tuple[bool, str | None]`**: + * Checks if the sandbox pod was restored from a snapshot. + * Verifies this by checking for the 'PodRestored' condition in the pod status. + * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). * **Automatic Cleanup**: * The `__exit__` method cleans up the `SandboxClaim` resources. @@ -38,7 +42,7 @@ This file, located in the parent directory (`clients/python/agentic-sandbox-clie * Takes a second snapshot (`test-snapshot-20`) after another ~10 seconds. 2. **Phase 2: Restoring from Recent Snapshot**: * Restores a sandbox from the second snapshot. - * Verifies that the counter continues from where it left off (>= 20), proving the state was preserved. + * Verifies that sandbox has been restored from the recent snapshot. ### Prerequisites From 24cdef58af35a950285c0056257f9e033ec1238e Mon Sep 17 00:00:00 2001 From: Shrutiya Date: Thu, 19 Feb 2026 01:53:09 +0000 Subject: [PATCH 10/10] Update is_restored and error catch --- .../gke_extensions/podsnapshot.md | 17 +- .../gke_extensions/podsnapshot_client.py | 217 ++++++++++++---- .../gke_extensions/test_podsnapshot_client.py | 243 ++++++++++++++++-- .../test_podsnapshot_extension.py | 13 +- 4 files changed, 399 insertions(+), 91 deletions(-) diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md index 121625c45..33d5577d8 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot.md @@ -16,19 +16,20 @@ A specialized Sandbox client for interacting with the gke pod snapshot controlle * Initializes the client with optional podsnapshot timeout and server port. * If snapshot exists, the pod snapshot controller restores from the most recent snapshot matching the label of the `SandboxTemplate`, otherwise creates a new `Sandbox`. * **`snapshot_controller_ready(self) -> bool`**: - * Checks if the snapshot agent (both self-installed and GKE managed) is running and ready. + * Checks if the snapshot agent (GKE managed) is running and ready. * **`snapshot(self, trigger_name: str) -> SnapshotResponse`**: * Triggers a manual snapshot of the current sandbox pod by creating a `PodSnapshotManualTrigger` resource. - * The trigger_name is suffixed with unique hash. + * The `trigger_name` is suffixed with unique hash. * Waits for the snapshot to be processed. * The pod snapshot controller creates a `PodSnapshot` resource automatically. * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). -* **`isrestored(self) -> tuple[bool, str | None]`**: - * Checks if the sandbox pod was restored from a snapshot. - * Verifies this by checking for the 'PodRestored' condition in the pod status. - * Returns the SnapshotResponse object(success, error_code, error_reason, trigger_name, snapshot_uid). -* **Automatic Cleanup**: - * The `__exit__` method cleans up the `SandboxClaim` resources. +* **`is_restored_from_snapshot(self, snapshot_uid: str) -> RestoreResult`**: + * Checks if the sandbox pod was restored from the specified snapshot. + * Verifies restoration by checking the 'PodRestored' condition in the pod status and confirming the message contains the expected snapshot UID. + * Returns RestoreResult object(success, error_code, error_reason). +* **`__exit__(self)`**: + * Cleans up the `PodSnapshotManualTrigger` resources. + * Cleans up the `SandboxClaim` resources. ## `test_podsnapshot_extension.py` diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py index ea26999d5..65eadf8ba 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/podsnapshot_client.py @@ -22,11 +22,7 @@ from ..sandbox_client import SandboxClient, ExecutionResult from ..constants import * -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - stream=sys.stdout, -) +logger = logging.getLogger(__name__) @dataclass @@ -48,6 +44,15 @@ class SnapshotResponse: error_code: int +@dataclass +class RestoreResult: + """Result of a restore operation.""" + + success: bool + error_reason: str + error_code: int + + class PodSnapshotSandboxClient(SandboxClient): """ A specialized Sandbox client for interacting with the gke pod snapshot controller. @@ -65,21 +70,48 @@ def __init__( self.controller_ready = False self.podsnapshot_timeout = podsnapshot_timeout + self.core_v1_api = client.CoreV1Api() + + self.created_manual_triggers = [] def __enter__(self) -> "PodSnapshotSandboxClient": self.controller_ready = self.snapshot_controller_ready() super().__enter__() return self - def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: + def _parse_snapshot_result(self, obj) -> SnapshotResult | None: + """Parses the object to see if snapshot is complete.""" + status = obj.get("status", {}) + conditions = status.get("conditions", []) + for condition in conditions: + if ( + condition.get("type") == "Triggered" + and condition.get("status") == "True" + and condition.get("reason") == "Complete" + ): + snapshot_uid = status.get("snapshotCreated", {}).get("name") + snapshot_timestamp = condition.get("lastTransitionTime") + return SnapshotResult( + snapshot_uid=snapshot_uid, + snapshot_timestamp=snapshot_timestamp, + ) + return None + + def _wait_for_snapshot_processed( + self, trigger_name: str, resource_version: str | None = None + ) -> SnapshotResult: """ Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. """ w = watch.Watch() - logging.info( + logger.info( f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." ) + kwargs = {} + if resource_version: + kwargs["resource_version"] = resource_version + try: for event in w.stream( func=self.custom_objects_api.list_namespaced_custom_object, @@ -89,30 +121,19 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, field_selector=f"metadata.name={trigger_name}", timeout_seconds=self.podsnapshot_timeout, + **kwargs, ): if event["type"] in ["ADDED", "MODIFIED"]: obj = event["object"] - status = obj.get("status", {}) - conditions = status.get("conditions", []) - - for condition in conditions: - if ( - condition.get("type") == "Triggered" - and condition.get("status") == "True" - and condition.get("reason") == "Complete" - ): - snapshot_uid = status.get("snapshotCreated", {}).get("name") - snapshot_timestamp = condition.get("lastTransitionTime") - logging.info( - f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {snapshot_uid}" - ) - w.stop() - return SnapshotResult( - snapshot_uid=snapshot_uid, - snapshot_timestamp=snapshot_timestamp, - ) + result = self._parse_snapshot_result(obj) + if result: + logger.info( + f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {result.snapshot_uid}" + ) + w.stop() + return result except Exception as e: - logging.error(f"Error watching snapshot: {e}") + logger.error(f"Error watching snapshot: {e}") raise raise TimeoutError( @@ -122,16 +143,36 @@ def _wait_for_snapshot_processed(self, trigger_name: str) -> SnapshotResult: def snapshot_controller_ready(self) -> bool: """ Checks if the snapshot agent pods are running in a GKE-managed pod snapshot cluster. + Falls back to checking CRD existence if pod listing is forbidden. """ if self.controller_ready: return True - core_v1_api = client.CoreV1Api() + def check_crd_installed() -> bool: + try: + # Check directly if the API resource exists using CustomObjectsApi + resource_list = self.custom_objects_api.get_api_resources( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + ) + + if not resource_list or not resource_list.resources: + return False + + for resource in resource_list.resources: + if resource.kind == PODSNAPSHOT_API_KIND: + return True + return False + except ApiException as e: + # If discovery fails with 403/404, we assume not ready/accessible + if e.status == 403 or e.status == 404: + return False + raise def check_namespace(namespace: str, required_components: list[str]) -> bool: try: - pods = core_v1_api.list_namespaced_pod(namespace) + pods = self.core_v1_api.list_namespaced_pod(namespace) found_components = { component: False for component in required_components } @@ -144,8 +185,15 @@ def check_namespace(namespace: str, required_components: list[str]) -> bool: found_components[component] = True return all(found_components.values()) - except ApiException: - return False + except ApiException as e: + if e.status == 403: + logger.info( + f"Permission denied listing pods in {namespace}. Checking CRD existence." + ) + return check_crd_installed() + if e.status == 404: + return False + raise # Check managed: requires only agent in gke-managed-pod-snapshots if check_namespace(SNAPSHOT_NAMESPACE_MANAGED, [SNAPSHOT_AGENT]): @@ -168,6 +216,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason="Snapshot controller is not ready. Ensure it is installed and running.", error_code=1, ) @@ -175,6 +224,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: return SnapshotResponse( success=False, trigger_name=trigger_name, + snapshot_uid=None, error_reason="Sandbox pod name not found. Ensure sandbox is created.", error_code=1, ) @@ -187,14 +237,21 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: } try: - self.custom_objects_api.create_namespaced_custom_object( + created_obj = self.custom_objects_api.create_namespaced_custom_object( group=PODSNAPSHOT_API_GROUP, version=PODSNAPSHOT_API_VERSION, namespace=self.namespace, plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, body=manifest, ) - snapshot_result = self._wait_for_snapshot_processed(trigger_name) + + # Start watching from the version we just created to avoid missing updates + resource_version = created_obj.get("metadata", {}).get("resourceVersion") + snapshot_result = self._wait_for_snapshot_processed( + trigger_name, resource_version + ) + + self.created_manual_triggers.append(trigger_name) return SnapshotResponse( success=True, @@ -204,7 +261,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=0, ) except ApiException as e: - logging.exception( + logger.exception( f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" ) return SnapshotResponse( @@ -215,7 +272,7 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=1, ) except TimeoutError as e: - logging.exception( + logger.exception( f"Snapshot creation timed out for trigger '{trigger_name}': {e}" ) return SnapshotResponse( @@ -226,43 +283,91 @@ def snapshot(self, trigger_name: str) -> SnapshotResponse: error_code=1, ) - def is_restored(self) -> tuple[bool, str | None]: + def is_restored_from_snapshot(self, snapshot_uid: str) -> RestoreResult: """ - Checks if the sandbox pod was restored from a snapshot. - Verifies this by checking for the 'PodRestored' condition in the pod status. + Checks if the sandbox pod was restored from the specified snapshot. + + This is verified by inspecting the 'PodRestored' condition in the pod status + and confirming that the condition's message contains the provided snapshot UID. + Returns: - tuple[bool, str | None]: (True, snapshot_uuid) if restored, (False, None) otherwise. + RestoreResult: The result of the restore operation. """ + if not snapshot_uid: + return RestoreResult( + success=False, + error_reason="Snapshot UID cannot be empty.", + error_code=1, + ) + if not self.pod_name: - logging.warning("Cannot check restore status: pod_name is unknown.") - return False, None + logger.warning("Cannot check restore status: pod_name is unknown.") + return RestoreResult( + success=False, + error_reason="Pod name not found. Ensure sandbox is created.", + error_code=1, + ) try: - v1 = client.CoreV1Api() - pod = v1.read_namespaced_pod(self.pod_name, self.namespace) + pod = self.core_v1_api.read_namespaced_pod(self.pod_name, self.namespace) if not pod.status or not pod.status.conditions: - return False, None + return RestoreResult( + success=False, + error_reason="Pod status or conditions not found.", + error_code=1, + ) for condition in pod.status.conditions: if condition.type == "PodRestored" and condition.status == "True": - # Attempt to extract UUID from the message - # Message format: "pod successfully restored from pod snapshot namespace/uuid" - if condition.message: - parts = condition.message.split("/") - if len(parts) > 1: - return True, parts[-1].split()[0] - - return True, None - - return False, None + # Check if Snapshot UUID is present in the condition.message + if condition.message and snapshot_uid in condition.message: + return RestoreResult( + success=True, + error_reason="", + error_code=0, + ) + else: + return RestoreResult( + success=False, + error_reason="Pod was not restored from the given snapshot", + error_code=1, + ) + + return RestoreResult( + success=False, + error_reason="Pod was not restored from any snapshot", + error_code=1, + ) except ApiException as e: - logging.error(f"Failed to check pod restore status: {e}") - return False, None + logger.error(f"Failed to check pod restore status: {e}") + return RestoreResult( + success=False, + error_reason=f"Failed to check pod restore status: {e}", + error_code=1, + ) def __exit__(self, exc_type, exc_val, exc_tb): """ + Cleans up the PodSnapshotManualTrigger Resources. Automatically cleans up the Sandbox. + + TODO: Add cleanup for PodSnapshot resources. """ + for trigger_name in self.created_manual_triggers: + try: + self.custom_objects_api.delete_namespaced_custom_object( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name=trigger_name, + ) + logger.info(f"Deleted PodSnapshotManualTrigger '{trigger_name}'") + except ApiException as e: + logger.error( + f"Failed to delete PodSnapshotManualTrigger '{trigger_name}': {e}" + ) + super().__exit__(exc_type, exc_val, exc_tb) diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py index 028538f68..cc1ece767 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/gke_extensions/test_podsnapshot_client.py @@ -17,10 +17,12 @@ import logging from unittest.mock import MagicMock, patch, call from datetime import datetime -from agentic_sandbox.gke_extensions.podsnapshot_client import ( +from k8s_agent_sandbox.gke_extensions.podsnapshot_client import ( PodSnapshotSandboxClient, ) -from agentic_sandbox.constants import * +from k8s_agent_sandbox.constants import * +from kubernetes.client import ApiException + from kubernetes import config logging.basicConfig( @@ -74,7 +76,7 @@ def test_init(self): """Test initialization of PodSnapshotSandboxClient.""" logging.info("Starting test_init...") with patch( - "agentic_sandbox.sandbox_client.SandboxClient.__init__", return_value=None + "k8s_agent_sandbox.sandbox_client.SandboxClient.__init__", return_value=None ) as mock_super: with patch.object( PodSnapshotSandboxClient, "snapshot_controller_ready", return_value=True @@ -85,12 +87,10 @@ def test_init(self): self.assertEqual(client.podsnapshot_timeout, 180) logging.info("Finished test_init.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") - def test_snapshot_controller_ready_managed(self, mock_v1_class): + def test_snapshot_controller_ready_managed(self): """Test snapshot_controller_ready for managed scenario.""" logging.info("Starting test_snapshot_controller_ready_managed...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 + mock_v1 = self.client.core_v1_api # Mock pods in gke-managed-pod-snapshots mock_pod_agent = MagicMock() @@ -109,12 +109,10 @@ def test_snapshot_controller_ready_managed(self, mock_v1_class): mock_v1.list_namespaced_pod.assert_called_with(SNAPSHOT_NAMESPACE_MANAGED) logging.info("Finished test_snapshot_controller_ready_managed.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CoreV1Api") - def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): - """Test snapshot_controller_ready when not ready.""" + def test_snapshot_controller_ready_status_not_ready(self): + """Test snapshot_controller_ready when not ready (pod missing).""" logging.info("Starting test_snapshot_controller_ready_status_not_ready...") - mock_v1 = MagicMock() - mock_v1_class.return_value = mock_v1 + mock_v1 = self.client.core_v1_api mock_pods = MagicMock() mock_pods.items = [] @@ -127,7 +125,61 @@ def test_snapshot_controller_ready_status_not_ready(self, mock_v1_class): self.assertFalse(self.client.controller_ready) logging.info("Finished test_snapshot_controller_ready_status_not_ready.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + def test_snapshot_controller_ready_forbidden_with_crd(self): + """Test fallback to CRD check when pod listing is forbidden.""" + logging.info("Starting test_snapshot_controller_ready_forbidden_with_crd...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=403) + + # Mock CustomObjectsApi.get_api_resources + mock_resource_list = MagicMock() + mock_resource = MagicMock() + mock_resource.kind = PODSNAPSHOT_API_KIND + mock_resource_list.resources = [mock_resource] + self.client.custom_objects_api.get_api_resources.return_value = ( + mock_resource_list + ) + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertTrue(result) + self.assertTrue(self.client.controller_ready) + self.client.custom_objects_api.get_api_resources.assert_called_with( + group=PODSNAPSHOT_API_GROUP, version=PODSNAPSHOT_API_VERSION + ) + logging.info("Finished test_snapshot_controller_ready_forbidden_with_crd.") + + def test_snapshot_controller_ready_forbidden_no_crd(self): + """Test fallback to CRD check fails when CRD is missing.""" + logging.info("Starting test_snapshot_controller_ready_forbidden_no_crd...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=403) + + # Mock CustomObjectsApi.get_api_resources returning empty + self.client.custom_objects_api.get_api_resources.return_value = None + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_forbidden_no_crd.") + + def test_snapshot_controller_ready_404(self): + """Test snapshot_controller_ready returns False on 404.""" + logging.info("Starting test_snapshot_controller_ready_404...") + mock_v1 = self.client.core_v1_api + mock_v1.list_namespaced_pod.side_effect = ApiException(status=404) + + self.client.controller_ready = False + result = self.client.snapshot_controller_ready() + + self.assertFalse(result) + self.assertFalse(self.client.controller_ready) + logging.info("Finished test_snapshot_controller_ready_404.") + + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.watch.Watch") def test_snapshot_success(self, mock_watch_class): """Test successful snapshot creation.""" logging.info("Starting test_snapshot_success...") @@ -159,6 +211,12 @@ def test_snapshot_success(self, mock_watch_class): } mock_watch.stream.return_value = [mock_event] + # Mock create to return an object with resourceVersion + mock_created_obj = {"metadata": {"resourceVersion": "123"}, "status": {}} + self.client.custom_objects_api.create_namespaced_custom_object.return_value = ( + mock_created_obj + ) + result = self.client.snapshot("test-trigger") self.assertEqual(result.error_code, 0) @@ -167,6 +225,10 @@ def test_snapshot_success(self, mock_watch_class): # Verify create call was made self.client.custom_objects_api.create_namespaced_custom_object.assert_called_once() + # Verify watch was called with resource_version + mock_watch.stream.assert_called_once() + _, kwargs = mock_watch.stream.call_args + self.assertEqual(kwargs.get("resource_version"), "123") logging.info("Finished test_snapshot_success.") def test_snapshot_controller_not_ready(self): @@ -194,8 +256,27 @@ def test_snapshot_no_pod_name(self): self.assertIn("Sandbox pod name not found", result.error_reason) logging.info("Finished test_snapshot_no_pod_name.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.watch.Watch") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi") + def test_snapshot_creation_api_exception(self): + """Test snapshot handling of API exception during creation.""" + logging.info("Starting test_snapshot_creation_api_exception...") + self.client.pod_name = "test-pod" + self.client.controller_ready = True + + self.client.custom_objects_api.create_namespaced_custom_object.side_effect = ( + ApiException("Create failed") + ) + + result = self.client.snapshot("test-trigger") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Failed to create PodSnapshotManualTrigger", result.error_reason) + logging.info("Finished test_snapshot_creation_api_exception.") + + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.watch.Watch") + @patch( + "k8s_agent_sandbox.gke_extensions.podsnapshot_client.client.CustomObjectsApi" + ) def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): """Test snapshot timeout scenario.""" logging.info("Starting test_snapshot_timeout...") @@ -219,13 +300,135 @@ def test_snapshot_timeout(self, mock_custom_class, mock_watch_class): self.assertIn("timed out", result.error_reason) logging.info("Finished test_snapshot_timeout.") - @patch("agentic_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") - def test_exit(self, mock_super_exit): - """Test __exit__ method.""" - logging.info("Starting test_exit...") + @patch("k8s_agent_sandbox.gke_extensions.podsnapshot_client.SandboxClient.__exit__") + def test_exit_cleanup(self, mock_super_exit): + """Test __exit__ cleans up created triggers.""" + logging.info("Starting test_exit_cleanup...") + self.client.created_manual_triggers = ["trigger-1", "trigger-2"] + self.client.__exit__(None, None, None) + + # Check deletion calls + self.assertEqual( + self.client.custom_objects_api.delete_namespaced_custom_object.call_count, 2 + ) + + calls = [ + call( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.client.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name="trigger-1", + ), + call( + group=PODSNAPSHOT_API_GROUP, + version=PODSNAPSHOT_API_VERSION, + namespace=self.client.namespace, + plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, + name="trigger-2", + ), + ] + self.client.custom_objects_api.delete_namespaced_custom_object.assert_has_calls( + calls, any_order=True + ) + mock_super_exit.assert_called_once_with(None, None, None) - logging.info("Finished test_exit.") + logging.info("Finished test_exit_cleanup.") + + def test_is_restored_from_snapshot_success(self): + """Test is_restored_from_snapshot success case.""" + logging.info("Starting test_is_restored_from_snapshot_success...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_condition = MagicMock() + mock_condition.type = "PodRestored" + mock_condition.status = "True" + mock_condition.message = "Restored from snapshot-uid-123" + mock_pod.status.conditions = [mock_condition] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertTrue(result.success) + self.assertEqual(result.error_code, 0) + logging.info("Finished test_is_restored_from_snapshot_success.") + + def test_is_restored_from_snapshot_mismatch(self): + """Test is_restored_from_snapshot when UID matches another snapshot.""" + logging.info("Starting test_is_restored_from_snapshot_mismatch...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_condition = MagicMock() + mock_condition.type = "PodRestored" + mock_condition.status = "True" + mock_condition.message = "Restored from snapshot-uid-456" + mock_pod.status.conditions = [mock_condition] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("not restored from the given snapshot", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_mismatch.") + + def test_is_restored_from_snapshot_no_condition(self): + """Test is_restored_from_snapshot when PodRestored condition is missing.""" + logging.info("Starting test_is_restored_from_snapshot_no_condition...") + self.client.pod_name = "test-pod" + + mock_pod = MagicMock() + mock_pod.status.conditions = [] + + self.client.core_v1_api.read_namespaced_pod.return_value = mock_pod + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Pod status or conditions not found", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_no_condition.") + + def test_is_restored_from_snapshot_api_error(self): + """Test is_restored_from_snapshot API exception handling.""" + logging.info("Starting test_is_restored_from_snapshot_api_error...") + self.client.pod_name = "test-pod" + self.client.core_v1_api.read_namespaced_pod.side_effect = ApiException( + "API Error" + ) + + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Failed to check pod restore status", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_api_error.") + + def test_is_restored_from_snapshot_no_pod_name(self): + """Test is_restored_from_snapshot when pod_name is missing.""" + logging.info("Starting test_is_restored_from_snapshot_no_pod_name...") + self.client.pod_name = None + result = self.client.is_restored_from_snapshot("snapshot-uid-123") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Pod name not found", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_no_pod_name.") + + def test_is_restored_from_snapshot_empty_uid(self): + """Test is_restored_from_snapshot with empty UID.""" + logging.info("Starting test_is_restored_from_snapshot_empty_uid...") + result = self.client.is_restored_from_snapshot("") + + self.assertFalse(result.success) + self.assertEqual(result.error_code, 1) + self.assertIn("Snapshot UID cannot be empty", result.error_reason) + logging.info("Finished test_is_restored_from_snapshot_empty_uid.") if __name__ == "__main__": diff --git a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py index fdc59e6fb..04afbd81e 100644 --- a/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py +++ b/clients/python/agentic-sandbox-client/test_podsnapshot_extension.py @@ -18,7 +18,7 @@ import sys from kubernetes import client, config import re -from agentic_sandbox.gke_extensions import PodSnapshotSandboxClient +from k8s_agent_sandbox.gke_extensions import PodSnapshotSandboxClient def test_snapshot_response(snapshot_response, snapshot_name): @@ -66,7 +66,6 @@ async def main( wait_time = 10 first_snapshot_name = "test-snapshot-10" second_snapshot_name = "test-snapshot-20" - core_v1_api = client.CoreV1Api() try: print("\n***** Phase 1: Starting Counter *****") @@ -108,12 +107,12 @@ async def main( print("\nWaiting 5 seconds for restored pod to resume printing...") time.sleep(5) - is_restored, restored_snapshot_uid = sandbox_restored.is_restored() - assert is_restored, "Pod was not restored from a snapshot." - assert ( - restored_snapshot_uid == recent_snapshot_uid - ), "Restored snapshot UID does not match the most recent snapshot UID." + restore_result = sandbox_restored.is_restored_from_snapshot( + recent_snapshot_uid + ) + assert restore_result.success, "Pod was not restored from a snapshot." print("Pod was restored from the most recent snapshot.") + print("--- Pod Snapshot Test Passed! ---") except Exception as e: