-
Notifications
You must be signed in to change notification settings - Fork 188
feat: implement pod snapshot method #339
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,34 +13,66 @@ | |
| # limitations under the License. | ||
|
|
||
| import logging | ||
| from kubernetes import client | ||
| import uuid | ||
| from datetime import datetime, timezone | ||
| from typing import Any | ||
| from dataclasses import dataclass | ||
| from kubernetes import client, watch | ||
| from kubernetes.client import ApiException | ||
| from ..sandbox_client import SandboxClient | ||
| from ..constants import ( | ||
| PODSNAPSHOT_API_GROUP, | ||
| PODSNAPSHOT_API_VERSION, | ||
| PODSNAPSHOT_API_KIND, | ||
| PODSNAPSHOTMANUALTRIGGER_API_KIND, | ||
| PODSNAPSHOTMANUALTRIGGER_PLURAL, | ||
| ) | ||
|
|
||
| SNAPSHOT_SUCCESS_CODE = 0 | ||
| SNAPSHOT_ERROR_CODE = 1 | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| @dataclass | ||
| class SnapshotResult: | ||
| """Result of a snapshot processing operation.""" | ||
|
|
||
| snapshot_uid: str | ||
| snapshot_timestamp: str | ||
|
|
||
|
|
||
| @dataclass | ||
| class SnapshotResponse: | ||
| """Structured response for snapshot operations.""" | ||
|
|
||
| success: bool | ||
| trigger_name: str | ||
| snapshot_uid: str | ||
| snapshot_timestamp: str | ||
| error_reason: str | ||
| error_code: int | ||
|
|
||
|
|
||
| class PodSnapshotSandboxClient(SandboxClient): | ||
| """ | ||
| A specialized Sandbox client for interacting with the GKE Pod Snapshot Controller. | ||
|
|
||
| TODO: This class enables users to take a snapshot of their sandbox and restore from the taken snapshot. | ||
| This class enables users to take a manual trigger snapshot of their sandbox and restore from the taken snapshot. | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| template_name: str, | ||
| podsnapshot_timeout: int = 180, | ||
| **kwargs, | ||
| ): | ||
| super().__init__(template_name, **kwargs) | ||
|
|
||
| self.snapshot_crd_installed = False | ||
| self.core_v1_api = client.CoreV1Api() | ||
| self.podsnapshot_timeout = podsnapshot_timeout | ||
|
|
||
| self.created_manual_triggers = [] | ||
|
|
||
| def __enter__(self) -> "PodSnapshotSandboxClient": | ||
| try: | ||
|
|
@@ -61,7 +93,9 @@ def __enter__(self) -> "PodSnapshotSandboxClient": | |
|
|
||
| def _check_snapshot_crd_installed(self) -> bool: | ||
| """ | ||
| Checks if the PodSnapshot CRD is installed in the cluster. | ||
| Checks if the PodSnapshot CRD is installed and available in the cluster. | ||
| Returns: | ||
| bool: True if the CRD is installed, False otherwise. | ||
| """ | ||
|
|
||
| if self.snapshot_crd_installed: | ||
|
|
@@ -87,8 +121,221 @@ def _check_snapshot_crd_installed(self) -> bool: | |
| return False | ||
| raise | ||
|
|
||
| def _parse_created_snapshot_info(self, obj: dict[str, Any]) -> SnapshotResult: | ||
| """Parses the object to extract snapshot details.""" | ||
| status = obj.get("status", {}) | ||
| conditions = status.get("conditions") or [] | ||
| for condition in conditions: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the the API responds with an explicit |
||
| if ( | ||
| condition.get("type") == "Triggered" | ||
| and condition.get("status") == "True" | ||
| and condition.get("reason") == "Complete" | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ): | ||
| snapshot_created = status.get("snapshotCreated") or {} | ||
| snapshot_uid = snapshot_created.get("name") | ||
| snapshot_timestamp = condition.get("lastTransitionTime") | ||
| return SnapshotResult( | ||
| snapshot_uid=snapshot_uid, | ||
| snapshot_timestamp=snapshot_timestamp, | ||
| ) | ||
| elif condition.get("status") == "False" and condition.get("reason") in [ | ||
| "Failed", | ||
| "Error", | ||
| ]: | ||
| raise RuntimeError( | ||
| f"Snapshot failed. Condition: {condition.get('message', 'Unknown error')}" | ||
| ) | ||
| raise ValueError("Snapshot is not yet complete.") | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def _wait_for_snapshot_to_be_completed( | ||
| self, trigger_name: str, resource_version: str | None = None | ||
| ) -> SnapshotResult: | ||
| """ | ||
| Waits for the PodSnapshotManualTrigger to be processed and returns SnapshotResult. | ||
| """ | ||
| w = watch.Watch() | ||
| logger.info( | ||
| f"Waiting for snapshot manual trigger '{trigger_name}' to be processed..." | ||
| ) | ||
|
|
||
| kwargs = {} | ||
| if resource_version: | ||
| kwargs["resource_version"] = resource_version | ||
|
|
||
| try: | ||
| for event in w.stream( | ||
| func=self.custom_objects_api.list_namespaced_custom_object, | ||
| namespace=self.namespace, | ||
| group=PODSNAPSHOT_API_GROUP, | ||
| version=PODSNAPSHOT_API_VERSION, | ||
| plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, | ||
| field_selector=f"metadata.name={trigger_name}", | ||
| timeout_seconds=self.podsnapshot_timeout, | ||
| **kwargs, | ||
| ): | ||
| if event["type"] in ["ADDED", "MODIFIED"]: | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| obj = event["object"] | ||
| try: | ||
| result = self._parse_created_snapshot_info(obj) | ||
| logger.info( | ||
| f"Snapshot manual trigger '{trigger_name}' processed successfully. Created Snapshot UID: {result.snapshot_uid}" | ||
| ) | ||
| return result | ||
| except ValueError: | ||
| # Continue watching if snapshot is not yet complete | ||
| continue | ||
| elif event["type"] == "ERROR": | ||
| logger.error( | ||
| f"Snapshot watch received error event: {event['object']}" | ||
| ) | ||
| raise RuntimeError(f"Snapshot watch error: {event['object']}") | ||
| elif event["type"] == "DELETED": | ||
| logger.error( | ||
| f"Snapshot manual trigger '{trigger_name}' was deleted before completion." | ||
| ) | ||
| raise RuntimeError( | ||
| f"Snapshot manual trigger '{trigger_name}' was deleted." | ||
| ) | ||
| except Exception as e: | ||
| logger.error(f"Error watching snapshot: {e}") | ||
| raise | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| finally: | ||
| w.stop() | ||
|
|
||
| raise TimeoutError( | ||
| f"Snapshot manual trigger '{trigger_name}' was not processed within {self.podsnapshot_timeout} seconds." | ||
| ) | ||
|
|
||
| def snapshot(self, trigger_name: str) -> SnapshotResponse: | ||
| """ | ||
| Triggers a snapshot of the specified pod by creating a PodSnapshotManualTrigger resource. | ||
| The trigger_name will be suffixed with a timestamp and random hex string. | ||
| Returns: | ||
| SnapshotResponse: The result of the operation. | ||
| """ | ||
| timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") | ||
| suffix = uuid.uuid4().hex[:8] | ||
| trigger_name = f"{trigger_name}-{timestamp}-{suffix}" | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if not self.snapshot_crd_installed: | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason="Snapshot CRD is not installed. Ensure it is installed and running.", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
| if not self.pod_name: | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason="Sandbox pod name not found. Ensure sandbox is created.", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
|
|
||
| manifest = { | ||
| "apiVersion": f"{PODSNAPSHOT_API_GROUP}/{PODSNAPSHOT_API_VERSION}", | ||
| "kind": f"{PODSNAPSHOTMANUALTRIGGER_API_KIND}", | ||
| "metadata": {"name": trigger_name, "namespace": self.namespace}, | ||
| "spec": {"targetPod": self.pod_name}, | ||
| } | ||
|
|
||
| try: | ||
| created_obj = self.custom_objects_api.create_namespaced_custom_object( | ||
| group=PODSNAPSHOT_API_GROUP, | ||
| version=PODSNAPSHOT_API_VERSION, | ||
| namespace=self.namespace, | ||
| plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, | ||
| body=manifest, | ||
| ) | ||
| self.created_manual_triggers.append(trigger_name) | ||
|
|
||
| # Start watching from the version we just created to avoid missing updates | ||
| resource_version = created_obj.get("metadata", {}).get("resourceVersion") | ||
| snapshot_result = self._wait_for_snapshot_to_be_completed( | ||
| trigger_name, resource_version | ||
| ) | ||
|
|
||
| return SnapshotResponse( | ||
| success=True, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=snapshot_result.snapshot_uid, | ||
| snapshot_timestamp=snapshot_result.snapshot_timestamp, | ||
| error_reason="", | ||
| error_code=SNAPSHOT_SUCCESS_CODE, | ||
| ) | ||
| except ApiException as e: | ||
| logger.exception( | ||
| f"Failed to create PodSnapshotManualTrigger '{trigger_name}': {e}" | ||
| ) | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason=f"Failed to create PodSnapshotManualTrigger: {e}", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
| except TimeoutError as e: | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| logger.exception( | ||
| f"Snapshot creation timed out for trigger '{trigger_name}': {e}" | ||
| ) | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason=f"Snapshot creation timed out: {e}", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
| except RuntimeError as e: | ||
| logger.exception( | ||
| f"Snapshot creation failed for trigger '{trigger_name}': {e}" | ||
| ) | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason=f"Snapshot creation failed: {e}", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
| except Exception as e: | ||
| logger.exception( | ||
| f"Unexpected error during snapshot creation for trigger '{trigger_name}': {e}" | ||
| ) | ||
| return SnapshotResponse( | ||
| success=False, | ||
| trigger_name=trigger_name, | ||
| snapshot_uid=None, | ||
| snapshot_timestamp=None, | ||
| error_reason=f"Unexpected error: {e}", | ||
| error_code=SNAPSHOT_ERROR_CODE, | ||
| ) | ||
|
|
||
| def __exit__(self, exc_type, exc_val, exc_tb): | ||
| """ | ||
| Cleans up the PodSnapshotManualTrigger Resources. | ||
shrutiyam-glitch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Automatically cleans up the Sandbox. | ||
| """ | ||
| for trigger_name in self.created_manual_triggers: | ||
| try: | ||
| self.custom_objects_api.delete_namespaced_custom_object( | ||
| group=PODSNAPSHOT_API_GROUP, | ||
| version=PODSNAPSHOT_API_VERSION, | ||
| namespace=self.namespace, | ||
| plural=PODSNAPSHOTMANUALTRIGGER_PLURAL, | ||
| name=trigger_name, | ||
| ) | ||
| logger.info(f"Deleted PodSnapshotManualTrigger '{trigger_name}'") | ||
| except ApiException as e: | ||
| if e.status == 404: | ||
| # Ignore if the resource is already deleted | ||
| continue | ||
| logger.error( | ||
| f"Failed to delete PodSnapshotManualTrigger '{trigger_name}': {e}" | ||
| ) | ||
| super().__exit__(exc_type, exc_val, exc_tb) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
snapshot_timestampis extracted and stored inSnapshotResult, but it is never utilized or returned inSnapshotResponse. Should we add it to the finalSnapshotResponseso the user knows when the snapshot occurred