Skip to content

Commit 6822bc6

Browse files
committed
STAC-23598: Adding check-and-finalize command to stackgraph
1 parent 52fcdf7 commit 6822bc6

File tree

12 files changed

+712
-168
lines changed

12 files changed

+712
-168
lines changed

cmd/elasticsearch/restore-snapshot.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func runRestore(appCtx *app.Context) error {
6565
defer func() {
6666
if len(scaledDeployments) > 0 {
6767
appCtx.Logger.Println()
68-
if err := scale.ScaleUp(appCtx.K8sClient, appCtx.Namespace, scaledDeployments, appCtx.Logger); err != nil {
68+
if err := scale.ScaleUpFromAnnotations(appCtx.K8sClient, appCtx.Namespace, appCtx.Config.Elasticsearch.Restore.ScaleDownLabelSelector, appCtx.Logger); err != nil {
6969
appCtx.Logger.Warningf("Failed to scale up deployments: %v", err)
7070
}
7171
}

cmd/root.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ var (
1717
// addBackupConfigFlags adds configuration flags needed for backup/restore operations
1818
// to commands that interact with data services (Elasticsearch, etc.)
1919
func addBackupConfigFlags(cmd *cobra.Command) {
20-
cmd.PersistentFlags().StringVar(&flags.Namespace, "namespace", "", "Kubernetes namespace (required)")
20+
cmd.PersistentFlags().StringVarP(&flags.Namespace, "namespace", "n", "", "Kubernetes namespace (required)")
2121
cmd.PersistentFlags().StringVar(&flags.Kubeconfig, "kubeconfig", "", "Path to kubeconfig file (default: ~/.kube/config)")
2222
cmd.PersistentFlags().BoolVar(&flags.Debug, "debug", false, "Enable debug output")
2323
cmd.PersistentFlags().BoolVarP(&flags.Quiet, "quiet", "q", false, "Suppress operational messages (only show errors and data output)")
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
package stackgraph
2+
3+
import (
4+
"fmt"
5+
"os"
6+
7+
"github.com/spf13/cobra"
8+
"github.com/stackvista/stackstate-backup-cli/internal/app"
9+
"github.com/stackvista/stackstate-backup-cli/internal/foundation/config"
10+
"github.com/stackvista/stackstate-backup-cli/internal/orchestration/scale"
11+
batchv1 "k8s.io/api/batch/v1"
12+
)
13+
14+
// Check and finalize command flags
15+
var (
16+
checkJobName string
17+
waitForJob bool
18+
)
19+
20+
func checkAndFinalizeCmd(globalFlags *config.CLIGlobalFlags) *cobra.Command {
21+
cmd := &cobra.Command{
22+
Use: "check-and-finalize",
23+
Short: "Check and finalize a Stackgraph restore job",
24+
Long: `Check the status of a background Stackgraph restore job and clean up resources.
25+
26+
This command is useful when a restore job was started with --background flag or was interrupted (Ctrl+C).
27+
It will check the job status, print logs if it failed, and clean up the job and PVC resources.
28+
29+
Examples:
30+
# Check job status without waiting
31+
sts-backup stackgraph check-and-finalize --job stackgraph-restore-20250128t143000 -n my-namespace
32+
33+
# Wait for job completion and cleanup
34+
sts-backup stackgraph check-and-finalize --job stackgraph-restore-20250128t143000 --wait -n my-namespace`,
35+
Run: func(_ *cobra.Command, _ []string) {
36+
appCtx, err := app.NewContext(globalFlags)
37+
if err != nil {
38+
_, _ = fmt.Fprintf(os.Stderr, "error: %v\n", err)
39+
os.Exit(1)
40+
}
41+
if err := runCheckAndFinalize(appCtx); err != nil {
42+
_, _ = fmt.Fprintf(os.Stderr, "error: %v\n", err)
43+
os.Exit(1)
44+
}
45+
},
46+
}
47+
48+
cmd.Flags().StringVarP(&checkJobName, "job", "j", "", "Stackgraph restore job name (required)")
49+
cmd.Flags().BoolVarP(&waitForJob, "wait", "w", false, "Wait for job to complete before cleanup")
50+
_ = cmd.MarkFlagRequired("job")
51+
52+
return cmd
53+
}
54+
55+
func runCheckAndFinalize(appCtx *app.Context) error {
56+
// Get job
57+
appCtx.Logger.Infof("Checking status of job: %s", checkJobName)
58+
job, err := appCtx.K8sClient.GetJob(appCtx.Namespace, checkJobName)
59+
if err != nil {
60+
return fmt.Errorf("failed to get job '%s': %w (job may not exist or has been deleted)", checkJobName, err)
61+
}
62+
63+
// Check if job is already complete
64+
completed, succeeded := isJobComplete(job)
65+
66+
if completed {
67+
// Job already finished - print status and cleanup
68+
return handleCompletedJob(appCtx, checkJobName, succeeded)
69+
}
70+
71+
// Job still running
72+
if waitForJob {
73+
// Wait for completion, then cleanup
74+
return waitAndFinalize(appCtx, checkJobName)
75+
}
76+
77+
// Not waiting - just print status
78+
printRunningJobStatus(appCtx.Logger, checkJobName, appCtx.Namespace, job.Status.Active)
79+
return nil
80+
}
81+
82+
// isJobComplete checks if job is in a terminal state
83+
func isJobComplete(job *batchv1.Job) (completed bool, succeeded bool) {
84+
if job.Status.Succeeded > 0 {
85+
return true, true
86+
}
87+
if job.Status.Failed > 0 {
88+
return true, false
89+
}
90+
return false, false
91+
}
92+
93+
// handleCompletedJob handles a job that's already complete
94+
func handleCompletedJob(appCtx *app.Context, jobName string, succeeded bool) error {
95+
appCtx.Logger.Println()
96+
if succeeded {
97+
appCtx.Logger.Successf("Job completed successfully: %s", jobName)
98+
appCtx.Logger.Println()
99+
100+
// Scale up deployments that were scaled down before restore
101+
scaleDownLabelSelector := appCtx.Config.Stackgraph.Restore.ScaleDownLabelSelector
102+
if err := scale.ScaleUpFromAnnotations(appCtx.K8sClient, appCtx.Namespace, scaleDownLabelSelector, appCtx.Logger); err != nil {
103+
appCtx.Logger.Warningf("Failed to scale up deployments: %v", err)
104+
}
105+
} else {
106+
appCtx.Logger.Errorf("Job failed: %s", jobName)
107+
appCtx.Logger.Println()
108+
appCtx.Logger.Infof("Fetching logs...")
109+
appCtx.Logger.Println()
110+
if err := printJobLogs(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger); err != nil {
111+
appCtx.Logger.Warningf("Failed to fetch logs: %v", err)
112+
}
113+
}
114+
115+
// Cleanup resources
116+
appCtx.Logger.Println()
117+
return cleanupRestoreResources(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger)
118+
}
119+
120+
// waitAndFinalize waits for job completion and then cleans up
121+
func waitAndFinalize(appCtx *app.Context, jobName string) error {
122+
printWaitingMessage(appCtx.Logger, jobName, appCtx.Namespace)
123+
124+
if err := waitForJobCompletion(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger); err != nil {
125+
appCtx.Logger.Errorf("Job failed: %v", err)
126+
// Still cleanup even if failed
127+
appCtx.Logger.Println()
128+
_ = cleanupRestoreResources(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger)
129+
return err
130+
}
131+
132+
appCtx.Logger.Println()
133+
appCtx.Logger.Successf("Job completed successfully: %s", jobName)
134+
appCtx.Logger.Println()
135+
136+
// Scale up deployments that were scaled down before restore
137+
scaleDownLabelSelector := appCtx.Config.Stackgraph.Restore.ScaleDownLabelSelector
138+
if err := scale.ScaleUpFromAnnotations(appCtx.K8sClient, appCtx.Namespace, scaleDownLabelSelector, appCtx.Logger); err != nil {
139+
appCtx.Logger.Warningf("Failed to scale up deployments: %v", err)
140+
}
141+
142+
appCtx.Logger.Println()
143+
return cleanupRestoreResources(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger)
144+
}

cmd/stackgraph/restore.go

Lines changed: 57 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
"github.com/stackvista/stackstate-backup-cli/internal/orchestration/portforward"
2222
"github.com/stackvista/stackstate-backup-cli/internal/orchestration/scale"
2323
"github.com/stackvista/stackstate-backup-cli/internal/scripts"
24-
batchv1 "k8s.io/api/batch/v1"
2524
corev1 "k8s.io/api/core/v1"
2625
)
2726

@@ -111,7 +110,7 @@ func runRestore(appCtx *app.Context) error {
111110
defer func() {
112111
if len(scaledDeployments) > 0 && !background {
113112
appCtx.Logger.Println()
114-
if err := scale.ScaleUp(appCtx.K8sClient, appCtx.Namespace, scaledDeployments, appCtx.Logger); err != nil {
113+
if err := scale.ScaleUpFromAnnotations(appCtx.K8sClient, appCtx.Namespace, scaleDownLabelSelector, appCtx.Logger); err != nil {
115114
appCtx.Logger.Warningf("Failed to scale up deployments: %v", err)
116115
}
117116
}
@@ -129,19 +128,18 @@ func runRestore(appCtx *app.Context) error {
129128

130129
jobName := fmt.Sprintf("%s-%s", jobNameTemplate, time.Now().Format("20060102t150405"))
131130

132-
job, pvc, err := createRestoreJob(appCtx.K8sClient, appCtx.Namespace, jobName, backupFile, appCtx.Config)
133-
if err != nil {
131+
if err = createRestoreJob(appCtx.K8sClient, appCtx.Namespace, jobName, backupFile, appCtx.Config); err != nil {
134132
return fmt.Errorf("failed to create restore job: %w", err)
135133
}
136134

137135
appCtx.Logger.Successf("Restore job created: %s", jobName)
138136

139137
if background {
140-
printBackgroundModeInstructions(appCtx.Logger, jobName, appCtx.Namespace, scaleDownLabelSelector)
138+
printRunningJobStatus(appCtx.Logger, jobName, appCtx.Namespace, 0)
141139
return nil
142140
}
143141

144-
return waitAndCleanupRestoreJob(appCtx.K8sClient, appCtx.Namespace, jobName, job, pvc, appCtx.Logger)
142+
return waitAndCleanupRestoreJob(appCtx.K8sClient, appCtx.Namespace, jobName, appCtx.Logger)
145143
}
146144

147145
// ensureRestoreResources ensures that required Kubernetes resources exist for the restore job
@@ -186,22 +184,56 @@ func ensureRestoreResources(k8sClient *k8s.Client, namespace string, config *con
186184
return nil
187185
}
188186

189-
// printBackgroundModeInstructions prints instructions for monitoring and cleanup in background mode
190-
func printBackgroundModeInstructions(log *logger.Logger, jobName, namespace, scaleDownSelector string) {
187+
// printWaitingMessage prints waiting message with instructions for interruption
188+
func printWaitingMessage(log *logger.Logger, jobName, namespace string) {
191189
log.Println()
192-
log.Infof("Job is running in background. Use the following commands to monitor and cleanup:")
193-
log.Infof(" Monitor: kubectl logs --follow job/%s -n %s", jobName, namespace)
194-
log.Infof(" Cleanup: kubectl delete job,pvc %s -n %s", jobName, namespace)
195-
log.Infof(" Scale up: kubectl scale --replicas=1 deployments --selector=%s -n %s", scaleDownSelector, namespace)
190+
log.Infof("Waiting for restore job to complete (this may take several minutes)...")
196191
log.Println()
197-
log.Warningf("IMPORTANT: The restore job may take significant time to complete.")
198-
log.Warningf("Remember to scale up deployments after the job completes successfully!")
192+
log.Infof("You can safely interrupt this command with Ctrl+C.")
193+
log.Infof("To check status, scale up the required deployments and cleanup later, run:")
194+
log.Infof(" sts-backup stackgraph check-and-finalize --job %s --wait -n %s", jobName, namespace)
199195
}
200196

201-
// waitAndCleanupRestoreJob waits for job completion and cleans up resources
202-
func waitAndCleanupRestoreJob(k8sClient *k8s.Client, namespace, jobName string, job *batchv1.Job, pvc *corev1.PersistentVolumeClaim, log *logger.Logger) error {
197+
// printRunningJobStatus prints status and instructions for a running job
198+
func printRunningJobStatus(log *logger.Logger, jobName, namespace string, activePods int32) {
203199
log.Println()
204-
log.Infof("Waiting for restore job to complete (this may take several minutes)...")
200+
log.Infof("Job is running in background: %s", jobName)
201+
if activePods > 0 {
202+
log.Infof(" Active pods: %d", activePods)
203+
}
204+
log.Println()
205+
log.Infof("Monitoring commands:")
206+
log.Infof(" kubectl logs --follow job/%s -n %s", jobName, namespace)
207+
log.Infof(" kubectl get job %s -n %s", jobName, namespace)
208+
log.Println()
209+
log.Infof("To wait for completion, scaling up the necessary deployments and cleanup, run:")
210+
log.Infof(" sts-backup stackgraph check-and-finalize --job %s --wait -n %s", jobName, namespace)
211+
}
212+
213+
// cleanupRestoreResources cleans up job and PVC resources
214+
func cleanupRestoreResources(k8sClient *k8s.Client, namespace, jobName string, log *logger.Logger) error {
215+
log.Infof("Cleaning up job and PVC...")
216+
217+
// Delete job
218+
if err := k8sClient.DeleteJob(namespace, jobName); err != nil {
219+
log.Warningf("Failed to delete job: %v", err)
220+
} else {
221+
log.Successf("Job deleted: %s", jobName)
222+
}
223+
224+
// Delete PVC (same name as job)
225+
if err := k8sClient.DeletePVC(namespace, jobName); err != nil {
226+
log.Warningf("Failed to delete PVC: %v", err)
227+
} else {
228+
log.Successf("PVC deleted: %s", jobName)
229+
}
230+
231+
return nil
232+
}
233+
234+
// waitAndCleanupRestoreJob waits for job completion and cleans up resources
235+
func waitAndCleanupRestoreJob(k8sClient *k8s.Client, namespace, jobName string, log *logger.Logger) error {
236+
printWaitingMessage(log, jobName, namespace)
205237

206238
if err := waitForJobCompletion(k8sClient, namespace, jobName, log); err != nil {
207239
log.Errorf("Job failed: %v", err)
@@ -214,16 +246,9 @@ func waitAndCleanupRestoreJob(k8sClient *k8s.Client, namespace, jobName string,
214246
log.Println()
215247
log.Successf("Restore completed successfully")
216248

217-
// Cleanup job and PVC
218-
log.Infof("Cleaning up job and PVC...")
219-
if err := k8sClient.DeleteJob(namespace, job.Name); err != nil {
220-
log.Warningf("Failed to delete job: %v", err)
221-
}
222-
if err := k8sClient.DeletePVC(namespace, pvc.Name); err != nil {
223-
log.Warningf("Failed to delete PVC: %v", err)
224-
}
225-
226-
return nil
249+
// Cleanup job and PVC using shared function
250+
log.Println()
251+
return cleanupRestoreResources(k8sClient, namespace, jobName, log)
227252
}
228253

229254
// getLatestBackup retrieves the most recent backup from S3
@@ -305,7 +330,7 @@ func buildPVCSpec(name string, config *config.Config, labels map[string]string)
305330
}
306331

307332
// createRestoreJob creates a Kubernetes Job and PVC for restoring from backup
308-
func createRestoreJob(k8sClient *k8s.Client, namespace, jobName, backupFile string, config *config.Config) (*batchv1.Job, *corev1.PersistentVolumeClaim, error) {
333+
func createRestoreJob(k8sClient *k8s.Client, namespace, jobName, backupFile string, config *config.Config) error {
309334
defaultMode := int32(configMapDefaultFileMode)
310335

311336
// Merge common labels with resource-specific labels
@@ -316,7 +341,7 @@ func createRestoreJob(k8sClient *k8s.Client, namespace, jobName, backupFile stri
316341
pvcSpec := buildPVCSpec(jobName, config, pvcLabels)
317342
pvc, err := k8sClient.CreatePVC(namespace, pvcSpec)
318343
if err != nil {
319-
return nil, nil, fmt.Errorf("failed to create PVC: %w", err)
344+
return fmt.Errorf("failed to create PVC: %w", err)
320345
}
321346

322347
// Build job spec using configuration
@@ -339,14 +364,14 @@ func createRestoreJob(k8sClient *k8s.Client, namespace, jobName, backupFile stri
339364
}
340365

341366
// Create job
342-
job, err := k8sClient.CreateBackupJob(namespace, spec)
367+
_, err = k8sClient.CreateBackupJob(namespace, spec)
343368
if err != nil {
344369
// Cleanup PVC if job creation fails
345370
_ = k8sClient.DeletePVC(namespace, pvc.Name)
346-
return nil, nil, fmt.Errorf("failed to create job: %w", err)
371+
return fmt.Errorf("failed to create job: %w", err)
347372
}
348373

349-
return job, pvc, nil
374+
return nil
350375
}
351376

352377
// buildRestoreEnvVars constructs environment variables for the restore job

cmd/stackgraph/stackgraph.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ func Cmd(globalFlags *config.CLIGlobalFlags) *cobra.Command {
1313

1414
cmd.AddCommand(listCmd(globalFlags))
1515
cmd.AddCommand(restoreCmd(globalFlags))
16+
cmd.AddCommand(checkAndFinalizeCmd(globalFlags))
1617

1718
return cmd
1819
}

0 commit comments

Comments
 (0)