diff --git a/.gitignore b/.gitignore index 2dde72219..a1e41b033 100644 --- a/.gitignore +++ b/.gitignore @@ -22,10 +22,16 @@ scripts/utils/export-metrics/wal scripts/utils/export-metrics/queries.active scripts/utils/export-metrics/*.tar.gz +# Results and reports +/summarize-results* + # Binaries /dartboard /scripts/soak/soak /qasereporter-k6/qasereporter-k6 +/summarize-tools/export-metrics/export-metrics +/summarize-tools/collect-profile/collect-profile +/summarize-tools/count-resources/count-resources # Vendored binaries /internal/vendored/bin diff --git a/cmd/dartboard/main.go b/cmd/dartboard/main.go index 2d268958c..6853fa5b0 100644 --- a/cmd/dartboard/main.go +++ b/cmd/dartboard/main.go @@ -88,6 +88,38 @@ func main() { Description: "runs `tofu destroy` and then deploys all the provisioned clusters", Action: subcommands.Redeploy, }, + { + Name: "summarize", + Usage: "Summarize the current deployment by capturing metrics, profiles, and resource counts", + Description: "runs `export-metrics`, `collect-profile`, and `resource-counts` against the deployed clusters", + Action: subcommands.Summarize, + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "metrics", + Aliases: []string{"m"}, + Value: false, + Usage: "only include metrics in summary", + }, + &cli.BoolFlag{ + Name: "counts", + Aliases: []string{"c"}, + Value: false, + Usage: "only include resource counts in summary", + }, + &cli.BoolFlag{ + Name: "profiles", + Aliases: []string{"p"}, + Value: false, + Usage: "only include profiles in summary", + }, + &cli.BoolFlag{ + Name: "all", + Aliases: []string{"a"}, + Value: false, + Usage: "include all summaries (metrics, resource counts and profiles)", + }, + }, + }, }, } diff --git a/cmd/dartboard/subcommands/deploy.go b/cmd/dartboard/subcommands/deploy.go index 89e1672b4..682b464ed 100644 --- a/cmd/dartboard/subcommands/deploy.go +++ b/cmd/dartboard/subcommands/deploy.go @@ -111,6 +111,9 @@ func Deploy(cli *cli.Context) error { if err = chartInstallRancherMonitoring(r, &upstream);
err != nil { return err } + if err = updateMonitoringProject(&upstream); err != nil { + return err + } if err = importDownstreamClusters(r, rancherImageTag, tf, clusters); err != nil { return err } @@ -659,3 +662,26 @@ func importClustersDownstreamGetYAML(clusters map[string]tofu.Cluster, name stri return } + +func updateMonitoringProject(cluster *tofu.Cluster) error { + var b strings.Builder + args := []string{ + "get", "namespace", "cattle-system", + "-o", "jsonpath={.metadata.annotations.field\\.cattle\\.io/projectId}", + } + if err := kubectl.Exec(cluster.Kubeconfig, &b, args...); err != nil { + return fmt.Errorf("failed to read projectId from cattle-system: %w", err) + } + projID := strings.TrimSpace(b.String()) + if projID == "" { + return fmt.Errorf("no projectId found") + } + + if err := kubectl.Exec(cluster.Kubeconfig, log.Writer(), + "annotate", "namespace", "cattle-monitoring-system", + "field.cattle.io/projectId="+projID, "--overwrite"); err != nil { + return fmt.Errorf("failed to annotate cattle-monitoring-system: %w", err) + } + + return nil +} \ No newline at end of file diff --git a/cmd/dartboard/subcommands/summarize.go b/cmd/dartboard/subcommands/summarize.go new file mode 100644 index 000000000..0e497462c --- /dev/null +++ b/cmd/dartboard/subcommands/summarize.go @@ -0,0 +1,123 @@ +/* +Copyright © 2024 SUSE LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package subcommands + +import ( + "fmt" + "os" + "time" + + "github.com/rancher/dartboard/internal/summarize/collectprofiles" + "github.com/rancher/dartboard/internal/summarize/exportmetrics" + "github.com/rancher/dartboard/internal/summarize/countresources" + "github.com/urfave/cli/v2" +) + +func Summarize(cli *cli.Context) error { + tf, _, err := prepare(cli) + if err != nil { + return err + } + + clusters, err := tf.OutputClusters(cli.Context) + if err != nil { + return err + } + + // Select upstream cluster configuration + upstream := clusters["upstream"] + + // Read flags; allow shorthand via Aliases set in main.go + metrics := cli.Bool("metrics") + counts := cli.Bool("counts") + profiles := cli.Bool("profiles") + allFlag := cli.Bool("all") + + // If --all provided, enable everything + if allFlag { + metrics, counts, profiles = true, true, true + } + + // If user didn't specify any of these flags, default to all + if !(cli.IsSet("metrics") || cli.IsSet("counts") || cli.IsSet("profiles") || cli.IsSet("all")) { + metrics, counts, profiles = true, true, true + } + + // Create top-level summary directory for this run so all tool outputs aggregate there + summaryDir := fmt.Sprintf("summarize-results-%s", time.Now().Format("2006-01-02")) + if err := os.MkdirAll(summaryDir, 0755); err != nil { + return fmt.Errorf("failed to create summary directory %s: %w", summaryDir, err) + } + + // Change working directory to summaryDir so tools output files there + originalWd, err := os.Getwd() + if err != nil { + return fmt.Errorf("failed to determine working directory: %w", err) + } + if err := os.Chdir(summaryDir); err != nil { + return fmt.Errorf("failed to change directory to %s: %w", summaryDir, err) + } + defer func() { + if err := os.Chdir(originalWd); err != nil { + fmt.Fprintf(os.Stderr, "warning: failed to restore working directory: %v\n", err) + } + }() + + ctx := cli.Context + + // Run Tools + if profiles { + fmt.Println(">>> Running collect-profile...") + // 
Defaults match original flags + cfg := collectprofile.Config{ + App: "rancher", + Profiles: "goroutine,heap,profile", + Duration: 30, + LogLevel: "debug", + } + if err := collectprofile.Run(ctx, cfg); err != nil { + fmt.Printf("Error running collect-profile: %v\n", err) + } + } + + if counts { + fmt.Println(">>> Running resource-counts...") + cfg := countresources.Config{ + Kubeconfig: upstream.Kubeconfig, + } + if err := countresources.Run(ctx, cfg); err != nil { + fmt.Printf("Error running resource-counts: %v\n", err) + } + } + + if metrics { + fmt.Println(">>> Running export-metrics...") + cfg := exportmetrics.Config{ + Kubeconfig: upstream.Kubeconfig, + // Defaults + Selector: `{__name__!=""}`, + OffsetSeconds: 3600, + ToSeconds: time.Now().Unix(), + FromSeconds: time.Now().Add(-1 * time.Hour).Unix(), + } + if err := exportmetrics.Run(ctx, cfg); err != nil { + fmt.Printf("Error running export-metrics: %v\n", err) + } + } + + return nil +} \ No newline at end of file diff --git a/internal/summarize/collectprofiles/collect_profiles.go b/internal/summarize/collectprofiles/collect_profiles.go new file mode 100644 index 000000000..b530ae636 --- /dev/null +++ b/internal/summarize/collectprofiles/collect_profiles.go @@ -0,0 +1,368 @@ +package collectprofile + +import ( + "context" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +type Config struct { + App string + Profiles string + Duration int + LogLevel string + Namespace string + Container string + Prefix string + BlobURL string + BlobToken string + MainFilename string +} + +func Run(ctx context.Context, cfg Config) error { + // Validate App choice + validApps := map[string]bool{ + "rancher": true, + "cattle-cluster-agent": true, + "fleet-controller": true, + "fleet-agent": true, + } + if !validApps[cfg.App] { + return fmt.Errorf("invalid app: %s. 
Supported: rancher, cattle-cluster-agent, fleet-controller, fleet-agent", cfg.App) + } + + // Setup defaults + if cfg.Prefix == "" { + cfg.Prefix = "rancher" + } + if cfg.MainFilename == "" { + cfg.MainFilename = fmt.Sprintf("profiles-%s.tar", time.Now().Format("2006-01-02_15_04")) + } + if cfg.BlobURL == "" { + cfg.BlobURL = os.Getenv("BLOB_URL") + } + if cfg.BlobToken == "" { + cfg.BlobToken = os.Getenv("BLOB_TOKEN") + } + if cfg.LogLevel == "" { + cfg.LogLevel = "debug" + } + + // Set timezone to UTC + os.Setenv("TZ", "UTC") + + var portForwardCmd *exec.Cmd + var err error + + // Ensure cleanup runs on exit + defer func() { + cleanup(cfg, portForwardCmd) + }() + + // Prepare Environment + switch cfg.App { + case "rancher": + cfg.Container = "rancher" + cfg.Namespace = "cattle-system" + setRancherLogLevel(cfg, cfg.LogLevel) + case "cattle-cluster-agent": + cfg.Container = "cluster-register" + cfg.Namespace = "cattle-system" + case "fleet-controller": + cfg.Container = "fleet-controller" + cfg.Namespace = "cattle-fleet-system" + portForwardCmd, err = startPortForward(cfg, 60601, 6060) + if err != nil { + return err + } + case "fleet-agent": + cfg.Container = "fleet-agent" + // Check for local system namespace first + if err := exec.Command("kubectl", "get", "namespace", "cattle-fleet-local-system").Run(); err == nil { + cfg.Namespace = "cattle-fleet-local-system" + } else { + cfg.Namespace = "cattle-fleet-system" + } + portForwardCmd, err = startPortForward(cfg, 60601, 6060) + if err != nil { + return err + } + } + + return collect(cfg) +} + +func collect(cfg Config) error { + tmpDir, err := os.MkdirTemp("", "rancher-profile-") + if err != nil { + return fmt.Errorf("failed to create temp directory: %w", err) + } + techo("Created " + tmpDir) + defer func() { + techo("Removing " + tmpDir) + os.RemoveAll(tmpDir) + }() + + // Timestamps + appendToFile(filepath.Join(tmpDir, "timestamps.txt"), "Start: "+time.Now().Format(time.RFC3339)+"\n") + + // Global Cluster Stats + 
shellExecToFile(filepath.Join(tmpDir, "top-pods.txt"), "kubectl", "top", "pods", "-A") + shellExecToFile(filepath.Join(tmpDir, "top-nodes.txt"), "kubectl", "top", "nodes") + + // Collect App Specific Data + if cfg.App == "rancher" || cfg.App == "cattle-cluster-agent" { + collectRancherLogic(cfg, tmpDir) + } else { + collectFleetLogic(cfg, tmpDir) + } + + // Final Global Stats + techo("Getting leases") + shellExecToFile(filepath.Join(tmpDir, "leases.txt"), "kubectl", "get", "leases", "-n", "kube-system") + + techo("Getting pod details") + shellExecToFile(filepath.Join(tmpDir, "pods-wide.txt"), "kubectl", "get", "pods", "-A", "-o", "wide") + + appendToFile(filepath.Join(tmpDir, "timestamps.txt"), "End: "+time.Now().Format(time.RFC3339)+"\n") + + // Create Tarball + tarName := fmt.Sprintf("%s-profiles-%s.tar.xz", cfg.Prefix, time.Now().Format("2006-01-02_15_04")) + tarPath := filepath.Join(os.TempDir(), tarName) + techo("Creating tarball " + tarName) + + // Using exec for tar to handle XZ compression easily without external go libs + cmd := exec.Command("tar", "cfJ", tarPath, "--directory", tmpDir, ".") + if out, err := cmd.CombinedOutput(); err != nil { + techo("Error creating tarball: " + string(out)) + } else { + handleUploadOrAppend(cfg, tarPath, tarName) + } + + return nil +} + +func collectRancherLogic(cfg Config, tmpDir string) { + pods := getPodNames(cfg) + profiles := strings.Split(cfg.Profiles, ",") + + for _, pod := range pods { + if pod == "" { + continue + } + // Profiles + for _, profile := range profiles { + profile = strings.TrimSpace(profile) + techo(fmt.Sprintf("Getting %s profile for %s", profile, pod)) + url := fmt.Sprintf("http://localhost:6060/debug/pprof/%s", profile) + if profile == "profile" { + url += fmt.Sprintf("?seconds=%d", cfg.Duration) + } + + // For Rancher/Agent we curl FROM INSIDE the pod + outFile := filepath.Join(tmpDir, fmt.Sprintf("%s-%s-%s", pod, profile, time.Now().Format("2006-01-02T15_04_05"))) + execKubectlCurl(cfg, pod, 
url, outFile) + } + + collectCommonLogsAndEvents(cfg, tmpDir, pod) + + // Specific Rancher items + if cfg.App == "rancher" { + techo("Getting rancher-audit-logs for " + pod) + shellExecToFile(filepath.Join(tmpDir, pod+"-audit.log"), "kubectl", "logs", "--since=5m", "-n", cfg.Namespace, pod, "-c", "rancher-audit-log") + + techo("Getting metrics for Rancher") + // Complex bash command inside exec needs careful wrapping + metricsCmd := `curl -s -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -k https://127.0.0.1/metrics` + outFile := filepath.Join(tmpDir, pod+"-metrics.txt") + + cmd := exec.Command("kubectl", "exec", "-n", cfg.Namespace, pod, "-c", cfg.Container, "--", "bash", "-c", metricsCmd) + output, _ := cmd.CombinedOutput() + if err := os.WriteFile(outFile, output, 0644); err != nil { + techo(fmt.Sprintf("Failed to write metrics to %s: %v", outFile, err)) + } + } + } +} + +func collectFleetLogic(cfg Config, tmpDir string) { + pods := getPodNames(cfg) + if len(pods) == 0 { + techo("No pods found for " + cfg.App) + return + } + // Fleet usually targets one leader, we take the first one found + pod := pods[0] + profiles := strings.Split(cfg.Profiles, ",") + + for _, profile := range profiles { + profile = strings.TrimSpace(profile) + techo(fmt.Sprintf("Getting %s profile for %s", profile, pod)) + + url := fmt.Sprintf("http://localhost:60601/debug/pprof/%s", profile) + if profile == "profile" { + url += fmt.Sprintf("?seconds=%d", cfg.Duration) + } + + outFile := filepath.Join(tmpDir, fmt.Sprintf("%s-%s-%s", pod, profile, time.Now().Format("2006-01-02T15_04_05"))) + // For Fleet we curl LOCALHOST via port-forward + execLocalCurl(url, outFile) + } + + collectCommonLogsAndEvents(cfg, tmpDir, pod) +} + +func collectCommonLogsAndEvents(cfg Config, tmpDir string, pod string) { + techo("Getting logs for " + pod) + shellExecToFile(filepath.Join(tmpDir, pod+".log"), "kubectl", "logs", "--since=5m", "-n", cfg.Namespace, pod, "-c", 
cfg.Container) + + techo("Getting previous logs for " + pod) + shellExecToFile(filepath.Join(tmpDir, pod+"-previous.log"), "kubectl", "logs", "-n", cfg.Namespace, pod, "-c", cfg.Container, "--previous=true") + + techo("Getting events for " + pod) + shellExecToFile(filepath.Join(tmpDir, pod+"-events.txt"), "kubectl", "get", "event", "--namespace", cfg.Namespace, "--field-selector", "involvedObject.name="+pod) + + techo("Getting describe for " + pod) + shellExecToFile(filepath.Join(tmpDir, pod+"-describe.txt"), "kubectl", "describe", "pod", pod, "-n", cfg.Namespace) +} + +// Helpers + +func getPodNames(cfg Config) []string { + out, err := exec.Command("kubectl", "-n", cfg.Namespace, "get", "pods", "-l", "app="+cfg.App, "--no-headers", "-o", "custom-columns=name:.metadata.name").Output() + if err != nil { + techo("Error getting pods: " + err.Error()) + return []string{} + } + lines := strings.Split(string(out), "\n") + var pods []string + for _, l := range lines { + if strings.TrimSpace(l) != "" { + pods = append(pods, strings.TrimSpace(l)) + } + } + return pods +} + +func execKubectlCurl(cfg Config, pod, url, outFile string) { + // kubectl exec -n NS pod -c container -- curl -s URL + cmd := exec.Command("kubectl", "exec", "-n", cfg.Namespace, pod, "-c", cfg.Container, "--", "curl", "-s", url) + out, err := cmd.CombinedOutput() + if err != nil { + techo("Error curling " + url + ": " + err.Error()) + } + if err := os.WriteFile(outFile, out, 0644); err != nil { + techo("Error writing to " + outFile + ": " + err.Error()) + } +} + +func execLocalCurl(url, outFile string) { + resp, err := http.Get(url) + if err != nil { + techo("Error fetching " + url + ": " + err.Error()) + return + } + defer resp.Body.Close() + + out, err := os.Create(outFile) + if err != nil { + techo("Error creating " + outFile + ": " + err.Error()) + return + } + defer out.Close() + if _, err := io.Copy(out, resp.Body); err != nil { + techo("Error writing response body to " + outFile + ": " + 
err.Error()) + } +} + +func shellExecToFile(filename string, name string, args ...string) { + cmd := exec.Command(name, args...) + out, _ := cmd.CombinedOutput() // Ignore error, just write what we got + if err := os.WriteFile(filename, out, 0644); err != nil { + techo("Error writing to " + filename + ": " + err.Error()) + } +} + +func appendToFile(filename string, text string) { + f, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return + } + defer f.Close() + f.WriteString(text) +} + +func startPortForward(cfg Config, localPort, remotePort int) (*exec.Cmd, error) { + pods := getPodNames(cfg) + if len(pods) == 0 { + techo("No pods found to port-forward") + return nil, fmt.Errorf("no pods found") + } + pod := pods[0] + + cmdStr := fmt.Sprintf("%d:%d", localPort, remotePort) + cmd := exec.Command("kubectl", "port-forward", "-n", cfg.Namespace, pod, cmdStr) + + err := cmd.Start() + if err != nil { + techo("Failed to start port-forward: " + err.Error()) + return nil, err + } + + // Give it a moment to establish + time.Sleep(2 * time.Second) + techo(fmt.Sprintf("Port forward started for %s %s", pod, cmdStr)) + return cmd, nil +} + +func setRancherLogLevel(cfg Config, level string) { + pods := getPodNames(cfg) + for _, pod := range pods { + techo(fmt.Sprintf("Setting %s logging to %s", pod, level)) + exec.Command("kubectl", "--namespace", "cattle-system", "exec", pod, "-c", "rancher", "--", "loglevel", "--set", level).Run() + } +} + +func handleUploadOrAppend(cfg Config, srcPath, srcName string) { + if cfg.BlobURL != "" { + techo("Uploading " + srcName) + // Use curl for upload to avoid complex Go http client setup for SAS tokens in a single file + fullURL := fmt.Sprintf("%s/%s?%s", cfg.BlobURL, srcName, cfg.BlobToken) + cmd := exec.Command("curl", "-H", "x-ms-blob-type: BlockBlob", "--upload-file", srcPath, fullURL) + if out, err := cmd.CombinedOutput(); err != nil { + techo("Upload failed: " + string(out)) + } + } else { + 
// Note: 'tar rf' appends. 'tar' needs to be available. + techo("Appending to " + cfg.MainFilename) + exec.Command("tar", "rf", cfg.MainFilename, srcPath).Run() + if err := os.Remove(srcPath); err != nil { + techo("Failed to remove " + srcPath + ": " + err.Error()) + } + } +} + +func cleanup(cfg Config, portForwardCmd *exec.Cmd) { + if cfg.App == "rancher" { + techo("Resetting Rancher log level to info") + setRancherLogLevel(cfg, "info") + } else if (cfg.App == "fleet-controller" || cfg.App == "fleet-agent") && portForwardCmd != nil { + if err := portForwardCmd.Process.Kill(); err != nil { + techo("Error killing port-forward: " + err.Error()) + } + if err := portForwardCmd.Wait(); err != nil { + techo("Error waiting for port-forward to exit: " + err.Error()) + } + techo("Killing port-forward") + } +} + +func techo(msg string) { + fmt.Printf("%s: %s\n", time.Now().Format("2006-01-02 15:04:05"), msg) +} \ No newline at end of file diff --git a/internal/summarize/countresources/count_resources.go b/internal/summarize/countresources/count_resources.go new file mode 100644 index 000000000..923fae7b7 --- /dev/null +++ b/internal/summarize/countresources/count_resources.go @@ -0,0 +1,107 @@ +package countresources + +import ( + "bufio" + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +type Config struct { + Kubeconfig string +} + +func Run(ctx context.Context, cfg Config) error { + // Set the KUBECONFIG environment variable for the current process + if cfg.Kubeconfig != "" { + os.Setenv("KUBECONFIG", cfg.Kubeconfig) + } + + // Prepare Directory Paths + now := time.Now() + + // Parent directory: counts-MM-DD + parentDirDate := now.Format("01-02") + parentDir := fmt.Sprintf("counts-%s", parentDirDate) + + // Output directory: cr-outputs-MM-DD-HH-MM + subDirDate := now.Format("01-02-15-04") + outputDir := filepath.Join(parentDir, fmt.Sprintf("cr-outputs-%s", subDirDate)) + + // Create directories + if err := os.MkdirAll(outputDir, 
0755); err != nil { + return fmt.Errorf("failed to create directories: %w", err) + } + + // Prepare Output Filename + baseName := filepath.Base(cfg.Kubeconfig) + if baseName == "." || baseName == "" { + baseName = "kubeconfig" + } + // Remove the file extension (e.g., .yaml) + cleanName := strings.TrimSuffix(baseName, filepath.Ext(baseName)) + + // Timestamp: MM-DD-HH-MM-SS + fileDate := now.Format("01-02-15-04-05") + outputFilename := fmt.Sprintf("%s-%s.txt", cleanName, fileDate) + outputPath := filepath.Join(outputDir, outputFilename) + + // Create the output file + outFile, err := os.Create(outputPath) + if err != nil { + return fmt.Errorf("failed to create output file: %w", err) + } + defer outFile.Close() + + fmt.Printf("Using Kubeconfig: %s\n", cfg.Kubeconfig) + fmt.Printf("Writing report to: %s\n", outputPath) + + // Get List of Resources + cmd := exec.CommandContext(ctx, "kubectl", "api-resources", "--no-headers", "-o", "wide") + output, err := cmd.Output() + if err != nil { + return fmt.Errorf("failed to get api-resources: %w", err) + } + + scanner := bufio.NewScanner(bytes.NewReader(output)) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + if len(fields) == 0 { + continue + } + resource := fields[0] + + // Count Resources + countCmd := exec.CommandContext(ctx, "kubectl", "get", resource, "-A", "--no-headers", "--ignore-not-found") + countOutput, err := countCmd.Output() + + var count int + if err == nil { + // Count non-empty lines + lines := bytes.Split(countOutput, []byte{'\n'}) + for _, l := range lines { + if len(bytes.TrimSpace(l)) > 0 { + count++ + } + } + } + + // Write to File + lineOutput := fmt.Sprintf(" %s : %d\n", resource, count) + if _, err := outFile.WriteString(lineOutput); err != nil { + fmt.Printf("Error writing to file: %v\n", err) + } + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("error reading api-resources output: %w", err) + } + + return nil +} \ No newline at end of file diff 
--git a/internal/summarize/exportmetrics/export_metrics.go b/internal/summarize/exportmetrics/export_metrics.go new file mode 100644 index 000000000..7567c016d --- /dev/null +++ b/internal/summarize/exportmetrics/export_metrics.go @@ -0,0 +1,192 @@ +package exportmetrics + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "time" +) + +const ( + PromTimeFormat = "2006-01-02T15:04:05Z" + FilenameTimeFormat = "2006-01-02T15-04-05" + Namespace = "cattle-monitoring-system" + PodName = "mimirtool" +) + +type Config struct { + Kubeconfig string + Selector string + FromSeconds int64 + ToSeconds int64 + OffsetSeconds int64 +} + +func Run(ctx context.Context, cfg Config) error { + // Ensure kubeconfig is set for subprocesses + if cfg.Kubeconfig != "" { + os.Setenv("KUBECONFIG", cfg.Kubeconfig) + } + + // Set defaults if not provided + if cfg.Selector == "" { + cfg.Selector = `{__name__!=""}` + } + if cfg.OffsetSeconds == 0 { + cfg.OffsetSeconds = 3600 + } + if cfg.ToSeconds == 0 { + cfg.ToSeconds = time.Now().Unix() + } + if cfg.FromSeconds == 0 { + cfg.FromSeconds = time.Now().Add(-1 * time.Hour).Unix() + } + + // Logic for limiting offset based on Prometheus memory + if cfg.OffsetSeconds > 7200 { + cfg.OffsetSeconds = 7200 + out, err := exec.CommandContext(ctx, "kubectl", "get", "statefulsets", "-n", Namespace, "prometheus-rancher-monitoring-prometheus", "-o", "jsonpath={.spec.template.spec.containers[0].resources.limits.memory}").Output() + if err == nil { + memStr := strings.TrimSuffix(string(out), "Mi") + if mem, err := strconv.Atoi(memStr); err == nil && mem < 3001 { + cfg.OffsetSeconds = 3600 + } + } + } + + fmt.Printf("Starting export-metrics...\n\n") + fmt.Printf(" Prometheus query: %s\n", cfg.Selector) + fmt.Printf(" Query start: %s\n", time.Unix(cfg.FromSeconds, 0).UTC().Format(PromTimeFormat)) + fmt.Printf(" Query end: %s\n", time.Unix(cfg.ToSeconds, 0).UTC().Format(PromTimeFormat)) + + if cfg.OffsetSeconds > 3600 { + 
fmt.Printf(" OFFSET: %d\n\n", cfg.OffsetSeconds) + } else { + fmt.Printf("\n") + } + + // 1. Confirm access + if err := runCmd(ctx, "kubectl", "get", "all", "-A"); err != nil { + return fmt.Errorf("failed to access cluster: %w", err) + } + fmt.Println(" - Confirm kubeconfig access \033[32mPASS\033[0m") + + // 2. Cleanup old pod + runCmd(ctx, "kubectl", "delete", "pod", "-n", Namespace, PodName) + + // 3. Apply mimirtool + yamlPath := "../summarize-tools/export-metrics/mimirtool.yaml" + + if err := runCmd(ctx, "kubectl", "apply", "-f", yamlPath); err != nil { + return fmt.Errorf("failed to apply mimirtool.yaml: %w", err) + } + + // Wait for pod to be ready + fmt.Println("Waiting for mimirtool pod to be ready...") + time.Sleep(10 * time.Second) + if err := runCmd(ctx, "kubectl", "wait", "--for=condition=Ready", "pod", "-n", Namespace, PodName, "--timeout=60s"); err != nil { + return fmt.Errorf("mimirtool pod not ready: %w", err) + } + fmt.Println(" - Confirm mimirtool pod is running \033[32mPASS\033[0m") + + // 4. Setup local directory + ts1 := time.Now().Format("2006-01-02") + kubeName := "cluster" + if cfg.Kubeconfig != "" { + kubeName = strings.Split(filepath.Base(cfg.Kubeconfig), ".")[0] + } + exportDir := fmt.Sprintf("metrics-%s-%s", kubeName, ts1) + if err := os.MkdirAll(exportDir, 0755); err != nil { + return fmt.Errorf("failed to create export directory %q: %w", exportDir, err) + } + + // 5. 
Loop through time ranges + currentTo := cfg.ToSeconds + for currentTo > cfg.FromSeconds { + offset := cfg.OffsetSeconds + if (currentTo - cfg.FromSeconds) < offset { + offset = currentTo - cfg.FromSeconds + } + + rangeStart := currentTo - offset + fromStr := time.Unix(rangeStart, 0).UTC().Format(PromTimeFormat) + toStr := time.Unix(currentTo, 0).UTC().Format(PromTimeFormat) + ts2 := time.Unix(rangeStart, 0).UTC().Format(FilenameTimeFormat) + + fmt.Printf("Exporting range: %s to %s\n", fromStr, toStr) + + var err error + for attempt := 1; attempt <= 3; attempt++ { + if attempt > 1 { + fmt.Printf(" - Retrying... (Attempt %d/3)\n", attempt) + // Cleanup before retry + runCmd(ctx, "kubectl", "exec", "-n", Namespace, PodName, "--", "rm", "-rf", "prometheus-export") + time.Sleep(2 * time.Second) + } + + // Remote Read + err = runCmd(ctx, "kubectl", "exec", "-n", Namespace, PodName, "--", "mimirtool", "remote-read", "export", + "--tsdb-path", "./prometheus-export", + "--address", "http://rancher-monitoring-prometheus:9090", + "--remote-read-path", "/api/v1/read", + "--to="+toStr, "--from="+fromStr, "--selector", cfg.Selector) + if err != nil { + fmt.Printf(" - Remote read failed: %v\n", err) + continue + } + + // Tar in pod + err = runCmd(ctx, "kubectl", "exec", "-n", Namespace, PodName, "--", "tar", "zcf", "/tmp/prometheus-export.tar.gz", "./prometheus-export") + if err != nil { + fmt.Printf(" - Tar failed: %v\n", err) + continue + } + + // Copy locally + localTar := filepath.Join(exportDir, fmt.Sprintf("prometheus-export-%s.tar.gz", ts2)) + err = runCmd(ctx, "kubectl", "-n", Namespace, "cp", PodName+":/tmp/prometheus-export.tar.gz", localTar) + if err != nil { + fmt.Printf(" - Copy failed: %v\n", err) + continue + } + + // Success + err = nil + break + } + + if err != nil { + fmt.Printf("Failed to export range %s to %s after 3 attempts.\n", fromStr, toStr) + } + + // Cleanup pod files + runCmd(ctx, "kubectl", "exec", "-n", Namespace, PodName, "--", "rm", "-rf", 
"prometheus-export") + + currentTo -= offset + // Small pause between exports to reduce load and avoid potential issues with rapid queries + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(2 * time.Second): + } + } + + // 6. Cleanup + runCmd(ctx, "kubectl", "delete", "pod", "-n", Namespace, PodName) + + finalPath, _ := filepath.Abs(exportDir) + fmt.Printf("\n\033[32mMetrics export complete!\033[0m\n") + fmt.Printf("Metrics saved to: %s\n", finalPath) + + return nil +} + +func runCmd(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) + cmd.Stderr = os.Stderr + return cmd.Run() +} \ No newline at end of file diff --git a/summarize-tools/collect-profile/build_collect_profile.sh b/summarize-tools/collect-profile/build_collect_profile.sh new file mode 100755 index 000000000..73e807444 --- /dev/null +++ b/summarize-tools/collect-profile/build_collect_profile.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail +[[ "${DEBUG:-}" == "1" ]] && set -x + +# Path to the Go source code for the collector +COLLECTOR_SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Output binary path (place binary next to this script) +OUTPUT_BINARY="${COLLECTOR_SRC_DIR}/collect-profile" + +echo "Building collect-profile..." +echo "Source directory: ${COLLECTOR_SRC_DIR}" +echo "Output binary: ${OUTPUT_BINARY}" + +# Ensure we are in the correct directory to resolve modules +cd "${COLLECTOR_SRC_DIR}" + +# Tidy and build the Go application +go mod tidy +go build -o "${OUTPUT_BINARY}" . + +echo "Build complete." 
diff --git a/summarize-tools/collect-profile/main.go b/summarize-tools/collect-profile/main.go new file mode 100644 index 000000000..e08d3f558 --- /dev/null +++ b/summarize-tools/collect-profile/main.go @@ -0,0 +1,26 @@ +package main + +import ( + "context" + "flag" + "log" + "time" + + "github.com/rancher/dartboard/internal/summarize/collectprofiles" +) + +func main() { + var cfg collectprofile.Config + flag.StringVar(&cfg.App, "a", "rancher", "Application: rancher, cattle-cluster-agent, fleet-controller, or fleet-agent") + flag.StringVar(&cfg.Profiles, "p", "goroutine,heap,profile", "Profiles to be collected (comma separated)") + flag.IntVar(&cfg.Duration, "t", 30, "Time of CPU profile collections (seconds)") + flag.StringVar(&cfg.LogLevel, "l", "debug", "Log level of the Rancher pods: debug or trace") + flag.Parse() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + if err := collectprofile.Run(ctx, cfg); err != nil { + log.Fatalf("collect-profile: %v", err) + } +} \ No newline at end of file diff --git a/summarize-tools/count-resources/build_cr.sh b/summarize-tools/count-resources/build_cr.sh new file mode 100755 index 000000000..1532d0260 --- /dev/null +++ b/summarize-tools/count-resources/build_cr.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail +[[ "${DEBUG:-}" == "1" ]] && set -x + +# Path to the Go source code for the resource-counts tool +CR_SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Output binary path (place binary next to this script) +OUTPUT_BINARY="${CR_SRC_DIR}/count-resources" + +echo "Building count-resources..." +echo "Source directory: ${CR_SRC_DIR}" +echo "Output binary: ${OUTPUT_BINARY}" + +# Ensure we are in the correct directory to resolve modules +cd "${CR_SRC_DIR}" + +# Tidy and build the Go application +go mod tidy +go build -o "${OUTPUT_BINARY}" . + +echo "Build complete." 
diff --git a/summarize-tools/count-resources/main.go b/summarize-tools/count-resources/main.go new file mode 100644 index 000000000..fa0452fb3 --- /dev/null +++ b/summarize-tools/count-resources/main.go @@ -0,0 +1,40 @@ +package main + +import ( + "context" + "flag" + "log" + "os" + "time" + + "github.com/rancher/dartboard/internal/summarize/countresources" +) + +func main() { + var kubeconfigPath string + flag.StringVar(&kubeconfigPath, "kubeconfig", "", "Path to kubeconfig file") + flag.Parse() + + if kubeconfigPath == "" { + if flag.NArg() > 0 { + kubeconfigPath = flag.Arg(0) + } else { + kubeconfigPath = os.Getenv("KUBECONFIG") + } + } + + if kubeconfigPath == "" { + log.Fatal("Error: Kubeconfig not found. Please set KUBECONFIG env var or pass as argument.") + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() + + cfg := countresources.Config{ + Kubeconfig: kubeconfigPath, + } + + if err := countresources.Run(ctx, cfg); err != nil { + log.Fatalf("count-resources: %v", err) + } +} \ No newline at end of file diff --git a/summarize-tools/export-metrics/build_export_metrics.sh b/summarize-tools/export-metrics/build_export_metrics.sh new file mode 100755 index 000000000..50ef0d517 --- /dev/null +++ b/summarize-tools/export-metrics/build_export_metrics.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail +[[ "${DEBUG:-}" == "1" ]] && set -x + +# Path to the Go source code for the exporter +EXPORTER_SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Output binary path (place binary next to this script) +OUTPUT_BINARY="${EXPORTER_SRC_DIR}/export-metrics" + +echo "Building export-metrics..." +echo "Source directory: ${EXPORTER_SRC_DIR}" +echo "Output binary: ${OUTPUT_BINARY}" + +# Ensure we are in the correct directory to resolve modules +cd "${EXPORTER_SRC_DIR}" + +# Tidy and build the Go application +go mod tidy +go build -o "${OUTPUT_BINARY}" . + +echo "Build complete." 
diff --git a/summarize-tools/export-metrics/main.go b/summarize-tools/export-metrics/main.go new file mode 100644 index 000000000..1ec9f31aa --- /dev/null +++ b/summarize-tools/export-metrics/main.go @@ -0,0 +1,68 @@ +package main + +import ( + "context" + "flag" + "log" + "os" + "time" + + "github.com/rancher/dartboard/internal/summarize/exportmetrics" +) + +func main() { + var kubeconfigPath string + var selector string + var startStr string + var endStr string + var offset int64 + + flag.StringVar(&kubeconfigPath, "kubeconfig", "", "Path to kubeconfig file") + flag.StringVar(&selector, "selector", "", "Prometheus selector query") + flag.StringVar(&startStr, "start", "", "Start time (RFC3339: 2006-01-02T15:04:05Z)") + flag.StringVar(&endStr, "end", "", "End time (RFC3339: 2006-01-02T15:04:05Z)") + flag.Int64Var(&offset, "offset", 0, "Offset in seconds") + flag.Parse() + + if kubeconfigPath == "" { + if flag.NArg() > 0 { + kubeconfigPath = flag.Arg(0) + } else { + kubeconfigPath = os.Getenv("KUBECONFIG") + } + } + + if kubeconfigPath == "" { + log.Fatal("Error: Kubeconfig not found. 
Please set KUBECONFIG env var or pass as argument.") + } + + var fromSeconds, toSeconds int64 + if startStr != "" { + t, err := time.Parse(exportmetrics.PromTimeFormat, startStr) + if err != nil { + log.Fatalf("Invalid start time format: %v", err) + } + fromSeconds = t.Unix() + } + if endStr != "" { + t, err := time.Parse(exportmetrics.PromTimeFormat, endStr) + if err != nil { + log.Fatalf("Invalid end time format: %v", err) + } + toSeconds = t.Unix() + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + cfg := exportmetrics.Config{ + Kubeconfig: kubeconfigPath, + Selector: selector, + FromSeconds: fromSeconds, + ToSeconds: toSeconds, + OffsetSeconds: offset, + } + + if err := exportmetrics.Run(ctx, cfg); err != nil { + log.Fatalf("export-metrics: %v", err) + } +} diff --git a/summarize-tools/export-metrics/mimirtool.yaml b/summarize-tools/export-metrics/mimirtool.yaml new file mode 100644 index 000000000..137a996a0 --- /dev/null +++ b/summarize-tools/export-metrics/mimirtool.yaml @@ -0,0 +1,20 @@ +### mimirtool.yaml +apiVersion: v1 +kind: Pod +metadata: + name: mimirtool + namespace: cattle-monitoring-system + labels: + app: mimirtool +spec: + containers: + - name: mimirtool + image: grafana/mimirtool:2.13.0 + command: ["/bin/sh", "-c"] + args: + - | + echo "Mimirtool pod is running. Use 'kubectl exec' to run commands." + # Keep the container running + while true; do + sleep 30 + done \ No newline at end of file