diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go index 4d4a285d..98ac9a45 100644 --- a/cadctl/cmd/investigate/investigate.go +++ b/cadctl/cmd/investigate/investigate.go @@ -17,6 +17,7 @@ limitations under the License. package investigate import ( + "context" "errors" "fmt" "os" @@ -24,7 +25,8 @@ import ( "strings" "github.com/openshift/configuration-anomaly-detection/pkg/backplane" - "github.com/openshift/configuration-anomaly-detection/pkg/investigations" + "github.com/openshift/configuration-anomaly-detection/pkg/executor" + investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck" @@ -200,14 +202,26 @@ func run(_ *cobra.Command, _ []string) error { if err != nil { return err } + // FIXME: Once all migrations are converted this can be removed. updateMetrics(alertInvestigation.Name(), &result) + // Execute ccam actions if any + if err := executeActions(builder, &result, ocmClient, pdClient, "ccam"); err != nil { + return fmt.Errorf("failed to execute ccam actions: %w", err) + } + logging.Infof("Starting investigation for %s", alertInvestigation.Name()) result, err = alertInvestigation.Run(builder) if err != nil { return err } updateMetrics(alertInvestigation.Name(), &result) + + // Execute investigation actions if any + if err := executeActions(builder, &result, ocmClient, pdClient, alertInvestigation.Name()); err != nil { + return fmt.Errorf("failed to execute %s actions: %w", alertInvestigation.Name(), err) + } + return updateIncidentTitle(pdClient) } @@ -289,3 +303,52 @@ func updateIncidentTitle(pdClient *pagerduty.SdkClient) error { } return nil } + +// executeActions executes any actions returned by an investigation +func executeActions( + builder investigation.ResourceBuilder, + result *investigation.InvestigationResult, + ocmClient *ocm.SdkClient, + pdClient *pagerduty.SdkClient, + investigationName string, +) error { + // If no actions, return early + if len(result.Actions) == 0 { + logging.Debug("No actions to execute") + return nil + } + + // Build resources to get cluster and notes + resources, err := builder.Build() + if err != nil { + return fmt.Errorf("failed to build resources for action execution: %w", err) + } + + // Create executor + exec := executor.NewExecutor(ocmClient, pdClient, logging.RawLogger) + + // Execute actions with default options + input := &executor.ExecutorInput{ + InvestigationName: investigationName, + Actions: result.Actions, + Cluster: resources.Cluster, + Notes: resources.Notes, + Options: executor.ExecutionOptions{ + DryRun: false, + StopOnError: false, // Continue executing actions even if one fails + MaxRetries: 3, + ConcurrentActions: true, // Use concurrent execution for better performance + }, + } + + logging.Infof("Executing %d actions for %s", len(result.Actions), investigationName) + if err := exec.Execute(context.Background(), input); err != nil { + // Log the error but don't fail the investigation + // This matches the current behavior where we log failures but continue + logging.Errorf("Action execution failed for %s: %v", investigationName, err) + return err + } + + logging.Infof("Successfully executed all actions for %s", investigationName) + return nil +} diff --git a/docs/architecture/builder-pattern.md b/docs/architecture/builder-pattern.md new file mode 100644 index 00000000..96aa386e --- /dev/null +++ b/docs/architecture/builder-pattern.md @@ -0,0 +1,707 @@ +# Builder Pattern Architecture + +## Overview + +The Configuration Anomaly Detection (CAD) codebase uses the Builder pattern extensively to provide clean, type-safe, and composable interfaces for constructing complex objects. This document describes our current usage, conventions, and guidelines for working with and extending builders. + +## Philosophy + +Builders in CAD serve several key purposes: + +1. **Progressive Construction**: Build complex objects step-by-step with clear intent +2. **Optional Dependencies**: Request only the resources you need +3. **Error Handling**: Centralize resource initialization and error handling +4. **Immutability**: Build once, use many times +5. **Fluent Interface**: Chain method calls for readability + +## Current Builder Implementations + +### 1. ResourceBuilder + +**Location**: `pkg/investigations/investigation/investigation.go` + +**Purpose**: Constructs investigation resources (cluster info, clients, notes) on-demand. + +#### Interface + +```go +type ResourceBuilder interface { + WithCluster() ResourceBuilder + WithClusterDeployment() ResourceBuilder + WithAwsClient() ResourceBuilder + WithK8sClient() ResourceBuilder + WithNotes() ResourceBuilder + Build() (*Resources, error) +} +``` + +#### Key Characteristics + +- **Lazy Loading**: Resources are only fetched when explicitly requested +- **Dependency Chain**: Some resources depend on others (e.g., AWS client requires Cluster) +- **Caching**: Built resources are cached to avoid duplicate API calls +- **Partial Success**: Returns whatever was built successfully, even on error + +#### Usage Example + +```go +func (c *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { + // Request only what you need + r, err := rb. + WithCluster(). + WithAwsClient(). + WithNotes(). + Build() + + if err != nil { + // Handle errors - r may contain partial results + return investigation.InvestigationResult{}, err + } + + // Use resources + cluster := r.Cluster + awsClient := r.AwsClient + notes := r.Notes + + // ... investigation logic ... +} +``` + +#### Implementation Details + +```go +type ResourceBuilderT struct { + // Flags indicating what to build + buildCluster bool + buildClusterDeployment bool + buildAwsClient bool + buildK8sClient bool + buildNotes bool + buildLogger bool + + // Input parameters + clusterId string + name string + logLevel string + pipelineName string + ocmClient *ocm.SdkClient + + // Cached results + builtResources *Resources + buildErr error +} +``` + +**Key Pattern**: Each `WithX()` method sets a boolean flag and returns the builder for chaining. The `Build()` method checks flags and constructs only requested resources. + +#### Dependency Resolution + +Resources have dependencies that are automatically satisfied: + +- `WithAwsClient()` → automatically calls `WithCluster()` +- `WithClusterDeployment()` → automatically calls `WithCluster()` +- `WithK8sClient()` → no automatic dependencies + +#### Error Handling Strategy + +The ResourceBuilder uses **typed errors** to distinguish failure modes: + +```go +// Typed errors for different resource failures +type ClusterNotFoundError struct { + ClusterID string + Err error +} + +type AWSClientError struct { + ClusterID string + Err error +} + +type K8SClientError struct { + ClusterID string + Err error +} + +type ClusterDeploymentNotFoundError struct { + ClusterID string + Err error +} +``` + +Investigations can use `errors.As()` to handle specific failures: + +```go +resources, err := rb.WithAwsClient().Build() + +awsClientErr := &investigation.AWSClientError{} +if errors.As(err, awsClientErr) { + // Handle AWS-specific failure + // Note: resources.Cluster may still be valid! +} +``` + +### 2. OCM LimitedSupportReasonBuilder + +**Location**: `pkg/ocm/ocm.go` + +**Purpose**: Constructs OCM SDK limited support reason objects. + +#### Implementation + +```go +func newLimitedSupportReasonBuilder(ls *LimitedSupportReason) *cmv1.LimitedSupportReasonBuilder { + builder := cmv1.NewLimitedSupportReason() + builder.Summary(ls.Summary) + builder.Details(ls.Details) + builder.DetectionType(cmv1.DetectionTypeManual) + return builder +} +``` + +**Note**: This is a thin wrapper around the OCM SDK's builder. We use it to ensure consistent defaults (like `DetectionType`). + +### 3. OCM ServiceLog Builder Pattern + +**Location**: `pkg/ocm/ocm.go` + +**Purpose**: Constructs OCM SDK service log entries. + +#### Implementation + +```go +func (c *SdkClient) PostServiceLog(cluster *cmv1.Cluster, sl *ServiceLog) error { + builder := &servicelogsv1.LogEntryBuilder{} + builder.Severity(servicelogsv1.Severity(sl.Severity)) + builder.ServiceName(sl.ServiceName) + builder.Summary(sl.Summary) + builder.Description(sl.Description) + builder.InternalOnly(sl.InternalOnly) + builder.ClusterID(cluster.ID()) + + le, err := builder.Build() + if err != nil { + return fmt.Errorf("could not create post request (SL): %w", err) + } + + // Send the built object + request := c.conn.ServiceLogs().V1().ClusterLogs().Add() + request = request.Body(le) + _, err = request.Send() + return err +} +``` + +**Pattern**: Convert from our internal type to OCM SDK builder, set all fields, build, then send. + +## Builder Pattern Conventions + +### Naming Conventions + +1. **Builder Types**: Suffix with `Builder` (e.g., `ResourceBuilder`, `ActionBuilder`) +2. **Builder Methods**: + - Prefix with `With` for adding optional components: `WithCluster()`, `WithAwsClient()` + - Prefix with `Add` for adding items to collections: `AddAction()`, `AddDetail()` + - Prefix with `Set` for required fields (rare, prefer constructor args) +3. **Finalization**: Always use `Build()` to get the final object + +### Method Signatures + +All builder methods should follow these patterns: + +#### Fluent Chaining Pattern + +```go +// Return the builder for chaining +func (b *Builder) WithX() *Builder { + b.field = value + return b +} +``` + +#### Adding Items Pattern + +```go +// Return the builder for chaining +func (b *Builder) AddItem(item Item) *Builder { + b.items = append(b.items, item) + return b +} +``` + +#### Build Pattern + +```go +// Return the constructed object and any error +func (b *Builder) Build() (Result, error) { + // Validate + if err := b.validate(); err != nil { + return Result{}, err + } + + // Construct + return b.construct(), nil +} +``` + +### Constructor Pattern + +Always provide a constructor function for builders: + +```go +// NewXBuilder creates a new builder with required parameters +func NewXBuilder(required1 string, required2 int) *XBuilder { + return &XBuilder{ + required1: required1, + required2: required2, + // Initialize optional fields with sensible defaults + optionalField: defaultValue, + } +} +``` + +**Rationale**: Required parameters go in constructor, optional ones use `WithX()` methods. + +## Extending Builders + +### Adding a New Resource to ResourceBuilder + +**Scenario**: You want to add a new client type (e.g., BackplaneReportsClient) to ResourceBuilder. + +#### Step 1: Add to Resources Struct + +```go +type Resources struct { + Name string + Cluster *cmv1.Cluster + ClusterDeployment *hivev1.ClusterDeployment + AwsClient aws.Client + K8sClient k8sclient.Client + OcmClient ocm.Client + PdClient pagerduty.Client + Notes *notewriter.NoteWriter + + // NEW: Add your resource + BackplaneReportsClient reports.Client +} +``` + +#### Step 2: Add Flag to Builder Struct + +```go +type ResourceBuilderT struct { + buildCluster bool + buildClusterDeployment bool + buildAwsClient bool + buildK8sClient bool + buildNotes bool + buildLogger bool + + // NEW: Add your flag + buildBackplaneReportsClient bool + + // ... other fields ... +} +``` + +#### Step 3: Add WithX Method + +```go +func (r *ResourceBuilderT) WithBackplaneReportsClient() ResourceBuilder { + r.buildBackplaneReportsClient = true + + // If your resource depends on others, call them: + // r.WithCluster() + + return r +} +``` + +#### Step 4: Add Build Logic + +```go +func (r *ResourceBuilderT) Build() (*Resources, error) { + if r.buildErr != nil { + return r.builtResources, r.buildErr + } + + // ... existing build logic ... + + // NEW: Add your build logic + if r.buildBackplaneReportsClient && r.builtResources.BackplaneReportsClient == nil { + r.builtResources.BackplaneReportsClient, err = reports.NewClient(r.builtResources.Cluster.ID()) + if err != nil { + r.buildErr = BackplaneReportsClientError{ClusterID: r.clusterId, Err: err} + return r.builtResources, r.buildErr + } + } + + return r.builtResources, nil +} +``` + +#### Step 5: Add Typed Error (Optional but Recommended) + +```go +type BackplaneReportsClientError struct { + ClusterID string + Err error +} + +func (e BackplaneReportsClientError) Error() string { + return fmt.Sprintf("failed to create backplane reports client for cluster %s: %v", e.ClusterID, e.Err) +} + +func (e BackplaneReportsClientError) Unwrap() error { + return e.Err +} +``` + +### Creating a New Builder + +**Scenario**: You need a builder for constructing investigation results. + +#### Step 1: Define the Result Type + +```go +type InvestigationResult struct { + Findings *InvestigationFindings + Actions []Action + Outcome InvestigationOutcome +} +``` + +#### Step 2: Create Builder Struct + +```go +type InvestigationResultBuilder struct { + result *InvestigationResult +} +``` + +#### Step 3: Add Constructor + +```go +func NewResultBuilder() *InvestigationResultBuilder { + return &InvestigationResultBuilder{ + result: &InvestigationResult{ + Findings: &InvestigationFindings{ + MetricsLabels: make(map[string]string), + }, + Actions: []Action{}, + Outcome: OutcomeContinue, // sensible default + }, + } +} +``` + +#### Step 4: Add Fluent Methods + +```go +// Simple field setter +func (b *InvestigationResultBuilder) WithSummary(summary string) *InvestigationResultBuilder { + b.result.Findings.Summary = summary + return b +} + +// Adding to collection +func (b *InvestigationResultBuilder) AddAction(action Action) *InvestigationResultBuilder { + b.result.Actions = append(b.result.Actions, action) + return b +} + +// Convenience wrapper +func (b *InvestigationResultBuilder) AddServiceLog(sl *ocm.ServiceLog, reason string) *InvestigationResultBuilder { + return b.AddAction(&ServiceLogAction{ + ServiceLog: sl, + Reason: reason, + }) +} +``` + +#### Step 5: Add Build Method + +```go +func (b *InvestigationResultBuilder) Build() InvestigationResult { + // Optionally validate + // if b.result.Findings.Summary == "" { + // panic("Summary is required") // or return error + // } + + // Return value (not pointer) to prevent mutation after build + return *b.result +} +``` + +## Best Practices + +### 1. Return Values, Not Pointers from Build() + +**Good**: +```go +func (b *Builder) Build() InvestigationResult { + return *b.result // Return value +} +``` + +**Why**: Prevents accidental mutation of the built object. Once built, it's immutable. + +**Exception**: When the built object is inherently a pointer (like `*Resources`), return the pointer. + +### 2. Initialize Collections in Constructor + +**Good**: +```go +func NewBuilder() *Builder { + return &Builder{ + items: []Item{}, // Initialize to empty slice, not nil + labels: make(map[string]string), + } +} +``` + +**Why**: Prevents nil pointer panics when appending/setting. + +### 3. Use Typed Errors for Build Failures + +**Good**: +```go +type ClusterNotFoundError struct { + ClusterID string + Err error +} + +func (r *ResourceBuilderT) Build() (*Resources, error) { + cluster, err := r.ocmClient.GetClusterInfo(r.clusterId) + if err != nil { + return r.builtResources, ClusterNotFoundError{ + ClusterID: r.clusterId, + Err: err, + } + } +} +``` + +**Why**: Allows callers to handle different error types differently. + +### 4. Provide Sensible Defaults + +**Good**: +```go +func NewResultBuilder() *InvestigationResultBuilder { + return &InvestigationResultBuilder{ + result: &InvestigationResult{ + Outcome: OutcomeContinue, // Default to continue + Severity: FindingSeverityInfo, // Default severity + }, + } +} +``` + +**Why**: Most use cases work with defaults; explicit overrides only when needed. + +### 5. Document Dependencies + +If `WithX()` automatically calls `WithY()`, document it: + +```go +// WithAwsClient requests an AWS client for the cluster. +// This automatically calls WithCluster() as the cluster info is required. +func (r *ResourceBuilderT) WithAwsClient() ResourceBuilder { + r.WithCluster() // Automatic dependency + r.buildAwsClient = true + return r +} +``` + +### 6. Cache Built Resources + +**Good** (ResourceBuilder pattern): +```go +func (r *ResourceBuilderT) Build() (*Resources, error) { + // Check if already built + if r.builtResources.Cluster != nil { + return r.builtResources, r.buildErr + } + + // Build and cache + r.builtResources.Cluster, err = r.ocmClient.GetClusterInfo(r.clusterId) + if err != nil { + r.buildErr = err + return r.builtResources, r.buildErr + } + + return r.builtResources, nil +} +``` + +**Why**: Allows calling `Build()` multiple times without duplicate work. Important for ResourceBuilder since investigations call it multiple times. + +### 7. Support Partial Success + +**ResourceBuilder Pattern Only**: + +```go +func (r *ResourceBuilderT) Build() (*Resources, error) { + // Always return builtResources, even on error + // Caller can access successfully built resources + return r.builtResources, r.buildErr +} +``` + +**Why**: An investigation might still be able to run with partial resources. For example, if AWS client fails but cluster info succeeded, some checks can still run. + +**Note**: This is specific to ResourceBuilder. Most builders should return zero-value on error. + +## Testing Builders + +### Testing Builder Itself + +```go +func TestResourceBuilder(t *testing.T) { + // Test successful build + t.Run("successful cluster build", func(t *testing.T) { + mockOcmClient := // ... create mock + + rb, _ := investigation.NewResourceBuilder( + mockPdClient, + mockOcmClient, + "cluster-123", + "test-investigation", + "info", + "test-pipeline", + ) + + resources, err := rb.WithCluster().Build() + + assert.NoError(t, err) + assert.NotNil(t, resources.Cluster) + }) + + // Test error handling + t.Run("cluster not found error", func(t *testing.T) { + mockOcmClient := // ... mock to return error + + rb, _ := investigation.NewResourceBuilder(...) + resources, err := rb.WithCluster().Build() + + var clusterErr *investigation.ClusterNotFoundError + assert.True(t, errors.As(err, &clusterErr)) + assert.Equal(t, "cluster-123", clusterErr.ClusterID) + }) +} +``` + +### Mocking Builders in Tests + +For investigations, use the mock implementation: + +```go +func TestInvestigation(t *testing.T) { + mockResources := &investigation.Resources{ + Cluster: mockCluster, + AwsClient: mockAwsClient, + Notes: notewriter.New("test", logger), + } + + mockBuilder := &investigation.ResourceBuilderMock{ + Resources: mockResources, + BuildError: nil, + } + + result, err := investigation.Run(mockBuilder) + + // Assert on result +} +``` + +## Anti-Patterns to Avoid + +### ❌ Don't: Mutate After Build + +```go +// BAD +builder := NewResultBuilder() +result := builder.Build() +result.Actions = append(result.Actions, newAction) // Mutating built result +``` + +**Solution**: Build returns a value copy, so this won't compile. But if you change to return pointer, don't do this. + +### ❌ Don't: Build Multiple Objects from Same Builder + +```go +// BAD +builder := NewResultBuilder().WithSummary("First") +result1 := builder.Build() + +builder.WithSummary("Second") // Trying to reuse builder +result2 := builder.Build() // result1 is now corrupted +``` + +**Solution**: Create a new builder for each object. + +### ❌ Don't: Mix Required and Optional in WithX Methods + +```go +// BAD +func NewBuilder() *Builder { + return &Builder{} // Missing required fields +} + +func (b *Builder) WithRequiredField(field string) *Builder { + b.required = field // Required fields should be in constructor + return b +} +``` + +**Solution**: Required fields go in constructor, optional ones in `WithX()`. + +### ❌ Don't: Return Errors from WithX Methods + +```go +// BAD +func (b *Builder) WithValidatedField(field string) (*Builder, error) { + if field == "" { + return nil, errors.New("field required") + } + b.field = field + return b, nil +} +``` + +**Solution**: Validation happens in `Build()`, not in `WithX()` methods. Keep the fluent interface clean. + +### ❌ Don't: Have Side Effects in WithX Methods + +```go +// BAD +func (r *ResourceBuilderT) WithCluster() ResourceBuilder { + // DON'T fetch the cluster here + cluster, err := r.ocmClient.GetClusterInfo(r.clusterId) + r.builtResources.Cluster = cluster + r.buildErr = err + return r +} +``` + +**Solution**: `WithX()` should only set flags. `Build()` does the actual work. + +## Future Directions + +### Builder Variants Being Considered + +1. **Validated Builders**: Builders that enforce constraints at compile time using type states +2. **Async Builders**: Builders that can fetch resources concurrently +3. **Conditional Builders**: Builders with conditional logic (build X only if Y succeeded) + +## References + +- ResourceBuilder implementation: `pkg/investigations/investigation/investigation.go` +- Example usages: All files in `pkg/investigations/*/` +- OCM builders: `pkg/ocm/ocm.go` +- Builder pattern discussion: https://golang.cafe/blog/golang-builder-pattern.html + +## Questions? + +For questions about builders or to propose a new builder, contact the CAD team or open a discussion in the repository. diff --git a/docs/architecture/investigation-guidelines.md b/docs/architecture/investigation-guidelines.md new file mode 100644 index 00000000..9ce7aba5 --- /dev/null +++ b/docs/architecture/investigation-guidelines.md @@ -0,0 +1,592 @@ +# Investigation Guidelines + +## Overview + +This document provides guidelines for implementing investigations in the Configuration Anomaly Detection (CAD) system. It covers the executor pattern, action-based architecture, and best practices for creating robust, maintainable investigations. + +## Architecture: Separation of Investigation and Execution + +### The Executor Pattern + +CAD uses an **executor pattern** where investigations focus on **analyzing problems** and **returning actions**, while the executor handles **executing those actions** against external systems. + +``` +┌─────────────────┐ +│ Investigation │ ← Analyzes cluster state, builds investigation logic +└────────┬────────┘ + │ returns + ▼ +┌─────────────────┐ +│ Actions │ ← Describes what to do (ServiceLog, LimitedSupport, etc.) +└────────┬────────┘ + │ executes + ▼ +┌─────────────────┐ +│ Executor │ ← Handles execution, retries, metrics, error handling +└────────┬────────┘ + │ calls + ▼ +┌─────────────────┐ +│ External Systems│ ← PagerDuty, OCM, Backplane +└─────────────────┘ +``` + +### Why This Pattern? + +**Benefits:** +- ✅ **Accurate Metrics**: Metrics are emitted only when actions actually succeed +- ✅ **Separation of Concerns**: Investigation logic is separate from execution details +- ✅ **Testability**: Can test investigations without mocking external clients +- ✅ **Retry Logic**: Executor provides automatic retry for transient failures +- ✅ **Consistency**: All investigations follow the same pattern +- ✅ **Observability**: Centralized logging and metrics for all actions + +**Anti-Pattern (Old Way - DON'T DO THIS):** +```go +// ❌ BAD: Investigation directly calls external systems +func (i *Investigation) Run(rb investigation.ResourceBuilder) (result investigation.InvestigationResult, err error) { + r, err := rb.Build() + if err != nil { + return result, err + } + + // ❌ DON'T: Direct PagerDuty call + err = r.PdClient.SilenceIncidentWithNote(notes.String()) + + // ❌ DON'T: Direct OCM call + err = r.OcmClient.PostServiceLog(r.Cluster, serviceLog) + + // ❌ DON'T: Manual metric tracking (might be incorrect if action fails later) + result.ServiceLogSent = investigation.InvestigationStep{Performed: true} + + return result, err +} +``` + +**Correct Pattern (New Way - DO THIS):** +```go +// ✅ GOOD: Investigation returns actions +func (i *Investigation) Run(rb investigation.ResourceBuilder) (result investigation.InvestigationResult, err error) { + r, err := rb.Build() + if err != nil { + return result, err + } + + // Perform investigation logic + if problemDetected { + notes.AppendAutomation("Problem detected, sending service log and silencing alert") + + // ✅ DO: Return actions for the executor to handle + result.Actions = []types.Action{ + executor.NewServiceLogAction(sl.Severity, sl.Summary). + WithDescription(sl.Description). + WithServiceName(sl.ServiceName). + Build(), + executor.NoteFrom(notes), + executor.Silence("Reason for silencing"), + } + return result, nil + } + + // No issues found + notes.AppendSuccess("No issues detected") + result.Actions = []types.Action{ + executor.NoteFrom(notes), + executor.Escalate("Manual investigation required"), + } + return result, nil +} +``` + +## Available Actions + +### 1. ServiceLogAction + +Send a service log to the customer via OCM. + +```go +// Basic usage +action := executor.NewServiceLogAction("Major", "Action required: review configuration"). + WithDescription("Your cluster configuration needs attention..."). + WithServiceName("SREManualAction"). + Build() + +// Convenience function +action := executor.ServiceLog("Major", "Summary", "Description") +``` + +**Parameters:** +- `severity`: "Info", "Warning", "Major", "Critical" +- `summary`: Brief title of the service log +- `description`: Detailed explanation and remediation steps +- `serviceName`: Defaults to "SREManualAction" + +**Options:** +- `.InternalOnly()`: Mark as internal-only (not visible to customer) +- `.AllowDuplicates()`: Send even if identical service log exists +- `.WithReason(string)`: Add reason for logging purposes + +### 2. LimitedSupportAction + +Set a cluster into limited support with a specific reason. + +```go +// Basic usage +action := executor.NewLimitedSupportAction(summary, details). + WithContext("EgressBlocked"). + Build() + +// Convenience function +action := executor.LimitedSupport("Summary", "Detailed explanation...") +``` + +**Parameters:** +- `summary`: Brief reason for limited support +- `details`: Detailed explanation including remediation steps + +**Options:** +- `.WithContext(string)`: Add context for logging and metrics (used as metric label) +- `.AllowDuplicates()`: Set even if identical LS exists + +### 3. PagerDutyNoteAction + +Add a note to the current PagerDuty incident. + +```go +// From notewriter +action := executor.NoteFrom(notewriter) + +// Direct string +action := executor.Note("Investigation findings...") + +// Builder pattern +action := executor.NewPagerDutyNoteAction(). + AppendLine("Finding 1"). + AppendLine("Finding 2"). + AppendSection("Details", "More information..."). + Build() +``` + +### 4. SilenceIncidentAction + +Silence (resolve) the current PagerDuty incident. + +```go +action := executor.Silence("Customer misconfigured UWM - sent service log") + +// Builder pattern +action := executor.NewSilenceIncidentAction("Reason for silencing").Build() +``` + +### 5. EscalateIncidentAction + +Escalate the current PagerDuty incident to primary. + +```go +action := executor.Escalate("Manual investigation required") + +// Builder pattern +action := executor.NewEscalateIncidentAction("Reason for escalation").Build() +``` + +### 6. BackplaneReportAction + +Upload a report to the backplane reports API (future implementation). + +```go +action := executor.NewBackplaneReportAction("investigation-report", reportData).Build() +``` + +## Action Execution Order + +Actions are executed in the following order based on their type: + +1. **PagerDuty actions** (sequentially): + - PagerDutyNoteAction + - SilenceIncidentAction or EscalateIncidentAction + +2. **OCM actions** (in parallel): + - ServiceLogAction + - LimitedSupportAction + +3. **Backplane actions** (in parallel): + - BackplaneReportAction + +**Example:** +```go +result.Actions = []types.Action{ + executor.ServiceLog(...), // Executes in parallel with LimitedSupport + executor.LimitedSupport(...), // Executes in parallel with ServiceLog + executor.NoteFrom(notes), // Executes sequentially (first PD action) + executor.Silence("reason"), // Executes sequentially (after Note) +} +``` + +## Common Patterns + +### Pattern 1: Customer Misconfiguration → Service Log + Silence + +Use when the customer has misconfigured something and can fix it themselves. + +```go +if customerMisconfigurationDetected { + notes.AppendAutomation("Customer misconfigured X, sending service log and silencing alert") + + serviceLog := &ocm.ServiceLog{ + Severity: "Major", + Summary: "Action required: review X configuration", + Description: "Your cluster's X is misconfigured. Please review...", + ServiceName: "SREManualAction", + } + + result.Actions = []types.Action{ + executor.NewServiceLogAction(serviceLog.Severity, serviceLog.Summary). + WithDescription(serviceLog.Description). + WithServiceName(serviceLog.ServiceName). + Build(), + executor.NoteFrom(notes), + executor.Silence("Customer misconfigured X"), + } + return result, nil +} +``` + +### Pattern 2: Unsupported Action → Limited Support + Silence + +Use when the customer performed an unsupported action (e.g., stopped instances). + +```go +if unsupportedActionDetected { + notes.AppendAutomation("Customer performed unsupported action, setting limited support") + + limitedSupportReason := ocm.LimitedSupportReason{ + Summary: "Cluster is in Limited Support due to unsupported action", + Details: "Your cluster performed X which is not supported. Please...", + } + + result.Actions = []types.Action{ + executor.NewLimitedSupportAction(limitedSupportReason.Summary, limitedSupportReason.Details). + WithContext("ActionType"). // Used for metrics + Build(), + executor.NoteFrom(notes), + executor.Silence("Customer performed unsupported action"), + } + return result, nil +} +``` + +### Pattern 3: No Automation Available → Note + Escalate + +Use when CAD cannot automatically remediate the issue. + +```go +notes.AppendSuccess("Investigation completed, no automated remediation available") +notes.AppendWarning("Found issue X, requires manual investigation") + +result.Actions = []types.Action{ + executor.NoteFrom(notes), + executor.Escalate("Manual investigation required"), +} +return result, nil +``` + +### Pattern 4: Network/Egress Issues → Service Log + Escalate + +Use when issues are detected but don't warrant limited support. + +```go +if issueDetectedButNotCritical { + notes.AppendWarning("Network issue detected, sending service log") + + result.Actions = []types.Action{ + executor.NewServiceLogAction("Warning", "Network connectivity issue detected"). + WithDescription("We detected that..."). + Build(), + executor.NoteFrom(notes), + executor.Escalate("Manual investigation required"), + } + return result, nil +} +``` + +## Error Handling + +### Investigation Errors vs Action Failures + +**Investigation Errors**: Return error from `Run()` when the investigation itself fails. + +```go +// Infrastructure/transient errors (retry the investigation) +if isInfrastructureError(err) { + return result, fmt.Errorf("investigation infrastructure failure: %w", err) +} + +// Investigation findings that need manual review +notes.AppendWarning("Could not complete: %s", err.Error()) +result.Actions = []types.Action{ + executor.NoteFrom(notes), + executor.Escalate("Investigation incomplete - manual review required"), +} +return result, nil +``` + +**Action Failures**: The executor handles action failures with retry logic and error reporting. + +```go +// ❌ DON'T: Handle action failures in investigation +if err := r.OcmClient.PostServiceLog(...); err != nil { + return result, err // Wrong! +} + +// ✅ DO: Return actions, let executor handle failures +result.Actions = []types.Action{ + executor.ServiceLog(...), // Executor will retry on failure +} +return result, nil +``` + +### K8s Client Errors + +Handle K8s client errors specially to escalate with appropriate context: + +```go +func (i *Investigation) Run(rb investigation.ResourceBuilder) (result investigation.InvestigationResult, err error) { + r, err := rb.WithK8sClient().Build() + if err != nil { + k8sErr := &investigation.K8SClientError{} + if errors.As(err, k8sErr) { + if errors.Is(k8sErr.Err, k8sclient.ErrAPIServerUnavailable) { + result.Actions = []types.Action{ + executor.Escalate("CAD was unable to access cluster's kube-api. Please investigate manually."), + } + return result, nil + } + if errors.Is(k8sErr.Err, k8sclient.ErrCannotAccessInfra) { + result.Actions = []types.Action{ + executor.Escalate("CAD is not allowed to access hive, management or service cluster's kube-api. Please investigate manually."), + } + return result, nil + } + return result, err + } + return result, err + } + + // Continue with investigation... +} +``` + +## Metrics + +### Automatic Metrics Emission + +The executor **automatically emits metrics** when actions succeed. You do not need to manually set `InvestigationStep` fields. + +```go +// ❌ DON'T: Manually track metrics +result.ServiceLogSent = investigation.InvestigationStep{Performed: true} +result.LimitedSupportSet = investigation.InvestigationStep{Performed: true} + +// ✅ DO: Just return actions - executor emits metrics +result.Actions = []types.Action{ + executor.ServiceLog(...), // Executor emits servicelog_sent metric on success + executor.LimitedSupport(...), // Executor emits limitedsupport_set metric on success +} +``` + +### Metric Labels + +Metrics are automatically labeled with: +- `investigationName`: From the investigation's `Name()` method +- Additional labels based on action context: + - `LimitedSupportAction.Context`: Used as secondary label for LS metrics + +```go +// Metric: limitedsupport_set{investigation="chgm", context="EgressBlocked"} +executor.NewLimitedSupportAction(summary, details). + WithContext("EgressBlocked"). + Build() +``` + +## Testing + +### Testing Investigations + +Test investigations by verifying the **actions** they return, not by mocking external clients. + +```go +// ✅ GOOD: Test actions returned +It("should send service log and silence when misconfiguration detected", func() { + result, err := investigation.Run(builder) + + Expect(err).NotTo(HaveOccurred()) + Expect(result.Actions).To(HaveLen(3)) + Expect(hasServiceLogAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) +}) + +// Helper functions +func hasServiceLogAction(actions []types.Action) bool { + for _, action := range actions { + if _, ok := action.(*executor.ServiceLogAction); ok { + return true + } + } + return false +} +``` + +### Testing the Executor + +The executor has its own test suite. Investigation tests should focus on the investigation logic. + +## Migration Guide + +### Migrating Existing Investigations + +If you're working on an investigation that still uses the old pattern: + +1. **Replace direct PagerDuty calls:** + ```go + // Before + return result, r.PdClient.SilenceIncidentWithNote(notes.String()) + + // After + result.Actions = []types.Action{ + executor.NoteFrom(notes), + executor.Silence("Reason"), + } + return result, nil + ``` + +2. **Replace direct OCM calls:** + ```go + // Before + err = r.OcmClient.PostServiceLog(r.Cluster, serviceLog) + if err != nil { + return result, fmt.Errorf("failed posting servicelog: %w", err) + } + + // After + result.Actions = []types.Action{ + executor.NewServiceLogAction(serviceLog.Severity, serviceLog.Summary). + WithDescription(serviceLog.Description). + Build(), + } + ``` + +3. **Remove manual metric tracking:** + ```go + // Before + result.ServiceLogSent = investigation.InvestigationStep{Performed: true} + + // After + // Remove this line - executor handles metrics automatically + ``` + +4. **Update tests:** + ```go + // Before + Expect(result.ServiceLogSent.Performed).To(BeTrue()) + + // After + Expect(hasServiceLogAction(result.Actions)).To(BeTrue()) + ``` + +## Best Practices + +### DO ✅ + +- **Return actions instead of executing**: Let the executor handle external system calls +- **Use notewriter for investigation findings**: Build notes throughout investigation +- **Return nil error on success**: When actions are created successfully, return `nil` error +- **Use builder pattern**: Chain method calls for readable action construction +- **Provide clear reasons**: Always include reason strings for Silence/Escalate actions +- **Use context for metrics**: Add `.WithContext()` to LimitedSupportAction for better metrics + +### DON'T ❌ + +- **Call PagerDuty/OCM directly**: Never call `r.PdClient.*` or `r.OcmClient.PostServiceLog()` +- **Manually track metrics**: Don't set `result.ServiceLogSent` or `result.LimitedSupportSet` +- **Return errors for action failures**: Only return errors when investigation logic fails +- **Mock external clients in tests**: Test the actions returned, not execution details +- **Skip note context**: Always add notes before silencing/escalating for SRE visibility + +## Examples + +### Complete Investigation Example + +See the CHGM investigation (`pkg/investigations/chgm/chgm.go`) for a complete example of: +- Handling multiple scenarios +- Using different action combinations +- Error handling +- Note building + +### ClusterMonitoringErrorBudgetBurn Example + +See the clustermonitoringerrorbudgetburn investigation for examples of: +- K8s client error handling +- Service log creation +- Multiple detection scenarios + +## Reference + +### Investigation Interface + +```go +type Investigation interface { + Run(builder ResourceBuilder) (InvestigationResult, error) + Name() string + AlertTitle() string + Description() string + IsExperimental() bool +} +``` + +### InvestigationResult + +```go +type InvestigationResult struct { + // NEW: Actions to execute via executor (modern approach) + Actions []types.Action + + // DEPRECATED: Legacy fields (maintained for backwards compatibility) + // These will be removed once all investigations are migrated + LimitedSupportSet InvestigationStep + ServiceLogPrepared InvestigationStep + ServiceLogSent InvestigationStep + + // If not nil, indicates fatal error preventing further investigations + StopInvestigations error +} +``` + +### Action Interface + +```go +type Action interface { + // Execute performs the action with the provided execution context + Execute(ctx context.Context, execCtx *ExecutionContext) error + + // Type returns the action type identifier as a string + Type() string + + // Validate checks if the action can be executed + Validate() error +} +``` + +## Questions or Issues? + +If you have questions about implementing investigations or encounter issues with the executor pattern, please: + +1. Review existing migrated investigations (CHGM, clustermonitoringerrorbudgetburn) +2. Check this documentation and `docs/architecture/builder-pattern.md` +3. Reach out to the CAD team for guidance + +--- + +**Document Version**: 1.0 +**Last Updated**: 2025-11-21 +**Applies to**: CAD with executor module (post-migration) diff --git a/pkg/executor/action_builders.go b/pkg/executor/action_builders.go new file mode 100644 index 00000000..b6da7582 --- /dev/null +++ b/pkg/executor/action_builders.go @@ -0,0 +1,287 @@ +package executor + +import ( + "fmt" + "strings" + + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" + "github.com/openshift/configuration-anomaly-detection/pkg/ocm" +) + +// ServiceLogActionBuilder builds ServiceLogAction instances +type ServiceLogActionBuilder struct { + severity string + serviceName string + summary string + description string + internalOnly bool + reason string + allowDuplicates bool +} + +// NewServiceLogAction creates a builder with required fields +// severity: "Info", "Warning", "Major", "Critical" +// summary: Brief title of the service log +func NewServiceLogAction(severity, summary string) *ServiceLogActionBuilder { + return &ServiceLogActionBuilder{ + severity: severity, + summary: summary, + serviceName: "SREManualAction", // Default service name + internalOnly: false, // Default to customer-visible + allowDuplicates: false, // Default to skip duplicates + } +} + +// WithDescription sets the detailed description +func (b *ServiceLogActionBuilder) WithDescription(description string) *ServiceLogActionBuilder { + b.description = description + return b +} + +// WithServiceName sets the service name (defaults to "SREManualAction") +func (b *ServiceLogActionBuilder) WithServiceName(name string) *ServiceLogActionBuilder { + b.serviceName = name + return b +} + +// InternalOnly marks the service log as internal-only (not visible to customer) +func (b *ServiceLogActionBuilder) InternalOnly() *ServiceLogActionBuilder { + b.internalOnly = true + return b +} + +// WithReason sets the reason for logging purposes +func (b *ServiceLogActionBuilder) WithReason(reason string) *ServiceLogActionBuilder { + b.reason = reason + return b +} + +// AllowDuplicates permits sending even if identical service log exists +func (b *ServiceLogActionBuilder) AllowDuplicates() *ServiceLogActionBuilder { + b.allowDuplicates = true + return b +} + +// Build creates the ServiceLogAction +func (b *ServiceLogActionBuilder) Build() Action { + return &ServiceLogAction{ + ServiceLog: &ocm.ServiceLog{ + Severity: b.severity, + ServiceName: b.serviceName, + Summary: b.summary, + Description: b.description, + InternalOnly: b.internalOnly, + }, + Reason: b.reason, + AllowDuplicates: b.allowDuplicates, + } +} + +// LimitedSupportActionBuilder builds LimitedSupportAction instances +type LimitedSupportActionBuilder struct { + summary string + details string + context string + allowDuplicates bool +} + +// NewLimitedSupportAction creates a builder with required fields +// summary: Brief reason for limited support +// details: Detailed explanation including remediation steps +func NewLimitedSupportAction(summary, details string) *LimitedSupportActionBuilder { + return &LimitedSupportActionBuilder{ + summary: summary, + details: details, + allowDuplicates: false, // Default to skip duplicates + } +} + +// WithContext adds context for logging/metrics +func (b *LimitedSupportActionBuilder) WithContext(context string) *LimitedSupportActionBuilder { + b.context = context + return b +} + +// AllowDuplicates permits setting even if identical LS exists +func (b *LimitedSupportActionBuilder) AllowDuplicates() *LimitedSupportActionBuilder { + b.allowDuplicates = true + return b +} + +// Build creates the LimitedSupportAction +func (b *LimitedSupportActionBuilder) Build() Action { + return &LimitedSupportAction{ + Reason: &ocm.LimitedSupportReason{ + Summary: b.summary, + Details: b.details, + }, + Context: b.context, + AllowDuplicates: b.allowDuplicates, + } +} + +// PagerDutyNoteActionBuilder builds PagerDutyNoteAction instances +type PagerDutyNoteActionBuilder struct { + content strings.Builder +} + +// NewPagerDutyNoteAction creates a builder +// Can be initialized empty and built up, or with initial content +func NewPagerDutyNoteAction(initialContent ...string) *PagerDutyNoteActionBuilder { + b := &PagerDutyNoteActionBuilder{} + + if len(initialContent) > 0 { + b.content.WriteString(initialContent[0]) + } + + return b +} + +// WithContent sets the note content (replaces existing) +func (b *PagerDutyNoteActionBuilder) WithContent(content string) *PagerDutyNoteActionBuilder { + b.content.Reset() + b.content.WriteString(content) + return b +} + +// AppendLine adds a line to the note +func (b *PagerDutyNoteActionBuilder) AppendLine(line string) *PagerDutyNoteActionBuilder { + if b.content.Len() > 0 { + b.content.WriteString("\n") + } + b.content.WriteString(line) + return b +} + +// AppendSection adds a section with a header +func (b *PagerDutyNoteActionBuilder) AppendSection(header, content string) *PagerDutyNoteActionBuilder { + if b.content.Len() > 0 { + b.content.WriteString("\n\n") + } + b.content.WriteString(fmt.Sprintf("## %s\n%s", header, content)) + return b +} + +// FromNoteWriter uses a notewriter's content +func (b *PagerDutyNoteActionBuilder) FromNoteWriter(nw *notewriter.NoteWriter) *PagerDutyNoteActionBuilder { + return b.WithContent(nw.String()) +} + +// Build creates the PagerDutyNoteAction +func (b *PagerDutyNoteActionBuilder) Build() Action { + return &PagerDutyNoteAction{ + Content: b.content.String(), + } +} + +// SilenceIncidentActionBuilder builds SilenceIncidentAction instances +type SilenceIncidentActionBuilder struct { + reason string +} + +// NewSilenceIncidentAction creates a builder +func NewSilenceIncidentAction(reason string) *SilenceIncidentActionBuilder { + return &SilenceIncidentActionBuilder{ + reason: reason, + } +} + +// WithReason sets the reason for silencing +func (b *SilenceIncidentActionBuilder) WithReason(reason string) *SilenceIncidentActionBuilder { + b.reason = reason + return b +} + +// Build creates the SilenceIncidentAction +func (b *SilenceIncidentActionBuilder) Build() Action { + return &SilenceIncidentAction{ + Reason: b.reason, + } +} + +// EscalateIncidentActionBuilder builds EscalateIncidentAction instances +type EscalateIncidentActionBuilder struct { + reason string +} + +// NewEscalateIncidentAction creates a builder +func NewEscalateIncidentAction(reason string) *EscalateIncidentActionBuilder { + return &EscalateIncidentActionBuilder{ + reason: reason, + } +} + +// WithReason sets the reason for escalating +func (b *EscalateIncidentActionBuilder) WithReason(reason string) *EscalateIncidentActionBuilder { + b.reason = reason + return b +} + +// Build creates the EscalateIncidentAction +func (b *EscalateIncidentActionBuilder) Build() Action { + return &EscalateIncidentAction{ + Reason: b.reason, + } +} + +// BackplaneReportActionBuilder builds BackplaneReportAction instances +type BackplaneReportActionBuilder struct { + report BackplaneReport + reportType string +} + +// NewBackplaneReportAction creates a builder with required fields +func NewBackplaneReportAction(reportType string, report BackplaneReport) *BackplaneReportActionBuilder { + return &BackplaneReportActionBuilder{ + reportType: reportType, + report: report, + } +} + +// WithReport sets the report payload +func (b *BackplaneReportActionBuilder) WithReport(report BackplaneReport) *BackplaneReportActionBuilder { + b.report = report + return b +} + +// Build creates the BackplaneReportAction +func (b *BackplaneReportActionBuilder) Build() Action { + return &BackplaneReportAction{ + Report: b.report, + ReportType: b.reportType, + } +} + +// Convenience functions for simple cases + +// ServiceLog creates a basic service log action +func ServiceLog(severity, summary, description string) Action { + return NewServiceLogAction(severity, summary). + WithDescription(description). + Build() +} + +// LimitedSupport creates a basic limited support action +func LimitedSupport(summary, details string) Action { + return NewLimitedSupportAction(summary, details).Build() +} + +// Note creates a PagerDuty note action +func Note(content string) Action { + return NewPagerDutyNoteAction(content).Build() +} + +// NoteFrom creates a PagerDuty note from a notewriter +func NoteFrom(nw *notewriter.NoteWriter) Action { + return NewPagerDutyNoteAction().FromNoteWriter(nw).Build() +} + +// Silence creates a silence incident action +func Silence(reason string) Action { + return NewSilenceIncidentAction(reason).Build() +} + +// Escalate creates an escalate incident action +func Escalate(reason string) Action { + return NewEscalateIncidentAction(reason).Build() +} diff --git a/pkg/executor/actions.go b/pkg/executor/actions.go new file mode 100644 index 00000000..a6338231 --- /dev/null +++ b/pkg/executor/actions.go @@ -0,0 +1,242 @@ +// Package reporter provides external system update functionality for investigation results +package executor + +import ( + "context" + "fmt" + "strings" + + "github.com/openshift/configuration-anomaly-detection/pkg/ocm" + "github.com/openshift/configuration-anomaly-detection/pkg/types" +) + +// Action is the types.Action interface - all reporter actions implement it +type Action = types.Action + +// ExecutionContext is the types.ExecutionContext - aliased for convenience +type ExecutionContext = types.ExecutionContext + +// ActionType identifies the kind of action +type ActionType string + +const ( + ActionTypeServiceLog ActionType = "service_log" + ActionTypeLimitedSupport ActionType = "limited_support" + ActionTypePagerDutyNote ActionType = "pagerduty_note" + ActionTypeSilenceIncident ActionType = "silence_incident" + ActionTypeEscalateIncident ActionType = "escalate_incident" + ActionTypeBackplaneReport ActionType = "backplane_report" +) + +// ServiceLogAction sends a service log via OCM +type ServiceLogAction struct { + // ServiceLog to send + ServiceLog *ocm.ServiceLog + + // Reason explains why this service log is being sent (for logging/metrics) + Reason string + + // AllowDuplicates permits sending even if identical SL exists + AllowDuplicates bool +} + +func (a *ServiceLogAction) Type() string { + return string(ActionTypeServiceLog) +} + +func (a *ServiceLogAction) ActionType() ActionType { + return ActionTypeServiceLog +} + +func (a *ServiceLogAction) Validate() error { + if a.ServiceLog == nil { + return fmt.Errorf("ServiceLog cannot be nil") + } + if a.ServiceLog.Summary == "" { + return fmt.Errorf("ServiceLog.Summary is required") + } + if a.ServiceLog.Severity == "" { + return fmt.Errorf("ServiceLog.Severity is required") + } + return nil +} + +func (a *ServiceLogAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + if execCtx.Cluster == nil { + return fmt.Errorf("cluster required for ServiceLog action") + } + + execCtx.Logger.Infof("Sending service log: %s (reason: %s)", + a.ServiceLog.Summary, a.Reason) + + // Optional: Check for duplicates + if !a.AllowDuplicates { + existing, err := execCtx.OCMClient.GetServiceLog(execCtx.Cluster, + fmt.Sprintf("summary='%s'", a.ServiceLog.Summary)) + if err == nil && existing.Total() > 0 { + execCtx.Logger.Infof("Skipping duplicate service log: %s", a.ServiceLog.Summary) + return nil + } + } + + return execCtx.OCMClient.PostServiceLog(execCtx.Cluster, a.ServiceLog) +} + +// LimitedSupportAction sets a cluster into limited support +type LimitedSupportAction struct { + // Reason for limited support + Reason *ocm.LimitedSupportReason + + // Context provides additional info for logging/metrics + Context string + + // AllowDuplicates permits setting even if identical LS exists + AllowDuplicates bool +} + +func (a *LimitedSupportAction) Type() string { + return string(ActionTypeLimitedSupport) +} + +func (a *LimitedSupportAction) ActionType() ActionType { + return ActionTypeLimitedSupport +} + +func (a *LimitedSupportAction) Validate() error { + if a.Reason == nil { + return fmt.Errorf("reason cannot be nil") + } + if a.Reason.Summary == "" { + return fmt.Errorf("reason.Summary is required") + } + if a.Reason.Details == "" { + return fmt.Errorf("reason.Details is required") + } + return nil +} + +func (a *LimitedSupportAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + if execCtx.Cluster == nil { + return fmt.Errorf("cluster required for LimitedSupport action") + } + + execCtx.Logger.Infof("Setting limited support: %s (context: %s)", + a.Reason.Summary, a.Context) + + // Note: OCM API handles duplicate checking internally + return execCtx.OCMClient.PostLimitedSupportReason(execCtx.Cluster, a.Reason) +} + +// PagerDutyNoteAction adds a note to the current PagerDuty incident +type PagerDutyNoteAction struct { + // Content of the note (can be from notewriter.String()) + Content string +} + +func (a *PagerDutyNoteAction) Type() string { + return string(ActionTypePagerDutyNote) +} + +func (a *PagerDutyNoteAction) ActionType() ActionType { + return ActionTypePagerDutyNote +} + +func (a *PagerDutyNoteAction) Validate() error { + if strings.TrimSpace(a.Content) == "" { + return fmt.Errorf("note content cannot be empty") + } + return nil +} + +func (a *PagerDutyNoteAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + execCtx.Logger.Infof("Adding PagerDuty note (%d chars)", len(a.Content)) + return execCtx.PDClient.AddNote(a.Content) +} + +// SilenceIncidentAction silences the current PagerDuty incident +type SilenceIncidentAction struct { + // Reason explains why we're silencing (for logging) + Reason string +} + +func (a *SilenceIncidentAction) Type() string { + return string(ActionTypeSilenceIncident) +} + +func (a *SilenceIncidentAction) ActionType() ActionType { + return ActionTypeSilenceIncident +} + +func (a *SilenceIncidentAction) Validate() error { + return nil // No validation needed +} + +func (a *SilenceIncidentAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + execCtx.Logger.Infof("Silencing incident: %s", a.Reason) + return execCtx.PDClient.SilenceIncident() +} + +// EscalateIncidentAction escalates the current PagerDuty incident +type EscalateIncidentAction struct { + // Reason explains why we're escalating (for logging) + Reason string +} + +func (a *EscalateIncidentAction) Type() string { + return string(ActionTypeEscalateIncident) +} + +func (a *EscalateIncidentAction) ActionType() ActionType { + return ActionTypeEscalateIncident +} + +func (a *EscalateIncidentAction) Validate() error { + return nil // No validation needed +} + +func (a *EscalateIncidentAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + execCtx.Logger.Infof("Escalating incident: %s", a.Reason) + return execCtx.PDClient.EscalateIncident() +} + +// BackplaneReport is the minimal interface for report payloads +type BackplaneReport interface { + // ToJSON serializes the report + ToJSON() ([]byte, error) + + // Validate checks if report is valid + Validate() error +} + +// BackplaneReportAction uploads a report to backplane reports API +type BackplaneReportAction struct { + // Report contains the structured report data + Report BackplaneReport + + // ReportType identifies the kind of report + ReportType string +} + +func (a *BackplaneReportAction) Type() string { + return string(ActionTypeBackplaneReport) +} + +func (a *BackplaneReportAction) ActionType() ActionType { + return ActionTypeBackplaneReport +} + +func (a *BackplaneReportAction) Validate() error { + if a.Report == nil { + return fmt.Errorf("report cannot be nil") + } + return a.Report.Validate() +} + +func (a *BackplaneReportAction) Execute(ctx context.Context, execCtx *ExecutionContext) error { + execCtx.Logger.Infof("Uploading backplane report (type: %s)", a.ReportType) + + // Future implementation: + // return exec.BackplaneClient.UploadReport(ctx, exec.Cluster.ID(), a.Report) + + return fmt.Errorf("backplane reports not yet implemented") +} diff --git a/pkg/executor/errors.go b/pkg/executor/errors.go new file mode 100644 index 00000000..2bb1ffe5 --- /dev/null +++ b/pkg/executor/errors.go @@ -0,0 +1,57 @@ +package executor + +import ( + "fmt" + "strings" +) + +// ActionValidationError indicates an action failed validation +type ActionValidationError struct { + ActionType ActionType + Err error +} + +func (e ActionValidationError) Error() string { + return fmt.Sprintf("action %s validation failed: %v", e.ActionType, e.Err) +} + +func (e ActionValidationError) Unwrap() error { + return e.Err +} + +// ActionExecutionError indicates an action failed to execute +type ActionExecutionError struct { + ActionType ActionType + Attempt int + Err error +} + +func (e ActionExecutionError) Error() string { + return fmt.Sprintf("action %s failed (attempt %d): %v", e.ActionType, e.Attempt, e.Err) +} + +func (e ActionExecutionError) Unwrap() error { + return e.Err +} + +// MultipleActionsError wraps multiple action failures +type MultipleActionsError struct { + Errors []error +} + +func (e MultipleActionsError) Error() string { + errStrings := make([]string, 0, len(e.Errors)) + for _, subErr := range e.Errors { + errString := fmt.Sprintf("- %s", subErr.Error()) + errStrings = append(errStrings, errString) + } + errString := strings.Join(errStrings, "\n") + return fmt.Sprintf("%d actions failed: %v", len(e.Errors), errString) +} + +func (e MultipleActionsError) Unwrap() error { + if len(e.Errors) > 0 { + return e.Errors[0] + } + return nil +} diff --git a/pkg/executor/executor.go b/pkg/executor/executor.go new file mode 100644 index 00000000..48162e2c --- /dev/null +++ b/pkg/executor/executor.go @@ -0,0 +1,267 @@ +package executor + +import ( + "context" + "errors" + "fmt" + "net" + "strings" + "sync" + "time" + + "github.com/openshift/configuration-anomaly-detection/pkg/metrics" +) + +func (e *DefaultExecutor) executeSequential( + ctx context.Context, + actions []Action, + execCtx *ExecutionContext, + opts ExecutionOptions, +) error { + actionErrors := make([]error, 0, len(actions)) + + for i, action := range actions { + actionLogger := execCtx.Logger.With( + "action_index", i, + "action_type", action.Type(), + ) + + // Update context with action-specific logger + actionExecCtx := &ExecutionContext{ + Cluster: execCtx.Cluster, + OCMClient: execCtx.OCMClient, + PDClient: execCtx.PDClient, + InvestigationName: execCtx.InvestigationName, + Logger: actionLogger, + } + + if opts.DryRun { + actionLogger.Infof("DRY RUN: Would execute action %s", action.Type()) + continue + } + + // Execute with retry + err := e.executeWithRetry(ctx, action, actionExecCtx, opts.MaxRetries) + if err != nil { + actionLogger.Errorf("Action failed: %v", err) + actionErrors = append(actionErrors, ActionExecutionError{ + ActionType: ActionType(action.Type()), + Attempt: opts.MaxRetries + 1, + Err: err, + }) + + if opts.StopOnError { + break + } + } else { + actionLogger.Infof("Action completed successfully") + } + } + + if len(actionErrors) > 0 { + return MultipleActionsError{Errors: actionErrors} + } + + return nil +} + +func (e *DefaultExecutor) executeConcurrent( + ctx context.Context, + actions []Action, + execCtx *ExecutionContext, + opts ExecutionOptions, +) error { + // Group actions by type to determine what can run in parallel + // Rules: + // - PagerDuty actions must run sequentially (note, then silence/escalate) + // - OCM actions can run in parallel + // - Backplane actions can run in parallel + + type actionWithIndex struct { + action Action + index int + } + + var ( + pdActions []actionWithIndex + ocmActions []actionWithIndex + bpActions []actionWithIndex + ) + + for i, action := range actions { + actionType := action.Type() + switch actionType { + case string(ActionTypePagerDutyNote), string(ActionTypeSilenceIncident), string(ActionTypeEscalateIncident): + pdActions = append(pdActions, actionWithIndex{action, i}) + case string(ActionTypeServiceLog), string(ActionTypeLimitedSupport): + ocmActions = append(ocmActions, actionWithIndex{action, i}) + case string(ActionTypeBackplaneReport): + bpActions = append(bpActions, actionWithIndex{action, i}) + } + } + + var wg sync.WaitGroup + errorsChan := make(chan error, len(actions)) + + // Execute PagerDuty actions sequentially (in original order) + wg.Add(1) + go func() { + defer wg.Done() + for _, a := range pdActions { + if err := e.executeWithRetry(ctx, a.action, execCtx, opts.MaxRetries); err != nil { + errorsChan <- ActionExecutionError{ + ActionType: ActionType(a.action.Type()), + Attempt: opts.MaxRetries + 1, + Err: err, + } + if opts.StopOnError { + return + } + } + } + }() + + // Execute OCM actions in parallel + for _, a := range ocmActions { + wg.Add(1) + go func(a actionWithIndex) { + defer wg.Done() + if err := e.executeWithRetry(ctx, a.action, execCtx, opts.MaxRetries); err != nil { + errorsChan <- ActionExecutionError{ + ActionType: ActionType(a.action.Type()), + Attempt: opts.MaxRetries + 1, + Err: err, + } + } + }(a) + } + + // Execute Backplane actions in parallel + for _, a := range bpActions { + wg.Add(1) + go func(a actionWithIndex) { + defer wg.Done() + if err := e.executeWithRetry(ctx, a.action, execCtx, opts.MaxRetries); err != nil { + errorsChan <- ActionExecutionError{ + ActionType: ActionType(a.action.Type()), + Attempt: opts.MaxRetries + 1, + Err: err, + } + } + }(a) + } + + wg.Wait() + close(errorsChan) + + actionErrors := make([]error, 0, len(actions)) + for err := range errorsChan { + actionErrors = append(actionErrors, err) + } + + if len(actionErrors) > 0 { + return MultipleActionsError{Errors: actionErrors} + } + + return nil +} + +func (e *DefaultExecutor) executeWithRetry( + ctx context.Context, + action Action, + execCtx *ExecutionContext, + maxRetries int, +) error { + var lastErr error + + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + // Exponential backoff + backoff := time.Duration(attempt*attempt) * time.Second + execCtx.Logger.Infof("Retrying action %s after %v (attempt %d/%d)", + action.Type(), backoff, attempt, maxRetries) + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + } + + lastErr = action.Execute(ctx, execCtx) + if lastErr == nil { + if attempt > 0 { + execCtx.Logger.Infof("Action %s succeeded on retry %d", action.Type(), attempt) + } + // Emit metrics on successful action execution + emitMetricsForAction(action, execCtx) + return nil + } + + // Check if error is retryable + if !isRetryable(lastErr) { + execCtx.Logger.Warnf("Action %s failed with non-retryable error: %v", + action.Type(), lastErr) + return lastErr + } + + execCtx.Logger.Warnf("Action %s failed (attempt %d/%d): %v", + action.Type(), attempt+1, maxRetries+1, lastErr) + } + + return fmt.Errorf("action failed after %d retries: %w", maxRetries, lastErr) +} + +func isRetryable(err error) bool { + if err == nil { + return false + } + + // Network errors are retryable + var netErr net.Error + if errors.As(err, &netErr) { + return netErr.Timeout() + } + + // HTTP 5xx errors are retryable + errStr := err.Error() + if strings.Contains(errStr, "status is 5") || + strings.Contains(errStr, "timeout") || + strings.Contains(errStr, "connection refused") { + return true + } + + // HTTP 429 (rate limit) is retryable + if strings.Contains(errStr, "429") || strings.Contains(errStr, "rate limit") { + return true + } + + return false +} + +// emitMetricsForAction emits metrics after successful action execution +func emitMetricsForAction(action Action, execCtx *ExecutionContext) { + investigationName := execCtx.InvestigationName + + switch a := action.(type) { + case *ServiceLogAction: + // Emit ServicelogSent metric + metrics.Inc(metrics.ServicelogSent, investigationName) + execCtx.Logger.Debugf("Emitted servicelog_sent metric for %s", investigationName) + + case *LimitedSupportAction: + // Emit LimitedSupportSet metric with context as label + labels := []string{investigationName} + if a.Context != "" { + labels = append(labels, a.Context) + } + metrics.Inc(metrics.LimitedSupportSet, labels...) + execCtx.Logger.Debugf("Emitted limitedsupport_set metric for %s", investigationName) + + // Note: PagerDuty actions (Note, Silence, Escalate) don't have dedicated metrics + // Note: BackplaneReport doesn't have metrics yet + default: + // No metrics for other action types + execCtx.Logger.Debugf("No metrics defined for action type %s", action.Type()) + } +} diff --git a/pkg/executor/reporter.go b/pkg/executor/reporter.go new file mode 100644 index 00000000..ee05cdf5 --- /dev/null +++ b/pkg/executor/reporter.go @@ -0,0 +1,114 @@ +package executor + +import ( + "context" + "fmt" + + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" + "github.com/openshift/configuration-anomaly-detection/pkg/ocm" + "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" + "go.uber.org/zap" +) + +// Executor executes external system updates based on investigation results +type Executor interface { + // Execute executes all actions from an investigation result + Execute(ctx context.Context, input *ExecutorInput) error +} + +// ExecutorInput contains all context needed to execute actions +type ExecutorInput struct { + // InvestigationName identifies which investigation produced these actions + InvestigationName string + + // Actions to execute + Actions []Action + + // Cluster context for actions that need it + Cluster *cmv1.Cluster + + // Notes accumulated during investigation (optional) + Notes *notewriter.NoteWriter + + // ExecutionOptions controls how actions are executed + Options ExecutionOptions +} + +// ExecutionOptions controls reporter behavior +type ExecutionOptions struct { + // DryRun logs what would happen without executing + DryRun bool + + // StopOnError halts execution on first error + StopOnError bool + + // MaxRetries for transient failures + MaxRetries int + + // ConcurrentActions allows parallel execution of independent actions + ConcurrentActions bool +} + +// DefaultExecutor is the production implementation of Executor +type DefaultExecutor struct { + ocmClient ocm.Client + pdClient pagerduty.Client + // Future: backplaneClient backplane.Client + + logger *zap.SugaredLogger +} + +// NewExecutor creates a new DefaultExecutor +func NewExecutor(ocmClient ocm.Client, pdClient pagerduty.Client, logger *zap.SugaredLogger) Executor { + return &DefaultExecutor{ + ocmClient: ocmClient, + pdClient: pdClient, + logger: logger, + } +} + +func (e *DefaultExecutor) Execute(ctx context.Context, input *ExecutorInput) error { + if input == nil { + return fmt.Errorf("ExecutorInput cannot be nil") + } + + if len(input.Actions) == 0 { + e.logger.Debug("No actions to execute") + return nil + } + + // Apply default options + opts := input.Options + if opts.MaxRetries == 0 { + opts.MaxRetries = 3 // Default retry count + } + + e.logger.Infof("Executing %d actions for investigation %s", + len(input.Actions), input.InvestigationName) + + // Validate all actions first + for i, action := range input.Actions { + if err := action.Validate(); err != nil { + return ActionValidationError{ + ActionType: ActionType(action.Type()), + Err: fmt.Errorf("action %d: %w", i, err), + } + } + } + + // Create execution context + execCtx := &ExecutionContext{ + Cluster: input.Cluster, + OCMClient: e.ocmClient, + PDClient: e.pdClient, + InvestigationName: input.InvestigationName, + Logger: e.logger, + } + + // Execute actions + if opts.ConcurrentActions { + return e.executeConcurrent(ctx, input.Actions, execCtx, opts) + } + return e.executeSequential(ctx, input.Actions, execCtx, opts) +} diff --git a/pkg/executor/result_builder.go b/pkg/executor/result_builder.go new file mode 100644 index 00000000..2709a502 --- /dev/null +++ b/pkg/executor/result_builder.go @@ -0,0 +1,91 @@ +package executor + +import ( + "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" + "github.com/openshift/configuration-anomaly-detection/pkg/types" +) + +// ResultWithActionsBuilder helps build InvestigationResults with actions +type ResultWithActionsBuilder struct { + actions []Action +} + +// NewResultWithActions creates a new builder +func NewResultWithActions() *ResultWithActionsBuilder { + return &ResultWithActionsBuilder{ + actions: []Action{}, + } +} + +// AddAction adds a pre-built action +func (b *ResultWithActionsBuilder) AddAction(action Action) *ResultWithActionsBuilder { + b.actions = append(b.actions, action) + return b +} + +// AddServiceLog adds a service log action with a builder function +func (b *ResultWithActionsBuilder) AddServiceLog( + severity, summary string, + configure func(*ServiceLogActionBuilder), +) *ResultWithActionsBuilder { + builder := NewServiceLogAction(severity, summary) + if configure != nil { + configure(builder) + } + b.actions = append(b.actions, builder.Build()) + return b +} + +// AddLimitedSupport adds a limited support action with a builder function +func (b *ResultWithActionsBuilder) AddLimitedSupport( + summary, details string, + configure func(*LimitedSupportActionBuilder), +) *ResultWithActionsBuilder { + builder := NewLimitedSupportAction(summary, details) + if configure != nil { + configure(builder) + } + b.actions = append(b.actions, builder.Build()) + return b +} + +// AddNote adds a PagerDuty note +func (b *ResultWithActionsBuilder) AddNote(content string) *ResultWithActionsBuilder { + b.actions = append(b.actions, + NewPagerDutyNoteAction(content).Build()) + return b +} + +// AddNoteFromNoteWriter adds a PagerDuty note from a notewriter +func (b *ResultWithActionsBuilder) AddNoteFromNoteWriter(nw *notewriter.NoteWriter) *ResultWithActionsBuilder { + b.actions = append(b.actions, + NewPagerDutyNoteAction().FromNoteWriter(nw).Build()) + return b +} + +// Silence adds a silence action +func (b *ResultWithActionsBuilder) Silence(reason string) *ResultWithActionsBuilder { + b.actions = append(b.actions, + NewSilenceIncidentAction(reason).Build()) + return b +} + +// Escalate adds an escalate action +func (b *ResultWithActionsBuilder) Escalate(reason string) *ResultWithActionsBuilder { + b.actions = append(b.actions, + NewEscalateIncidentAction(reason).Build()) + return b +} + +// Build creates the InvestigationResult +func (b *ResultWithActionsBuilder) Build() investigation.InvestigationResult { + // Convert reporter.Action to types.Action + // They are the same type (alias), so we can just assign the slice + invActions := make([]types.Action, len(b.actions)) + copy(invActions, b.actions) + + return investigation.InvestigationResult{ + Actions: invActions, + } +} diff --git a/pkg/investigations/chgm/chgm.go b/pkg/investigations/chgm/chgm.go index 1557b1e3..da5d2751 100644 --- a/pkg/investigations/chgm/chgm.go +++ b/pkg/investigations/chgm/chgm.go @@ -11,14 +11,14 @@ import ( ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" "github.com/openshift/configuration-anomaly-detection/pkg/aws" - "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + "github.com/openshift/configuration-anomaly-detection/pkg/executor" + investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" "github.com/openshift/configuration-anomaly-detection/pkg/logging" "github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" "github.com/openshift/configuration-anomaly-detection/pkg/ocm" - "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" "github.com/openshift/configuration-anomaly-detection/pkg/reports" - "github.com/openshift/configuration-anomaly-detection/pkg/utils" + "github.com/openshift/configuration-anomaly-detection/pkg/types" hivev1 "github.com/openshift/hive/apis/hive/v1" ) @@ -36,6 +36,55 @@ var ( type Investigation struct{} +// isInfrastructureError determines if an error is a transient infrastructure +// failure that should cause the investigation to be retried (return error), +// vs an investigation finding that should be reported (return actions). +func isInfrastructureError(err error) bool { + if err == nil { + return false + } + + errStr := err.Error() + + // Transient API failures that should trigger retry + infraPatterns := []string{ + "could not retrieve non running instances", + "could not retrieve running cluster nodes", + "could not retrieve expected cluster nodes", + "could not PollStopEventsFor", + "failed to retrieve", + "timeout", + "rate limit", + "connection refused", + "service unavailable", + } + + for _, pattern := range infraPatterns { + if strings.Contains(errStr, pattern) { + return true + } + } + + // Investigation findings that should be reported (NOT retried) + findingPatterns := []string{ + "stopped instances but no stoppedInstancesEvents", // CloudTrail data too old + "clusterdeployment is empty", // Data missing but not retriable + "no non running instances found", // Investigation finding + "could not extractUserDetails", // Invalid CloudTrail event data + "failed to parse CloudTrail event version", // Invalid CloudTrail event data + "cannot parse a nil input", // Invalid CloudTrail event data + } + + for _, pattern := range findingPatterns { + if strings.Contains(errStr, pattern) { + return false + } + } + + // Default: assume infrastructure error to be safe (retry) + return true +} + // Run runs the investigation for a triggered chgm pagerduty event func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { ctx := context.Background() @@ -49,19 +98,33 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv // 1. Check if the user stopped instances res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient) if err != nil { - return result, r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error())) + // Check if this is a transient infrastructure error (AWS/OCM API failures) + // These should trigger a retry of the entire investigation + if isInfrastructureError(err) { + return result, fmt.Errorf("investigation infrastructure failure: %w", err) + } + + // Otherwise, it's an investigation finding (e.g., CloudTrail data too old) + // Report this as a finding that needs manual investigation + r.Notes.AppendWarning("Could not complete instance investigation: %s", err.Error()) + result.Actions = []types.Action{ + executor.NoteFrom(r.Notes), + executor.Escalate("Investigation incomplete - manual review required"), + } + return result, nil } logging.Debugf("the investigation returned: [infras running: %d] - [masters running: %d]", res.RunningInstances.Infra, res.RunningInstances.Master) if !res.UserAuthorized { logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName) - return result, utils.WithRetries(func() error { - err := postStoppedInfraLimitedSupport(r.Cluster, r.OcmClient, r.PdClient) - // XXX: metrics.Inc(metrics.ServicelogSent, investigationName) - result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"StoppedInstances"}} + r.Notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.") - return err - }) + result.Actions = []types.Action{ + executor.NewLimitedSupportAction(stoppedInfraLS.Summary, stoppedInfraLS.Details).Build(), + executor.NoteFrom(r.Notes), + executor.Silence("Customer stopped instances - cluster in limited support"), + } + return result, nil } r.Notes.AppendSuccess("Customer did not stop nodes.") logging.Info("The customer has not stopped/terminated any nodes.") @@ -93,29 +156,32 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv logging.Infof("Network verifier reported failure: %s", failureReason) if strings.Contains(failureReason, "nosnch.in") { - err := r.OcmClient.PostLimitedSupportReason(r.Cluster, &egressLS) - if err != nil { - return result, err - } - - // XXX: metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked") - result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}} - r.Notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.") - return result, r.PdClient.SilenceIncidentWithNote(r.Notes.String()) + + result.Actions = []types.Action{ + executor.NewLimitedSupportAction(egressLS.Summary, egressLS.Details). + WithContext("EgressBlocked"). + Build(), + executor.NoteFrom(r.Notes), + executor.Silence("Deadman's snitch blocked - cluster in limited support"), + } + return result, nil } docLink := ocm.DocumentationLink(product, ocm.DocumentationTopicPrivatelinkFirewall) egressSL := createEgressSL(failureReason, docLink) - err := r.OcmClient.PostServiceLog(r.Cluster, egressSL) - if err != nil { - return result, err - } - - // XXX: metrics.Inc(metrics.ServicelogSent, investigationName) - result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil} r.Notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason) + + result.Actions = []types.Action{ + executor.NewServiceLogAction(egressSL.Severity, egressSL.Summary). + WithDescription(egressSL.Description). + WithServiceName(egressSL.ServiceName). + Build(), + executor.NoteFrom(r.Notes), + executor.Escalate("Egress blocked but not deadman's snitch - manual investigation required"), + } + return result, nil case networkverifier.Success: r.Notes.AppendSuccess("Network verifier passed") logging.Info("Network verifier passed.") @@ -134,7 +200,11 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv r.Notes.AppendAutomation("%s", report.GenerateStringForNoteWriter()) // Found no issues that CAD can handle by itself - forward notes to SRE. - return result, r.PdClient.EscalateIncidentWithNote(r.Notes.String()) + result.Actions = []types.Action{ + executor.NoteFrom(r.Notes), + executor.Escalate("No automated remediation available - manual investigation required"), + } + return result, nil } func (i *Investigation) Name() string { @@ -465,13 +535,3 @@ func extractUserDetails(cloudTrailEvent *string) (CloudTrailEventRaw, error) { return res, nil } - -// postStoppedInfraLimitedSupport will put the cluster on limited support because the user has stopped instances -func postStoppedInfraLimitedSupport(cluster *cmv1.Cluster, ocmCli ocm.Client, pdCli pagerduty.Client) error { - err := ocmCli.PostLimitedSupportReason(cluster, &stoppedInfraLS) - if err != nil { - return fmt.Errorf("failed sending limited support reason: %w", err) - } - - return pdCli.SilenceIncidentWithNote("Customer stopped instances. Sent SL and silencing alert.") -} diff --git a/pkg/investigations/chgm/chgm_test.go b/pkg/investigations/chgm/chgm_test.go index a8849cc0..3a03e8f7 100644 --- a/pkg/investigations/chgm/chgm_test.go +++ b/pkg/investigations/chgm/chgm_test.go @@ -12,14 +12,42 @@ import ( servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock" backplanemock "github.com/openshift/configuration-anomaly-detection/pkg/backplane/mock" - "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" + "github.com/openshift/configuration-anomaly-detection/pkg/executor" + investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" "github.com/openshift/configuration-anomaly-detection/pkg/logging" ocmmock "github.com/openshift/configuration-anomaly-detection/pkg/ocm/mock" pdmock "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty/mock" + "github.com/openshift/configuration-anomaly-detection/pkg/types" hivev1 "github.com/openshift/hive/apis/hive/v1" "go.uber.org/mock/gomock" ) +// Test helper functions to check for specific action types +func hasActionType(actions []types.Action, actionType string) bool { + for _, action := range actions { + if action.Type() == actionType { + return true + } + } + return false +} + +func hasLimitedSupportAction(actions []types.Action) bool { + return hasActionType(actions, string(executor.ActionTypeLimitedSupport)) +} + +func hasSilenceAction(actions []types.Action) bool { + return hasActionType(actions, string(executor.ActionTypeSilenceIncident)) +} + +func hasEscalateAction(actions []types.Action) bool { + return hasActionType(actions, string(executor.ActionTypeEscalateIncident)) +} + +func hasNoteAction(actions []types.Action) bool { + return hasActionType(actions, string(executor.ActionTypePagerDutyNote)) +} + var _ = Describe("chgm", func() { // this is a var but I use it as a const fakeErr := fmt.Errorf("test triggered") @@ -111,15 +139,14 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{masterInstance, infraInstance}, nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetClusterMachinePools(gomock.Any()).Return(machinePools, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("Triggered finds instances stopped by the customer with CloudTrail eventVersion 1.99", func() { @@ -133,28 +160,24 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{masterInstance, infraInstance}, nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetClusterMachinePools(gomock.Any()).Return(machinePools, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("Triggered errors", func() { - It("should update the incident notes and escalate to primary", func() { - r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Any()).Return(nil, fakeErr) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) + It("should return infrastructure error for retry", func() { + r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Any()).Return(nil, fmt.Errorf("could not retrieve non running instances: %w", fakeErr)) - result, gotErr := inv.Run(r) + _, gotErr := inv.Run(r) - Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(gotErr).To(HaveOccurred()) + Expect(gotErr.Error()).To(ContainSubstring("investigation infrastructure failure")) }) }) When("there were no stopped instances", func() { @@ -165,30 +188,25 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) // Assert Expect(gotErr).ToNot(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("there was an error getting StopInstancesEvents", func() { - It("should update and escalate to primary", func() { + It("should return infrastructure error for retry", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetClusterMachinePools(gomock.Any()).Return(machinePools, nil) - r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return(nil, fakeErr) + r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return(nil, fmt.Errorf("could not PollStopEventsFor: %w", fakeErr)) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) - - result, gotErr := inv.Run(r) - Expect(gotErr).ToNot(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + _, gotErr := inv.Run(r) + Expect(gotErr).To(HaveOccurred()) + Expect(gotErr.Error()).To(ContainSubstring("investigation infrastructure failure")) }) }) When("there were no StopInstancesEvents", func() { @@ -198,14 +216,13 @@ var _ = Describe("chgm", func() { r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetClusterMachinePools(gomock.Any()).Return(machinePools, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) // Act result, gotErr := inv.Run(r) Expect(gotErr).ToNot(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEventRaw base data is correct, but the sessionissue's username is not an authorized user", func() { @@ -215,15 +232,14 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08", "userIdentity":{"type":"AssumedRole", "sessionContext":{"sessionIssuer":{"type":"Role", "userName": "654321"}}}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) // Act result, gotErr := inv.Run(r) // Assert Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("issuer user is authorized (openshift-machine-api-aws)", func() { @@ -237,14 +253,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("username role is OrganizationAccountAccessRole on a non CCS cluster", func() { @@ -257,14 +272,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) @@ -278,14 +292,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Any(), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) @@ -299,14 +312,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("issuer user is authorized (customprefix-Installer-Role)", func() { @@ -319,14 +331,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("issuer user is authorized (ManagedOpenShift-Support-.*)", func() { @@ -339,14 +350,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("issuer user is authorized (.*-Support-Role)", func() { @@ -359,14 +369,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent has a matching whitelisted user (osdManagedAdmin-.*)", func() { @@ -379,14 +388,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent has a matching whitelisted user (osdCcsAdmin)", func() { @@ -399,14 +407,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent has a matching whitelisted user (.*openshift-machine-api-awsv2.*)", func() { @@ -419,14 +426,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSecurityGroupID(gomock.Eq(infraID)).Return(gomock.Any().String(), nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetBaseConfig().Return(&awsv2.Config{}) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().GetSubnetID(gomock.Eq(infraID)).Return([]string{"string1", "string2"}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().GetServiceLog(gomock.Eq(cluster), gomock.Eq("log_type='cluster-state-updates'")).Return(&servicelogsv1.ClusterLogsUUIDListResponse{}, nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEventRaw has an empty userIdentity", func() { @@ -436,14 +442,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08", "userIdentity":{}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("issuer user is unauthorized (testuser assumed role)", func() { @@ -453,14 +458,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08","userIdentity":{"type":"AssumedRole","principalId":"REDACTED:OCM","arn":"arn:aws:sts::1234:assumed-role/testuser/OCM","accountId":"1234","accessKeyId":"REDACTED","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"REDACTED","arn":"arn:aws:iam::1234:role/testuser","accountId":"1234","userName":"testuser"},"webIdFederationData":{},"attributes":{"creationDate":"2023-02-21T04:08:01Z","mfaAuthenticated":"false"}}},"eventTime":"2023-02-21T04:10:40Z","eventSource":"ec2v2types.amazonawsv2.com","eventName":"TerminateInstances","awsRegion":"ap-southeast-1","sourceIPAddress":"192.168.0.0","userAgent":"aws-sdk-go-v2/1.17.3 os/linux lang/go/1.19.5 md/GOOS/linux md/GOARCH/amd64 api/ec2/1.25.0","requestParameters":{"instancesSet":{"items":[{"instanceId":"i-00c1f1234567"}]}},"responseElements":{"requestId":"credacted","instancesSet":{"items":[{"instanceId":"i-00c1f1234567","currentState":{"code":32,"name":"shutting-down"},"previousState":{"code":16,"name":"running"}}]}},"requestID":"credacted","eventID":"e55a8a64-9949-47a9-9fff-12345678","readOnly":false,"eventType":"AwsApiCall","managementEvent":true,"recipientAccountId":"1234","eventCategory":"Management","tlsDetails":{"tlsVersion":"TLSv1.2","cipherSuite":"ECDHE-RSA-AES128-GCM-SHA256","clientProvidedHostHeader":"ec2v2types.ap-southeast-1.amazonawsv2.com"}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEventRaw base data is correct, but the sessionissue's role is not role", func() { @@ -470,14 +474,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08", "userIdentity":{"type":"AssumedRole", "sessionContext":{"sessionIssuer":{"type":"test"}}}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEventRaw has no data", func() { @@ -486,14 +489,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListNonRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) @@ -504,14 +506,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08", "userIdentity":{}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) @@ -522,14 +523,13 @@ var _ = Describe("chgm", func() { r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().ListRunningInstances(gomock.Eq(infraID)).Return([]ec2v2types.Instance{instance}, nil) event.CloudTrailEvent = awsv2.String(`{"eventVersion":"1.08", "userIdentity":{"type":"IAMUser"}}`) r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.OcmClient.(*ocmmock.MockClient).EXPECT().PostLimitedSupportReason(gomock.Eq(cluster), gomock.Eq(&stoppedInfraLS)).Return(nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().SilenceIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeTrue()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasLimitedSupportAction(result.Actions)).To(BeTrue()) + Expect(hasSilenceAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent has more than one resource", func() { @@ -541,13 +541,12 @@ var _ = Describe("chgm", func() { cloudTrailResource := cloudtrailv2types.Resource{ResourceName: awsv2.String("123456")} event.Resources = []cloudtrailv2types.Resource{cloudTrailResource} r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent is empty", func() { @@ -559,13 +558,12 @@ var _ = Describe("chgm", func() { cloudTrailResource := cloudtrailv2types.Resource{ResourceName: awsv2.String("123456")} event.Resources = []cloudtrailv2types.Resource{cloudTrailResource} r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent is an empty string", func() { @@ -577,13 +575,12 @@ var _ = Describe("chgm", func() { cloudTrailResource := cloudtrailv2types.Resource{ResourceName: awsv2.String("123456")} event.Resources = []cloudtrailv2types.Resource{cloudTrailResource} r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent is an empty json", func() { @@ -595,13 +592,12 @@ var _ = Describe("chgm", func() { cloudTrailResource := cloudtrailv2types.Resource{ResourceName: awsv2.String("123456")} event.Resources = []cloudtrailv2types.Resource{cloudTrailResource} r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) When("the returned CloudTrailEvent is an invalid json", func() { @@ -613,13 +609,12 @@ var _ = Describe("chgm", func() { cloudTrailResource := cloudtrailv2types.Resource{ResourceName: awsv2.String("123456")} event.Resources = []cloudtrailv2types.Resource{cloudTrailResource} r.Resources.AwsClient.(*awsmock.MockClient).EXPECT().PollInstanceStopEventsFor(gomock.Any(), gomock.Any()).Return([]cloudtrailv2types.Event{event}, nil) - r.Resources.PdClient.(*pdmock.MockClient).EXPECT().EscalateIncidentWithNote(gomock.Any()).Return(nil) result, gotErr := inv.Run(r) Expect(gotErr).NotTo(HaveOccurred()) - Expect(result.ServiceLogPrepared.Performed).To(BeFalse()) - Expect(result.ServiceLogSent.Performed).To(BeFalse()) - Expect(result.LimitedSupportSet.Performed).To(BeFalse()) + Expect(result.Actions).NotTo(BeEmpty()) + Expect(hasEscalateAction(result.Actions)).To(BeTrue()) + Expect(hasNoteAction(result.Actions)).To(BeTrue()) }) }) }) diff --git a/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go b/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go index e28bf958..b19b3d45 100644 --- a/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go +++ b/pkg/investigations/clustermonitoringerrorbudgetburn/clustermonitoringerrorbudgetburn.go @@ -8,11 +8,13 @@ import ( "strings" configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/configuration-anomaly-detection/pkg/executor" "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" "github.com/openshift/configuration-anomaly-detection/pkg/logging" "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" "github.com/openshift/configuration-anomaly-detection/pkg/ocm" + "github.com/openshift/configuration-anomaly-detection/pkg/types" "k8s.io/apimachinery/pkg/fields" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -69,10 +71,16 @@ func (c *Investigation) Run(rb investigation.ResourceBuilder) (result investigat k8sErr := &investigation.K8SClientError{} if errors.As(err, k8sErr) { if errors.Is(k8sErr.Err, k8sclient.ErrAPIServerUnavailable) { - return result, r.PdClient.EscalateIncidentWithNote("CAD was unable to access cluster's kube-api. Please investigate manually.") + result.Actions = []types.Action{ + executor.Escalate("CAD was unable to access cluster's kube-api. Please investigate manually."), + } + return result, nil } if errors.Is(k8sErr.Err, k8sclient.ErrCannotAccessInfra) { - return result, r.PdClient.EscalateIncidentWithNote("CAD is not allowed to access hive, management or service cluster's kube-api. Please investigate manually.") + result.Actions = []types.Action{ + executor.Escalate("CAD is not allowed to access hive, management or service cluster's kube-api. Please investigate manually."), + } + return result, nil } return result, err } @@ -105,46 +113,56 @@ func (c *Investigation) Run(rb investigation.ResourceBuilder) (result investigat if isUWMConfigInvalid(&monitoringCo) { notes.AppendAutomation("Customer misconfigured the UWM configmap, sending service log and silencing the alert") configMapSL := newUwmConfigMapMisconfiguredSL(monitoringDocLink) - err = r.OcmClient.PostServiceLog(r.Cluster, configMapSL) - if err != nil { - return result, fmt.Errorf("failed posting servicelog: %w", err) - } - // XXX: No metric before - result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil} - return result, r.PdClient.SilenceIncidentWithNote(notes.String()) + result.Actions = []types.Action{ + executor.NewServiceLogAction(configMapSL.Severity, configMapSL.Summary). + WithDescription(configMapSL.Description). + WithServiceName(configMapSL.ServiceName). + Build(), + executor.NoteFrom(notes), + executor.Silence("Customer misconfigured UWM configmap"), + } + return result, nil } if isUWMAlertManagerBroken(&monitoringCo) { notes.AppendAutomation("Customer misconfigured the UWM (UpdatingUserWorkloadAlertmanager), sending service log and silencing the alert") alertManagerSL := newUwmAMMisconfiguredSL(monitoringDocLink) - err = r.OcmClient.PostServiceLog(r.Cluster, alertManagerSL) - if err != nil { - return result, fmt.Errorf("failed posting servicelog: %w", err) - } - // XXX: No metric before - result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil} - return result, r.PdClient.SilenceIncidentWithNote(notes.String()) + result.Actions = []types.Action{ + executor.NewServiceLogAction(alertManagerSL.Severity, alertManagerSL.Summary). + WithDescription(alertManagerSL.Description). + WithServiceName(alertManagerSL.ServiceName). + Build(), + executor.NoteFrom(notes), + executor.Silence("Customer misconfigured UWM AlertManager"), + } + return result, nil } if isUWMPrometheusBroken(&monitoringCo) { notes.AppendAutomation("Customer misconfigured the UWM (UpdatingUserWorkloadPrometheus), sending service log and silencing the alert") genericSL := newUwmGenericMisconfiguredSL(monitoringDocLink) - err = r.OcmClient.PostServiceLog(r.Cluster, genericSL) - if err != nil { - return result, fmt.Errorf("failed posting servicelog: %w", err) - } - // XXX: No metric before - result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil} - return result, r.PdClient.SilenceIncidentWithNote(notes.String()) + result.Actions = []types.Action{ + executor.NewServiceLogAction(genericSL.Severity, genericSL.Summary). + WithDescription(genericSL.Description). + WithServiceName(genericSL.ServiceName). + Build(), + executor.NoteFrom(notes), + executor.Silence("Customer misconfigured UWM Prometheus"), + } + return result, nil } // The UWM configmap is valid, an SRE will need to manually investigate this alert. // Escalate the alert with our findings. notes.AppendSuccess("Monitoring CO not degraded due to UWM misconfiguration") - return result, r.PdClient.EscalateIncidentWithNote(notes.String()) + result.Actions = []types.Action{ + executor.NoteFrom(notes), + executor.Escalate("Monitoring CO not degraded due to UWM misconfiguration - manual investigation required"), + } + return result, nil } func (c *Investigation) Name() string { diff --git a/pkg/investigations/investigation/investigation.go b/pkg/investigations/investigation/investigation.go index 60dc166d..79ad2f97 100644 --- a/pkg/investigations/investigation/investigation.go +++ b/pkg/investigations/investigation/investigation.go @@ -12,6 +12,7 @@ import ( "github.com/openshift/configuration-anomaly-detection/pkg/notewriter" "github.com/openshift/configuration-anomaly-detection/pkg/ocm" "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" + "github.com/openshift/configuration-anomaly-detection/pkg/types" ) type InvestigationStep struct { @@ -20,6 +21,10 @@ type InvestigationStep struct { } type InvestigationResult struct { + // NEW: Actions to execute via reporter (modern approach) + Actions []types.Action + + // EXISTING: Legacy fields (deprecated, maintained for backwards compatibility) LimitedSupportSet InvestigationStep ServiceLogPrepared InvestigationStep ServiceLogSent InvestigationStep diff --git a/pkg/types/action.go b/pkg/types/action.go new file mode 100644 index 00000000..dcd5befa --- /dev/null +++ b/pkg/types/action.go @@ -0,0 +1,17 @@ +// Package types contains shared types used by both investigations and reporter packages +package types + +import "context" + +// Action represents a single external system update +// This interface allows investigations to specify actions without depending on the reporter package +type Action interface { + // Execute performs the action with the provided execution context + Execute(ctx context.Context, execCtx *ExecutionContext) error + + // Type returns the action type identifier as a string + Type() string + + // Validate checks if the action can be executed + Validate() error +} diff --git a/pkg/types/context.go b/pkg/types/context.go new file mode 100644 index 00000000..3d9e9aa6 --- /dev/null +++ b/pkg/types/context.go @@ -0,0 +1,25 @@ +package types + +import ( + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" + "github.com/openshift/configuration-anomaly-detection/pkg/ocm" + "github.com/openshift/configuration-anomaly-detection/pkg/pagerduty" + "go.uber.org/zap" +) + +// ExecutionContext provides resources needed by actions during execution +type ExecutionContext struct { + // Cluster being operated on + Cluster *cmv1.Cluster + + // Client instances + OCMClient ocm.Client + PDClient pagerduty.Client + // Future: BackplaneClient backplane.Client + + // Metadata + InvestigationName string + + // Logger for action execution + Logger *zap.SugaredLogger +}