Skip to content

Commit 7086f1d

Browse files
bergmannfclaude
andcommitted
Migrate CHGM investigation to use executor actions
This commit migrates the Cluster Has Gone Missing (CHGM) investigation to use the new executor action pattern while maintaining proper error handling for infrastructure failures. **Key changes:** 1. **Action-based remediation:** - Replaced direct OCM/PagerDuty calls with declarative actions - Service logs, limited support, notes, and incident management now use executor action builders - Investigation returns actions to be executed by the executor 2. **Smart error handling:** - Added isInfrastructureError() helper to distinguish between: * Infrastructure failures (AWS/OCM API errors) → return error for retry * Investigation findings (data too old, inconclusive) → return actions - Resource building errors continue to return errors (no change) 3. **Removed dependencies:** - Deleted postStoppedInfraLimitedSupport() helper function - Removed utils.WithRetries() usage - Removed unused pagerduty package import 4. **Investigation outcomes now use actions:** - Stopped instances → Limited support + silence - Egress blocked (deadman's snitch) → Limited support + silence - Egress blocked (other URLs) → Service log + escalate - No issues found → Note + escalate - Investigation incomplete → Note + escalate **Benefits:** - Separation of investigation logic from external system updates - Automatic retry for transient failures via executor - Type-safe action builders (no interface{} types) - Declarative results easier to test and understand - Consistent error handling across investigations **Breaking change:** Tests need to be updated to verify actions instead of mocking direct OCM/PagerDuty calls. This will be addressed in a follow-up commit. Related: Phase 1 of executor module implementation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent 4f3a706 commit 7086f1d

File tree

1 file changed

+95
-33
lines changed

1 file changed

+95
-33
lines changed

pkg/investigations/chgm/chgm.go

Lines changed: 95 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ import (
88
"time"
99

1010
"github.com/openshift/configuration-anomaly-detection/pkg/aws"
11+
"github.com/openshift/configuration-anomaly-detection/pkg/executor"
1112
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/types"
1214
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
1315
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
1416
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
1517
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
16-
"github.com/openshift/configuration-anomaly-detection/pkg/pagerduty"
17-
"github.com/openshift/configuration-anomaly-detection/pkg/utils"
1818
hivev1 "github.com/openshift/hive/apis/hive/v1"
1919

2020
ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
@@ -35,6 +35,52 @@ var (
3535

3636
type Investiation struct{}
3737

38+
// isInfrastructureError determines if an error is a transient infrastructure
39+
// failure that should cause the investigation to be retried (return error),
40+
// vs an investigation finding that should be reported (return actions).
41+
func isInfrastructureError(err error) bool {
42+
if err == nil {
43+
return false
44+
}
45+
46+
errStr := err.Error()
47+
48+
// Transient API failures that should trigger retry
49+
infraPatterns := []string{
50+
"could not retrieve non running instances",
51+
"could not retrieve running cluster nodes",
52+
"could not retrieve expected cluster nodes",
53+
"could not PollStopEventsFor",
54+
"failed to retrieve",
55+
"timeout",
56+
"rate limit",
57+
"connection refused",
58+
"service unavailable",
59+
}
60+
61+
for _, pattern := range infraPatterns {
62+
if strings.Contains(errStr, pattern) {
63+
return true
64+
}
65+
}
66+
67+
// Investigation findings that should be reported (NOT retried)
68+
findingPatterns := []string{
69+
"stopped instances but no stoppedInstancesEvents", // CloudTrail data too old
70+
"clusterdeployment is empty", // Data missing but not retriable
71+
"no non running instances found", // Investigation finding
72+
}
73+
74+
for _, pattern := range findingPatterns {
75+
if strings.Contains(errStr, pattern) {
76+
return false
77+
}
78+
}
79+
80+
// Default: assume infrastructure error to be safe (retry)
81+
return true
82+
}
83+
3884
// Run runs the investigation for a triggered chgm pagerduty event
3985
func (c *Investiation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) {
4086
result := investigation.InvestigationResult{}
@@ -48,19 +94,34 @@ func (c *Investiation) Run(rb investigation.ResourceBuilder) (investigation.Inve
4894
// 1. Check if the user stopped instances
4995
res, err := investigateStoppedInstances(r.Cluster, r.ClusterDeployment, r.AwsClient, r.OcmClient)
5096
if err != nil {
51-
return result, r.PdClient.EscalateIncidentWithNote(fmt.Sprintf("InvestigateInstances failed: %s\n", err.Error()))
97+
// Check if this is a transient infrastructure error (AWS/OCM API failures)
98+
// These should trigger a retry of the entire investigation
99+
if isInfrastructureError(err) {
100+
return result, fmt.Errorf("investigation infrastructure failure: %w", err)
101+
}
102+
103+
// Otherwise, it's an investigation finding (e.g., CloudTrail data too old)
104+
// Report this as a finding that needs manual investigation
105+
notes.AppendWarning("Could not complete instance investigation: %s", err.Error())
106+
result.Actions = []types.Action{
107+
executor.NoteFrom(notes),
108+
executor.Escalate("Investigation incomplete - manual review required"),
109+
}
110+
return result, nil
52111
}
53112
logging.Debugf("the investigation returned: [infras running: %d] - [masters running: %d]", res.RunningInstances.Infra, res.RunningInstances.Master)
54113

55114
if !res.UserAuthorized {
56115
logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName)
57-
return result, utils.WithRetries(func() error {
58-
err := postStoppedInfraLimitedSupport(r.Cluster, r.OcmClient, r.PdClient)
59-
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
60-
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"StoppedInstances"}}
116+
notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.")
61117

62-
return err
63-
})
118+
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"StoppedInstances"}}
119+
result.Actions = []types.Action{
120+
executor.NewLimitedSupportAction(stoppedInfraLS.Summary, stoppedInfraLS.Details).Build(),
121+
executor.NoteFrom(notes),
122+
executor.Silence("Customer stopped instances - cluster in limited support"),
123+
}
124+
return result, nil
64125
}
65126
notes.AppendSuccess("Customer did not stop nodes.")
66127
logging.Info("The customer has not stopped/terminated any nodes.")
@@ -92,36 +153,45 @@ func (c *Investiation) Run(rb investigation.ResourceBuilder) (investigation.Inve
92153
logging.Infof("Network verifier reported failure: %s", failureReason)
93154

94155
if strings.Contains(failureReason, "nosnch.in") {
95-
err := r.OcmClient.PostLimitedSupportReason(r.Cluster, &egressLS)
96-
if err != nil {
97-
return result, err
98-
}
156+
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
99157

100-
// XXX: metrics.Inc(metrics.LimitedSupportSet, investigationName, "EgressBlocked")
101158
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}}
102-
103-
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
104-
return result, r.PdClient.SilenceIncidentWithNote(notes.String())
159+
result.Actions = []types.Action{
160+
executor.NewLimitedSupportAction(egressLS.Summary, egressLS.Details).
161+
WithContext("EgressBlocked").
162+
Build(),
163+
executor.NoteFrom(notes),
164+
executor.Silence("Deadman's snitch blocked - cluster in limited support"),
165+
}
166+
return result, nil
105167
}
106168

107169
docLink := ocm.DocumentationLink(product, ocm.DocumentationTopicPrivatelinkFirewall)
108170
egressSL := createEgressSL(failureReason, docLink)
109-
err := r.OcmClient.PostServiceLog(r.Cluster, egressSL)
110-
if err != nil {
111-
return result, err
112-
}
113-
114-
// XXX: metrics.Inc(metrics.ServicelogSent, investigationName)
115-
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
116171

117172
notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
173+
174+
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
175+
result.Actions = []types.Action{
176+
executor.NewServiceLogAction(egressSL.Severity, egressSL.Summary).
177+
WithDescription(egressSL.Description).
178+
WithServiceName(egressSL.ServiceName).
179+
Build(),
180+
executor.NoteFrom(notes),
181+
executor.Escalate("Egress blocked but not deadman's snitch - manual investigation required"),
182+
}
183+
return result, nil
118184
case networkverifier.Success:
119185
notes.AppendSuccess("Network verifier passed")
120186
logging.Info("Network verifier passed.")
121187
}
122188

123189
// Found no issues that CAD can handle by itself - forward notes to SRE.
124-
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
190+
result.Actions = []types.Action{
191+
executor.NoteFrom(notes),
192+
executor.Escalate("No automated remediation available - manual investigation required"),
193+
}
194+
return result, nil
125195
}
126196

127197
func (c *Investiation) Name() string {
@@ -453,12 +523,4 @@ func extractUserDetails(cloudTrailEvent *string) (CloudTrailEventRaw, error) {
453523
return res, nil
454524
}
455525

456-
// postStoppedInfraLimitedSupport will put the cluster on limited support because the user has stopped instances
457-
func postStoppedInfraLimitedSupport(cluster *cmv1.Cluster, ocmCli ocm.Client, pdCli pagerduty.Client) error {
458-
err := ocmCli.PostLimitedSupportReason(cluster, &stoppedInfraLS)
459-
if err != nil {
460-
return fmt.Errorf("failed sending limited support reason: %w", err)
461-
}
462526

463-
return pdCli.SilenceIncidentWithNote("Customer stopped instances. Sent SL and silencing alert.")
464-
}

0 commit comments

Comments
 (0)