-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_alerts.go
154 lines (137 loc) · 4.65 KB
/
run_alerts.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
package main
import (
"bytes"
"strconv"
)
// AlertsForRun creates a currentFail entry for this Run (if not already done)
// and rings the corresponding BAD alert. The alert is only rung while the
// entry's FailCount is not above 1, i.e. the failure was not reported yet.
func (run *Run) AlertsForRun() {
	// The key is deliberately built from the host name alone: one Fail per
	// host. Mixing run.Errors into the key would flood the user with nearly
	// identical alerts (ex: "alert, ssh connection 11s", then the same with
	// 11.5s, etc). If there's an issue with a host, it has to be fixed before
	// the other ones (if any left) show up.
	var key bytes.Buffer
	key.WriteString(run.Host.Name)

	fail := CurrentFailGetAndInc(MD5Hash(key.String()))
	fail.RelatedHost = run.Host

	if fail.FailCount <= 1 {
		msg := AlertMessageCreateForRun(AlertBad, run, fail)
		msg.RingAlerts()
	}
}
// AlertsForTasks creates currentFail entries for each failed TaskResults
// (if not already done) and rings corresponding alerts.
//
// A task failure is keyed by the host name, the probe name and the text of
// every error of the task result. Task results without errors are ignored.
// A failure whose FailCount is above 1 is considered already reported: no
// alert is rung for it, but the remaining task results are still processed.
func (run *Run) AlertsForTasks() {
	for _, taskRes := range run.TaskResults {
		if len(taskRes.Errors) == 0 {
			continue
		}

		var bbuf bytes.Buffer
		bbuf.WriteString(run.Host.Name + taskRes.Task.Probe.Name)
		for _, err := range taskRes.Errors {
			bbuf.WriteString(err.Error())
		}
		hash := MD5Hash(bbuf.String())

		currentFail := CurrentFailGetAndInc(hash)
		currentFail.RelatedTTask = taskRes.Task
		if currentFail.FailCount > 1 {
			// Already alerted for this failure. Use continue (not return) so
			// the other failed tasks of this run are still counted and
			// reported instead of being silently dropped.
			continue
		}

		message := AlertMessageCreateForTaskResult(AlertBad, run, taskRes, currentFail)
		message.RingAlerts()
	}
}
// AlertsForChecks walks every TaskResults of the Run twice: first to count
// FailedChecks and ring a BAD alert when a check reaches its NeededFailures,
// then to handle SuccessfulChecks that still have a currentFail entry and
// ring a GOOD alert once NeededSuccesses is reached.
//
// A check is keyed by host name, probe name and check index.
func (run *Run) AlertsForChecks() {
	// Failures
	for _, res := range run.TaskResults {
		for _, chk := range res.FailedChecks {
			Info.Printf("task '%s', check '%s' failed (%s)\n", res.Task.Probe.Name, chk.Desc, run.Host.Name)

			key := MD5Hash(run.Host.Name + res.Task.Probe.Name + strconv.Itoa(chk.Index))
			fail := CurrentFailGetAndInc(key)
			fail.RelatedTask = res.Task

			// Ring exactly when the threshold is hit: below it is "not yet",
			// above it is "already done".
			if fail.FailCount == chk.NeededFailures {
				msg := AlertMessageCreateForCheck(AlertBad, run, res, chk, fail)
				msg.RingAlerts()
			}
		}
	}

	// Successes
	for _, res := range run.TaskResults {
		for _, chk := range res.SuccessfulChecks {
			key := MD5Hash(run.Host.Name + res.Task.Probe.Name + strconv.Itoa(chk.Index))

			// Nothing to do unless a failure was recorded for that check.
			fail := CurrentFailGetAndDec(key)
			if fail == nil {
				continue
			}
			if fail.OkCount != chk.NeededSuccesses {
				continue
			}

			Info.Printf("task '%s', check '%s' is now OK (%s)\n", res.Task.Probe.Name, chk.Desc, run.Host.Name)

			// Send the good news only if the bad one was sent, then drop
			// this currentFail in every case.
			if fail.FailCount >= chk.NeededFailures {
				msg := AlertMessageCreateForCheck(AlertGood, run, res, chk, fail)
				msg.RingAlerts()
			}
			CurrentFailDelete(key)
		}
	}
}
// Alerts is the alerting entry point for a Run: it looks for Run failures,
// Task failures and Check failures and calls the matching AlertsFor*()
// function. Failed tasks are rescheduled at the end in every case.
func (run *Run) Alerts() {
	run.ClearAnyCurrentTasksFails()

	switch {
	case run.totalErrorCount() == 0:
		// No run or task error so far: clear host-level fails, then run the
		// checks, which may themselves reveal task errors.
		run.ClearAnyCurrentRunFails()
		run.DoChecks()
		if run.totalTaskResultErrorCount() > 0 {
			Info.Printf("found some 'tasks' error(s) (post-checks)\n")
			run.AlertsForTasks()
		} else {
			// ideal path, only check errors may remain
			run.AlertsForChecks()
		}

	case len(run.Errors) > 0:
		// run-level errors
		Info.Printf("found some 'run' error(s)\n")
		run.AlertsForRun()
		run.ReSchedule()

	default:
		// task-level errors only
		Info.Printf("found some 'tasks' error(s)\n")
		run.AlertsForTasks()
	}

	run.ReScheduleFailedTasks()
}
// ClearAnyCurrentRunFails looks up every currentFail related to the Host of
// this Run, rings a GOOD alert for it and then deletes it.
func (run *Run) ClearAnyCurrentRunFails() {
	for key, fail := range currentFails {
		if fail.RelatedHost != run.Host {
			continue
		}

		// One message per matching currentFail. A single message for the
		// whole host used to be rung; the original note ties the per-entry
		// behavior to the UniqueID idea.
		msg := AlertMessageCreateForRun(AlertGood, run, fail)
		msg.RingAlerts()
		CurrentFailDelete(key)
	}
}
// ClearAnyCurrentTasksFails handles the tasks of this Run that ended without
// error: for each of them, every currentFail whose RelatedTTask is that task
// gets a GOOD alert and is then deleted.
func (run *Run) ClearAnyCurrentTasksFails() {
	for _, res := range run.TaskResults {
		// Tasks still in error keep their currentFail entries.
		if len(res.Errors) != 0 {
			continue
		}

		for key, fail := range currentFails {
			if res.Task != fail.RelatedTTask {
				continue
			}
			msg := AlertMessageCreateForTaskResult(AlertGood, run, res, fail)
			msg.RingAlerts()
			CurrentFailDelete(key)
		}
	}
}