Skip to content

Commit

Permalink
etcdserver: add metric counters for livez/readyz health checks.
Browse files Browse the repository at this point in the history
Signed-off-by: Siyuan Zhang <[email protected]>
  • Loading branch information
siyuanfoundation committed Nov 13, 2023
1 parent b343231 commit f3c6db5
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 20 deletions.
100 changes: 80 additions & 20 deletions server/etcdserver/api/etcdhttp/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// This file defines the HTTP endpoints for etcd health checks.
// The endpoints include /livez, /readyz and /health.

package etcdhttp

import (
Expand All @@ -34,8 +37,13 @@ import (
)

const (
PathHealth = "/health"
PathProxyHealth = "/proxy/health"
PathHealth = "/health"
PathProxyHealth = "/proxy/health"
// HealthStatusSuccess and HealthStatusError are the two values stored in
// HealthStatus.Status; the same strings are passed to recordMetrics, which
// uses them as the "status" label of the health check counter.
HealthStatusSuccess string = "success"
HealthStatusError string = "error"
// The checkType* constants name the kind of health check. They are used as
// the "type" metric label, and CheckRegistry.RootPath prefixes them with "/"
// to form the root path of an endpoint.
checkTypeLivez = "livez"
checkTypeReadyz = "readyz"
checkTypeHealth = "health"
)

type ServerHealth interface {
Expand Down Expand Up @@ -82,8 +90,10 @@ func NewHealthHandler(lg *zap.Logger, hfunc func(ctx context.Context, excludedAl
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
recordMetrics(checkTypeHealth, "root", HealthStatusSuccess)
} else {
healthFailed.Inc()
recordMetrics(checkTypeHealth, "root", HealthStatusError)
}
}()
d, _ := json.Marshal(h)
Expand All @@ -99,23 +109,43 @@ func NewHealthHandler(lg *zap.Logger, hfunc func(ctx context.Context, excludedAl
}

var (
// healthSuccess is incremented by the handler built in NewHealthHandler
// whenever the reported Health is "true".
//
// Deprecated: use healthCheckCounter instead.
healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "health_success",
Help: "The total number of successful health checks",
Help: "The total number of successful health checks (Deprecated, use etcd_server_healthchecks_total instead)",
})
// healthFailed is incremented by the handler built in NewHealthHandler
// whenever the reported Health is anything other than "true".
//
// Deprecated: use healthCheckCounter instead.
healthFailed = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "health_failures",
Help: "The total number of failed health checks",
Help: "The total number of failed health checks (Deprecated, use etcd_server_healthchecks_total instead)",
})
// healthCheckGauge holds the most recent result of each health check,
// labeled by check type and check name. recordMetrics sets it to 1 when the
// status is HealthStatusSuccess and to 0 for any other status.
healthCheckGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "healthcheck",
Help: "The result of each kind of healthcheck.",
},
[]string{"type", "name"},
)
// healthCheckCounter counts recorded health check results, labeled by check
// type, check name and status. recordMetrics increments it once per call.
healthCheckCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "healthchecks_total",
Help: "The total number of each kind of healthcheck.",
},
[]string{"type", "name", "status"},
)
)

// init registers all health check collectors with the default Prometheus
// registerer. MustRegister panics if any of the registrations fails.
func init() {
	prometheus.MustRegister(
		healthSuccess,
		healthFailed,
		healthCheckGauge,
		healthCheckCounter,
	)
}

// Health defines etcd server health status.
Expand All @@ -125,6 +155,12 @@ type Health struct {
Reason string `json:"reason"`
}

// HealthStatus is used in new /readyz or /livez health checks instead of the Health struct.
type HealthStatus struct {
// Reason is the combined output of the executed checks: runHealthChecks
// writes one "[+]<name> ok" or "[-]<name> failed: <err>" line per check.
Reason string `json:"reason"`
// Status is HealthStatusSuccess when every executed check passed and
// HealthStatusError as soon as one of them returned an error.
Status string `json:"status"`
}

func getQuerySet(r *http.Request, query string) StringSet {
querySet := make(map[string]struct{})
qs, found := r.URL.Query()[query]
Expand Down Expand Up @@ -201,18 +237,18 @@ func checkAPI(ctx context.Context, lg *zap.Logger, srv ServerHealth, serializabl
// HealthCheck is a single named health probe. A nil return value means the
// check passed; a non-nil error marks the check as failed and its text is
// included in the reported reason.
type HealthCheck func(ctx context.Context) error

// CheckRegistry holds a set of named health checks that are served together:
// InstallHttpEndpoints exposes them under one root path plus one sub path per
// check name. Checks are added with Register, keyed by their name.
type CheckRegistry struct {
path string
checks map[string]HealthCheck
checkType string
checks map[string]HealthCheck
}

// installLivezEndpoints builds the liveness check registry, registers the
// "serializable_read" check backed by server, and installs the registry's
// HTTP handlers (the /livez root and its per-check sub paths) on mux.
func installLivezEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) {
reg := CheckRegistry{path: "/livez", checks: make(map[string]HealthCheck)}
reg := CheckRegistry{checkType: checkTypeLivez, checks: make(map[string]HealthCheck)}
reg.Register("serializable_read", serializableReadCheck(server))
reg.InstallHttpEndpoints(lg, mux)
}

func installReadyzEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) {
reg := CheckRegistry{path: "/readyz", checks: make(map[string]HealthCheck)}
reg := CheckRegistry{checkType: checkTypeReadyz, checks: make(map[string]HealthCheck)}
reg.Register("data_corruption", activeAlarmCheck(server, pb.AlarmType_CORRUPT))
reg.Register("serializable_read", serializableReadCheck(server))
reg.InstallHttpEndpoints(lg, mux)
Expand All @@ -222,26 +258,30 @@ func (reg *CheckRegistry) Register(name string, check HealthCheck) {
reg.checks[name] = check
}

// RootPath returns the URL path of the registry's root endpoint: the check
// type prefixed with a slash, e.g. "/livez" for checkTypeLivez.
func (reg *CheckRegistry) RootPath() string {
	const prefix = "/"
	rootPath := prefix + reg.checkType
	return rootPath
}

// InstallHttpEndpoints installs the registry's HTTP handlers on mux: one
// handler for the root path, which is given the names of all registered
// checks, and one handler per registered check at <root path>/<check name>,
// which runs only that check. lg is passed on to the handlers for logging.
func (reg *CheckRegistry) InstallHttpEndpoints(lg *zap.Logger, mux *http.ServeMux) {
// Collect the registered check names; map iteration order is random, so the
// order of checkNames is not stable between calls.
checkNames := make([]string, 0, len(reg.checks))
for k := range reg.checks {
checkNames = append(checkNames, k)
}

// installs the http handler for the root path.
reg.installRootHttpEndpoint(lg, mux, reg.path, checkNames...)
reg.installRootHttpEndpoint(lg, mux, checkNames...)
for _, checkName := range checkNames {
// installs the http handler for the individual check sub path.
subpath := path.Join(reg.path, checkName)
subpath := path.Join(reg.RootPath(), checkName)
// Copy the loop variable so that each handler closure below captures its own
// check name.
check := checkName
mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) Health {
mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) HealthStatus {
return reg.runHealthChecks(r.Context(), check)
}))
}
}

func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) Health {
h := Health{Health: "true"}
func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) HealthStatus {
h := HealthStatus{Status: HealthStatusSuccess}
var individualCheckOutput bytes.Buffer
for _, checkName := range checkNames {
check, found := reg.checks[checkName]
Expand All @@ -250,29 +290,33 @@ func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...str
}
if err := check(ctx); err != nil {
fmt.Fprintf(&individualCheckOutput, "[-]%s failed: %v\n", checkName, err)
h.Health = "false"
h.Status = HealthStatusError
recordMetrics(reg.checkType, checkName, HealthStatusError)
} else {
fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", checkName)
recordMetrics(reg.checkType, checkName, HealthStatusSuccess)
}
}
h.Reason = individualCheckOutput.String()
return h
}

// installRootHttpEndpoint installs the http handler for the root path.
// For every request the handler reads the "exclude" query parameter, passes
// the given check names and the excluded set to filterCheckList, and runs the
// checks that function returns. The aggregated status is recorded as a metric
// under the check name "root".
func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, path string, checks ...string) {
hfunc := func(r *http.Request) Health {
func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, checks ...string) {
hfunc := func(r *http.Request) HealthStatus {
// Extract the names of the health checks to exclude from the "exclude"
// query parameter.
excluded := getQuerySet(r, "exclude")

filteredCheckNames := filterCheckList(lg, listToStringSet(checks), excluded)
return reg.runHealthChecks(r.Context(), filteredCheckNames...)
h := reg.runHealthChecks(r.Context(), filteredCheckNames...)
// Record the overall result in addition to the per-check results that
// runHealthChecks records itself.
recordMetrics(reg.checkType, "root", h.Status)
return h
}
mux.Handle(path, newHealthHandler(path, lg, hfunc))
mux.Handle(reg.RootPath(), newHealthHandler(reg.RootPath(), lg, hfunc))
}

// newHealthHandler generates a http HandlerFunc for a health check function hfunc.
func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Health) http.HandlerFunc {
func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) HealthStatus) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
w.Header().Set("Allow", http.MethodGet)
Expand All @@ -282,7 +326,7 @@ func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Hea
}
h := hfunc(r)
// Always returns detailed reason for failed checks.
if h.Health != "true" {
if h.Status == HealthStatusError {
http.Error(w, h.Reason, http.StatusServiceUnavailable)
lg.Error("Health check error", zap.String("path", path), zap.String("reason", h.Reason), zap.Int("status-code", http.StatusServiceUnavailable))
return
Expand Down Expand Up @@ -342,6 +386,22 @@ func listToStringSet(list []string) StringSet {
return set
}

// recordMetrics publishes the outcome of one health check run.
//
// The gauge for (checkType, name) is set to 1 when status equals
// HealthStatusSuccess and to 0 for any other status. The counter for
// (checkType, name, status) is incremented by one.
func recordMetrics(checkType, name string, status string) {
	// Latest result: 1 means success, 0 means anything else.
	var gaugeValue float64
	if status == HealthStatusSuccess {
		gaugeValue = 1
	}
	gaugeLabels := prometheus.Labels{"type": checkType, "name": name}
	healthCheckGauge.With(gaugeLabels).Set(gaugeValue)

	// Running total per outcome.
	counterLabels := prometheus.Labels{"type": checkType, "name": name, "status": status}
	healthCheckCounter.With(counterLabels).Inc()
}

// activeAlarmCheck checks if a specific alarm type is active in the server.
func activeAlarmCheck(srv ServerHealth, at pb.AlarmType) func(context.Context) error {
return func(ctx context.Context) error {
Expand Down
54 changes: 54 additions & 0 deletions server/etcdserver/api/etcdhttp/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"strings"
"testing"

"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap/zaptest"

"go.etcd.io/raft/v3"
Expand Down Expand Up @@ -193,6 +194,7 @@ func TestHttpSubPath(t *testing.T) {
ts := httptest.NewServer(mux)
defer ts.Close()
checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult)
checkMetrics(t, tt.healthCheckURL, "", tt.expectStatusCode)
})
}
}
Expand Down Expand Up @@ -291,6 +293,7 @@ func TestSerializableReadCheck(t *testing.T) {
ts := httptest.NewServer(mux)
defer ts.Close()
checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult)
checkMetrics(t, tt.healthCheckURL, "serializable_read", tt.expectStatusCode)
})
}
}
Expand Down Expand Up @@ -323,3 +326,54 @@ func checkHttpResponse(t *testing.T, ts *httptest.Server, url string, expectStat
}
}
}

// checkMetrics verifies the health check metrics recorded for one request and
// then resets both metric vectors so the next caller starts from zero.
//
// url is the requested health check URL; its query string is ignored. When
// checkName is empty, url must have the form /<type>/<check> and both the
// check type and the check name are taken from it. Otherwise the path of url
// is used as the check type. expectStatusCode is the HTTP status the request
// was expected to return: http.StatusOK means the check is expected to have
// been recorded as a success, any other code as an error.
//
// The expected counter values assume that both vectors were empty before the
// request under test was issued.
// NOTE(review): tests that hit the endpoints without calling checkMetrics
// leave samples behind — confirm they run after a reset or reset them too.
func checkMetrics(t *testing.T, url, checkName string, expectStatusCode int) {
	t.Helper()
	defer healthCheckGauge.Reset()
	defer healthCheckCounter.Reset()

	// Fully qualified names, i.e. namespace "etcd", subsystem "server" and the
	// metric name joined with underscores.
	const (
		gaugeName   = "etcd_server_healthcheck"
		counterName = "etcd_server_healthchecks_total"
	)

	typeName := strings.TrimPrefix(strings.Split(url, "?")[0], "/")
	if len(checkName) == 0 {
		parts := strings.Split(typeName, "/")
		if len(parts) < 2 {
			t.Fatalf("cannot derive the check name from url %q: it has no sub path and no checkName was given", url)
		}
		typeName, checkName = parts[0], parts[1]
	}

	expectedSuccessCount := 1
	expectedErrorCount := 0
	if expectStatusCode != http.StatusOK {
		expectedSuccessCount = 0
		expectedErrorCount = 1
	}

	families, err := prometheus.DefaultGatherer.Gather()
	if err != nil {
		t.Fatalf("failed to gather metrics: %v", err)
	}
	for _, mf := range families {
		name := mf.GetName()
		if name != gaugeName && name != counterName {
			continue
		}
		// A family holds one sample per label combination, so every sample has
		// to be inspected to find the one that belongs to the check under test.
		for _, m := range mf.GetMetric() {
			labelMap := make(map[string]string, len(m.GetLabel()))
			for _, label := range m.GetLabel() {
				labelMap[label.GetName()] = label.GetValue()
			}
			if labelMap["type"] != typeName || labelMap["name"] != checkName {
				continue
			}

			var val int
			if name == gaugeName {
				val = int(m.GetGauge().GetValue())
			} else {
				val = int(m.GetCounter().GetValue())
			}

			// Only the counter carries a "status" label. The gauge, and the
			// counter sample for successes, are compared with the success count.
			if statusLabel, found := labelMap["status"]; found && statusLabel == HealthStatusError {
				if val != expectedErrorCount {
					t.Fatalf("%s got errorCount %d, wanted %d\n", name, val, expectedErrorCount)
				}
			} else {
				if val != expectedSuccessCount {
					t.Fatalf("%s got successCount %d, wanted %d\n", name, val, expectedSuccessCount)
				}
			}
		}
	}
}

0 comments on commit f3c6db5

Please sign in to comment.