|
| 1 | +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one |
| 2 | +// or more contributor license agreements. Licensed under the Elastic License 2.0; |
| 3 | +// you may not use this file except in compliance with the Elastic License 2.0. |
| 4 | + |
| 5 | +package elasticdiagnostics |
| 6 | + |
| 7 | +import ( |
| 8 | + "bytes" |
| 9 | + "context" |
| 10 | + "encoding/json" |
| 11 | + "errors" |
| 12 | + "fmt" |
| 13 | + "net" |
| 14 | + "net/http" |
| 15 | + "runtime/pprof" |
| 16 | + "sync" |
| 17 | + "time" |
| 18 | + |
| 19 | + "go.opentelemetry.io/collector/component" |
| 20 | + "go.opentelemetry.io/collector/confmap" |
| 21 | + "go.uber.org/zap" |
| 22 | + "go.yaml.in/yaml/v3" |
| 23 | + "google.golang.org/protobuf/types/known/timestamppb" |
| 24 | + |
| 25 | + "github.com/elastic/elastic-agent-client/v7/pkg/proto" |
| 26 | + "github.com/elastic/elastic-agent-libs/logp" |
| 27 | + "github.com/elastic/elastic-agent/internal/pkg/diagnostics" |
| 28 | + "github.com/elastic/elastic-agent/pkg/ipc" |
| 29 | +) |
| 30 | + |
| 31 | +var ( |
| 32 | + _ component.Component = (*diagnosticsExtension)(nil) |
| 33 | +) |
| 34 | + |
// diagHook describes a single diagnostic artifact together with the callback
// that produces its content on demand.
type diagHook struct {
	// description, filename and contentType are copied verbatim into the
	// Description, Filename and ContentType fields of the result that is built
	// for this hook when a diagnostics request is served.
	description, filename, contentType string
	// hook is invoked on every diagnostics request; the bytes it returns become
	// the Content of the result.
	hook func() []byte
}
| 41 | + |
| 42 | +type diagnosticsExtension struct { |
| 43 | + listener net.Listener |
| 44 | + server *http.Server |
| 45 | + logger *zap.Logger |
| 46 | + logp *logp.Logger |
| 47 | + |
| 48 | + diagnosticsConfig *Config |
| 49 | + collectorConfig *confmap.Conf |
| 50 | + componentHooks map[string][]*diagHook |
| 51 | + globalHooks map[string]*diagHook |
| 52 | + |
| 53 | + mx sync.Mutex |
| 54 | + hooksMtx sync.Mutex |
| 55 | + configMtx sync.Mutex |
| 56 | +} |
| 57 | + |
| 58 | +func (d *diagnosticsExtension) Start(ctx context.Context, host component.Host) error { |
| 59 | + d.mx.Lock() |
| 60 | + defer d.mx.Unlock() |
| 61 | + var err error |
| 62 | + |
| 63 | + d.logp, err = logp.NewZapLogger(d.logger) |
| 64 | + if err != nil { |
| 65 | + // NewZapLogger always returns nil error, so this shouldn't happen. |
| 66 | + return fmt.Errorf("failed to create logp.Logger from zap logger: %w", err) |
| 67 | + } |
| 68 | + |
| 69 | + d.registerGlobalDiagnostics() |
| 70 | + |
| 71 | + d.listener, err = ipc.CreateListener(d.logp, d.diagnosticsConfig.Endpoint) |
| 72 | + if err != nil { |
| 73 | + return fmt.Errorf("error creating listener: %w", err) |
| 74 | + } |
| 75 | + |
| 76 | + mux := http.NewServeMux() |
| 77 | + mux.Handle("/diagnostics", d) |
| 78 | + |
| 79 | + d.server = &http.Server{ |
| 80 | + Handler: mux, |
| 81 | + ReadHeaderTimeout: 30 * time.Second, |
| 82 | + } |
| 83 | + go func() { |
| 84 | + if err := d.server.Serve(d.listener); err != nil && !errors.Is(err, http.ErrServerClosed) { |
| 85 | + d.logger.Error("HTTP server error", zap.Error(err)) |
| 86 | + } |
| 87 | + }() |
| 88 | + d.logger.Info("Diagnostics extension started", zap.String("address", d.listener.Addr().String())) |
| 89 | + return nil |
| 90 | +} |
| 91 | + |
| 92 | +func (d *diagnosticsExtension) Shutdown(ctx context.Context) error { |
| 93 | + d.mx.Lock() |
| 94 | + defer d.mx.Unlock() |
| 95 | + if d.server == nil { |
| 96 | + return nil |
| 97 | + } |
| 98 | + if err := d.server.Shutdown(ctx); err != nil { |
| 99 | + return err |
| 100 | + } |
| 101 | + ipc.CleanupListener(d.logp, d.diagnosticsConfig.Endpoint) |
| 102 | + return nil |
| 103 | +} |
| 104 | + |
| 105 | +func (d *diagnosticsExtension) registerGlobalDiagnostics() { |
| 106 | + d.globalHooks["collector_config"] = &diagHook{ |
| 107 | + description: "full collector configuration", |
| 108 | + filename: "edot/otel-merged-actual.yaml", |
| 109 | + contentType: "application/yaml", |
| 110 | + hook: func() []byte { |
| 111 | + d.configMtx.Lock() |
| 112 | + defer d.configMtx.Unlock() |
| 113 | + if d.collectorConfig == nil { |
| 114 | + return []byte("no active OTel Configuration") |
| 115 | + } |
| 116 | + b, err := yaml.Marshal(d.collectorConfig.ToStringMap()) |
| 117 | + if err != nil { |
| 118 | + return fmt.Appendf(nil, "error: failed to convert to yaml: %v", err) |
| 119 | + } |
| 120 | + return b |
| 121 | + }, |
| 122 | + } |
| 123 | + |
| 124 | + // register basic profiles. |
| 125 | + for _, profile := range []string{"goroutine", "heap", "allocs", "mutex", "threadcreate", "block"} { |
| 126 | + d.globalHooks[profile] = &diagHook{ |
| 127 | + description: fmt.Sprintf("%s profile of the collector", profile), |
| 128 | + filename: fmt.Sprintf("edot/%s.profile.gz", profile), |
| 129 | + contentType: "application/octet-stream", |
| 130 | + hook: func() []byte { |
| 131 | + var buf bytes.Buffer |
| 132 | + err := pprof.Lookup(profile).WriteTo(&buf, 0) |
| 133 | + if err != nil { |
| 134 | + return fmt.Appendf(nil, "error: failed to get %s profile: %v", profile, err) |
| 135 | + } |
| 136 | + return buf.Bytes() |
| 137 | + }, |
| 138 | + } |
| 139 | + } |
| 140 | +} |
| 141 | + |
| 142 | +func (d *diagnosticsExtension) NotifyConfig(ctx context.Context, conf *confmap.Conf) error { |
| 143 | + d.configMtx.Lock() |
| 144 | + defer d.configMtx.Unlock() |
| 145 | + d.collectorConfig = conf |
| 146 | + return nil |
| 147 | +} |
| 148 | + |
| 149 | +// RegisterDiagnosticHook API exposes the ability for beat receivers to register their hooks. |
| 150 | +// NOTE: Changing the function signature will require changes to libbeat and beatreceivers. Proceed with caution. |
| 151 | +func (d *diagnosticsExtension) RegisterDiagnosticHook(componentName string, description string, filename string, contentType string, hook func() []byte) { |
| 152 | + d.hooksMtx.Lock() |
| 153 | + defer d.hooksMtx.Unlock() |
| 154 | + if _, ok := d.componentHooks[componentName]; ok { |
| 155 | + d.componentHooks[componentName] = append(d.componentHooks[componentName], &diagHook{ |
| 156 | + description: description, |
| 157 | + filename: filename, |
| 158 | + contentType: contentType, |
| 159 | + hook: hook, |
| 160 | + }) |
| 161 | + } else { |
| 162 | + d.componentHooks[componentName] = []*diagHook{ |
| 163 | + { |
| 164 | + description: description, |
| 165 | + filename: filename, |
| 166 | + contentType: contentType, |
| 167 | + hook: hook, |
| 168 | + }, |
| 169 | + } |
| 170 | + } |
| 171 | +} |
| 172 | + |
| 173 | +func (d *diagnosticsExtension) ServeHTTP(w http.ResponseWriter, req *http.Request) { |
| 174 | + d.hooksMtx.Lock() |
| 175 | + defer d.hooksMtx.Unlock() |
| 176 | + componentResults := make([]*proto.ActionDiagnosticUnitResult, 0) |
| 177 | + for name, hooks := range d.componentHooks { |
| 178 | + for _, hook := range hooks { |
| 179 | + componentResults = append(componentResults, &proto.ActionDiagnosticUnitResult{ |
| 180 | + Name: name, |
| 181 | + Filename: hook.filename, |
| 182 | + ContentType: hook.contentType, |
| 183 | + Description: hook.description, |
| 184 | + Content: hook.hook(), |
| 185 | + Generated: timestamppb.Now(), |
| 186 | + }) |
| 187 | + } |
| 188 | + } |
| 189 | + |
| 190 | + globalResults := make([]*proto.ActionDiagnosticUnitResult, 0) |
| 191 | + for name, hook := range d.globalHooks { |
| 192 | + globalResults = append(globalResults, &proto.ActionDiagnosticUnitResult{ |
| 193 | + Name: name, |
| 194 | + Filename: hook.filename, |
| 195 | + ContentType: hook.contentType, |
| 196 | + Description: hook.description, |
| 197 | + Content: hook.hook(), |
| 198 | + Generated: timestamppb.Now(), |
| 199 | + }) |
| 200 | + } |
| 201 | + |
| 202 | + // only add a CPU profile if requested via query parameter. |
| 203 | + if req.URL.Query().Get("cpu") == "true" { |
| 204 | + diagCPUDuration := diagnostics.DiagCPUDuration |
| 205 | + |
| 206 | + // check if cpuduration parameter is set, if so override the default duration |
| 207 | + // if parsing fails, log the error and use the default duration |
| 208 | + if req.URL.Query().Get("cpuduration") != "" { |
| 209 | + var err error |
| 210 | + diagCPUDuration, err = time.ParseDuration(req.URL.Query().Get("cpuduration")) |
| 211 | + if err != nil { |
| 212 | + d.logger.Error("Failed parsing cpuduration parameter, using default", zap.String("cpuduration", req.URL.Query().Get("cpuduration")), zap.Error(err)) |
| 213 | + diagCPUDuration = diagnostics.DiagCPUDuration |
| 214 | + } |
| 215 | + } |
| 216 | + cpuProfile, err := diagnostics.CreateCPUProfile(req.Context(), diagCPUDuration) |
| 217 | + if err != nil { |
| 218 | + d.logger.Error("Failed creating CPU profile", zap.Error(err)) |
| 219 | + } |
| 220 | + globalResults = append(globalResults, &proto.ActionDiagnosticUnitResult{ |
| 221 | + Name: "cpu", |
| 222 | + Filename: "edot/cpu.profile.gz", |
| 223 | + ContentType: "application/octet-stream", |
| 224 | + Description: "CPU profile of the collector", |
| 225 | + Content: cpuProfile, |
| 226 | + }) |
| 227 | + } |
| 228 | + |
| 229 | + b, err := json.Marshal(Response{ |
| 230 | + GlobalDiagnostics: globalResults, |
| 231 | + ComponentDiagnostics: componentResults, |
| 232 | + }) |
| 233 | + w.Header().Add("content-type", "application/json") |
| 234 | + if err != nil { |
| 235 | + d.logger.Error("Failed marshaling response", zap.Error(err)) |
| 236 | + w.WriteHeader(500) |
| 237 | + if _, err := fmt.Fprintf(w, "{'error':'%v'}", err); err != nil { |
| 238 | + d.logger.Error("Failed writing response to client.", zap.Error(err)) |
| 239 | + } |
| 240 | + return |
| 241 | + } |
| 242 | + if _, err := w.Write(b); err != nil { |
| 243 | + d.logger.Error("Failed writing response to client.", zap.Error(err)) |
| 244 | + } |
| 245 | +} |
0 commit comments