Skip to content

Commit e51f9b8

Browse files
[8.19] (backport #10158) Improve logging to catch early errors on startup (#10249)
* Improve logging to catch early errors on startup (#10158) * Update run command to have a critical section to ensure errors are always logged. * Add changelog. * Cleanup from code review. (cherry picked from commit 18beeba) # Conflicts: # internal/pkg/agent/cmd/run.go * Fix backport. --------- Co-authored-by: Blake Rouse <[email protected]>
1 parent 4e5bd49 commit e51f9b8

File tree

2 files changed

+120
-36
lines changed

2 files changed

+120
-36
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Improve logging to catch early errors on startup
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: elastic-agent
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/10158
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
issue: https://github.com/elastic/elastic-agent/issues/9099

internal/pkg/agent/cmd/run.go

Lines changed: 88 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package cmd
66

77
import (
88
"context"
9+
goerrors "errors"
910
"fmt"
1011
"net/url"
1112
"os"
@@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
8788
testingMode, _ := cmd.Flags().GetBool("testing-mode")
8889
if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
8990
fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
90-
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
9191
return err
9292
}
9393
return nil
@@ -140,51 +140,74 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
140140
defer cancel()
141141
go service.ProcessWindowsControlEvents(stopBeat)
142142

143-
upgradeDetailsFromMarker, err := handleUpgrade()
144-
if err != nil {
145-
return fmt.Errorf("error checking for and handling upgrade: %w", err)
146-
}
147-
148-
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
149-
if err := locker.TryLock(); err != nil {
150-
return err
151-
}
152-
defer func() {
153-
_ = locker.Unlock()
154-
}()
155-
156-
return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
143+
return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
157144
}
158145

159146
func logReturn(l *logger.Logger, err error) error {
160147
if err != nil && !errors.Is(err, context.Canceled) {
161148
l.Errorf("%s", err)
149+
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
162150
}
163151
return err
164152
}
165153

166-
func runElasticAgent(
154+
// runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
155+
// errors to ensure that any issues are captured in the logs.
156+
func runElasticAgentCritical(
167157
ctx context.Context,
168158
cancel context.CancelFunc,
169159
override application.CfgOverrider,
170160
stop chan bool,
171161
testingMode bool,
172162
fleetInitTimeout time.Duration,
173-
upgradeDetailsFromMarker *details.Details,
174163
modifiers ...component.PlatformModifier,
175164
) error {
176-
cfg, err := loadConfig(ctx, override)
165+
var errs []error
166+
167+
// early handleUpgrade, but don't error yet
168+
upgradeDetailsFromMarker, err := handleUpgrade()
177169
if err != nil {
178-
return err
170+
errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
179171
}
180172

181-
logLvl := logger.DefaultLogLevel
182-
if cfg.Settings.LoggingConfig != nil {
183-
logLvl = cfg.Settings.LoggingConfig.Level
173+
// single run, but don't error yet
174+
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
175+
lockErr := locker.TryLock()
176+
if lockErr != nil {
177+
errs = append(errs, fmt.Errorf("failed to get app lock: %w", err))
178+
}
179+
defer func() {
180+
_ = locker.Unlock()
181+
}()
182+
183+
// try load config, but don't error yet
184+
cfg, err := loadConfig(ctx, override)
185+
if err != nil {
186+
// failed to load configuration, just load the default to create the logger
187+
errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
188+
cfg = configuration.DefaultConfiguration()
184189
}
190+
185191
baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
186192
if err != nil {
187-
return err
193+
errs = append(errs, fmt.Errorf("failed to create logger: %w", err))
194+
195+
// failed to create the baseLogger, this comes from the configuration being possibly invalid
196+
// switch to a default config and try again
197+
cfg = configuration.DefaultConfiguration()
198+
baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
199+
if err != nil {
200+
errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))
201+
202+
// this really should not happen, but this whole critical function is very defensive
203+
baseLogger, err = logger.New("", true)
204+
if err != nil {
205+
errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))
206+
207+
// again? no way, but you never know
208+
baseLogger = logger.NewWithoutConfig("")
209+
}
210+
}
188211
}
189212

190213
// Make sure to flush any buffered logs before we're done.
@@ -194,10 +217,39 @@ func runElasticAgent(
194217
"source": agentName,
195218
})
196219

220+
// at this point the logger is working, so any errors that we hit can now be logged and returned
221+
if len(errs) > 0 {
222+
return logReturn(l, goerrors.Join(errs...))
223+
}
224+
225+
// actually run the agent now
226+
err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
227+
return logReturn(l, err)
228+
}
229+
230+
// runElasticAgent runs the actual Elastic Agent.
231+
func runElasticAgent(
232+
ctx context.Context,
233+
cancel context.CancelFunc,
234+
baseLogger *logger.Logger,
235+
l *logger.Logger,
236+
cfg *configuration.Configuration,
237+
override application.CfgOverrider,
238+
stop chan bool,
239+
testingMode bool,
240+
fleetInitTimeout time.Duration,
241+
upgradeDetailsFromMarker *details.Details,
242+
modifiers ...component.PlatformModifier,
243+
) error {
244+
logLvl := logger.DefaultLogLevel
245+
if cfg.Settings.LoggingConfig != nil {
246+
logLvl = cfg.Settings.LoggingConfig.Level
247+
}
248+
197249
// try early to check if running as root
198250
isRoot, err := utils.HasRoot()
199251
if err != nil {
200-
return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
252+
return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
201253
}
202254

203255
l.Infow("Elastic Agent started",
@@ -207,7 +259,7 @@ func runElasticAgent(
207259

208260
cfg, err = tryDelayEnroll(ctx, l, cfg, override)
209261
if err != nil {
210-
return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
262+
return errors.New(err, "failed to perform delayed enrollment")
211263
}
212264
pathConfigFile := paths.AgentConfigFile()
213265

@@ -223,31 +275,31 @@ func runElasticAgent(
223275
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
224276
err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
225277
if err != nil {
226-
return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
278+
return fmt.Errorf("failed to read/write secrets: %w", err)
227279
}
228280

229281
// Migrate .yml files if the corresponding .enc does not exist
230282

231283
// the encrypted config does not exist but the unencrypted file does
232284
err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
233285
if err != nil {
234-
return logReturn(l, errors.New(err, "error migrating fleet config"))
286+
return errors.New(err, "error migrating fleet config")
235287
}
236288

237289
// the encrypted state does not exist but the unencrypted file does
238290
err = migration.MigrateToEncryptedConfig(ctx, l,
239291
paths.AgentStateStoreYmlFile(),
240292
paths.AgentStateStoreFile())
241293
if err != nil {
242-
return logReturn(l, errors.New(err, "error migrating agent state"))
294+
return errors.New(err, "error migrating agent state")
243295
}
244296

245297
agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
246298
if err != nil {
247-
return logReturn(l, errors.New(err,
299+
return errors.New(err,
248300
"could not load agent info",
249301
errors.TypeFilesystem,
250-
errors.M(errors.MetaKeyPath, pathConfigFile)))
302+
errors.M(errors.MetaKeyPath, pathConfigFile))
251303
}
252304

253305
// Ensure that the log level now matches what is configured in the agentInfo.
@@ -273,14 +325,14 @@ func runElasticAgent(
273325

274326
execPath, err := reexecPath()
275327
if err != nil {
276-
return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
328+
return fmt.Errorf("failed to get reexec path: %w", err)
277329
}
278330
rexLogger := l.Named("reexec")
279331
rex := reexec.NewManager(rexLogger, execPath)
280332

281333
tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
282334
if err != nil {
283-
return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
335+
return fmt.Errorf("could not initiate APM tracer: %w", err)
284336
}
285337
if tracer != nil {
286338
l.Info("APM instrumentation enabled")
@@ -296,12 +348,12 @@ func runElasticAgent(
296348
coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
297349
fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
298350
if err != nil {
299-
return logReturn(l, err)
351+
return err
300352
}
301353

302354
monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
303355
if err != nil {
304-
return logReturn(l, err)
356+
return err
305357
}
306358
coord.RegisterMonitoringServer(monitoringServer)
307359
defer func() {
@@ -325,7 +377,7 @@ func runElasticAgent(
325377

326378
// start the control listener
327379
if err := control.Start(); err != nil {
328-
return logReturn(l, err)
380+
return err
329381
}
330382
defer control.Stop()
331383

@@ -408,7 +460,7 @@ LOOP:
408460
if isRex {
409461
rex.ShutdownComplete()
410462
}
411-
return logReturn(l, err)
463+
return err
412464
}
413465

414466
func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {

0 commit comments

Comments
 (0)