
Commit ca9e141

blakerouse authored and mergify[bot] committed
Improve logging to catch early errors on startup (#10158)
* Update run command to have a critical section to ensure errors are always logged.
* Add changelog.
* Cleanup from code review.

(cherry picked from commit 18beeba)
1 parent 5511fe6 commit ca9e141
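The change described in the commit message boils down to one pattern: collect early startup errors instead of returning them immediately, make sure some logger exists (falling back to progressively simpler configurations if needed), and only then join and log everything. The following is a minimal, self-contained sketch of that pattern, assuming nothing about the real agent APIs; handleUpgradeStub and buildLogger are hypothetical stand-ins, not elastic-agent functions.

package main

import (
	"errors"
	"fmt"
	"log"
	"os"
)

// handleUpgradeStub simulates an early startup step that can fail before
// any logger exists (hypothetical, for illustration only).
func handleUpgradeStub() error {
	return errors.New("upgrade marker unreadable")
}

// buildLogger simulates logger construction from configuration; strict=true
// stands in for an invalid logging configuration (hypothetical).
func buildLogger(strict bool) (*log.Logger, error) {
	if strict {
		return nil, errors.New("invalid logging configuration")
	}
	return log.New(os.Stderr, "agent: ", log.LstdFlags), nil
}

func runCritical() error {
	var errs []error

	// Early steps no longer abort immediately; their errors are queued.
	if err := handleUpgradeStub(); err != nil {
		errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
	}

	// Try the configured logger first, then fall back to a default one,
	// so there is always something to log with.
	l, err := buildLogger(true)
	if err != nil {
		errs = append(errs, fmt.Errorf("failed to create logger: %w", err))
		l, _ = buildLogger(false) // defensive fallback
	}

	// Only now, with a working logger, report everything that went wrong.
	if len(errs) > 0 {
		joined := errors.Join(errs...)
		l.Printf("run failed: %v", joined)
		return joined
	}

	l.Println("agent started")
	return nil
}

func main() {
	if err := runCritical(); err != nil {
		os.Exit(1)
	}
}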

File tree

2 files changed: +125 -38 lines changed

Changelog fragment (file path not shown in this view)

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Improve logging to catch early errors on startup
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: elastic-agent
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/10158
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/9099

internal/pkg/agent/cmd/run.go

Lines changed: 93 additions & 38 deletions
@@ -6,6 +6,7 @@ package cmd
 
 import (
     "context"
+    goerrors "errors"
     "fmt"
     "net/url"
     "os"
@@ -88,7 +89,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
             testingMode, _ := cmd.Flags().GetBool("testing-mode")
             if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
                 fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
-                logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
                 return err
             }
             return nil
@@ -141,56 +141,82 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
     defer cancel()
     go service.ProcessWindowsControlEvents(stopBeat)
 
-    upgradeDetailsFromMarker, err := handleUpgrade()
-    if err != nil {
-        return fmt.Errorf("error checking for and handling upgrade: %w", err)
-    }
-
-    locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
-    if err := locker.TryLock(); err != nil {
-        return err
-    }
-    defer func() {
-        _ = locker.Unlock()
-    }()
-
-    return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
+    return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
 }
 
 func logReturn(l *logger.Logger, err error) error {
     if err != nil && !errors.Is(err, context.Canceled) {
         l.Errorf("%s", err)
+        logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
     }
     return err
 }
 
-func runElasticAgent(
+// runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
+// errors to ensure that any issues are captured in the logs.
+func runElasticAgentCritical(
     ctx context.Context,
     cancel context.CancelFunc,
     override application.CfgOverrider,
     stop chan bool,
     testingMode bool,
     fleetInitTimeout time.Duration,
-    upgradeDetailsFromMarker *details.Details,
     modifiers ...component.PlatformModifier,
 ) error {
-    err := coordinator.RestoreConfig()
+    var errs []error
+
+    // early handleUpgrade, but don't error yet
+    upgradeDetailsFromMarker, err := handleUpgrade()
     if err != nil {
-        return err
+        errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
     }
 
+    // single run, but don't error yet
+    locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
+    lockErr := locker.TryLock()
+    if lockErr != nil {
+        errs = append(errs, fmt.Errorf("failed to get app lock: %w", err))
+    }
+    defer func() {
+        _ = locker.Unlock()
+    }()
+
+    // try restore (if app locker didn't fail), but don't error yet
+    if lockErr == nil {
+        err = coordinator.RestoreConfig()
+        if err != nil {
+            errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err))
+        }
+    }
+
+    // try load config, but don't error yet
     cfg, err := loadConfig(ctx, override)
     if err != nil {
-        return err
+        // failed to load configuration, just load the default to create the logger
+        errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
+        cfg = configuration.DefaultConfiguration()
     }
 
-    logLvl := logger.DefaultLogLevel
-    if cfg.Settings.LoggingConfig != nil {
-        logLvl = cfg.Settings.LoggingConfig.Level
-    }
     baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
     if err != nil {
-        return err
+        errs = append(errs, fmt.Errorf("failed to create logger: %w", err))
+
+        // failed to create the baseLogger, this comes from the configuration being possibly invalid
+        // switch to a default config and try again
+        cfg = configuration.DefaultConfiguration()
+        baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
+        if err != nil {
+            errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))
+
+            // this really should not happen, but this whole critical function is very defensive
+            baseLogger, err = logger.New("", true)
+            if err != nil {
+                errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))
+
+                // again? no way, but you never know
+                baseLogger = logger.NewWithoutConfig("")
+            }
+        }
     }
 
     // Make sure to flush any buffered logs before we're done.
@@ -200,10 +226,39 @@ func runElasticAgent(
         "source": agentName,
     })
 
+    // at this point the logger is working, so any errors that we hit can now be logged and returned
+    if len(errs) > 0 {
+        return logReturn(l, goerrors.Join(errs...))
+    }
+
+    // actually run the agent now
+    err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
+    return logReturn(l, err)
+}
+
+// runElasticAgent runs the actual Elastic Agent.
+func runElasticAgent(
+    ctx context.Context,
+    cancel context.CancelFunc,
+    baseLogger *logger.Logger,
+    l *logger.Logger,
+    cfg *configuration.Configuration,
+    override application.CfgOverrider,
+    stop chan bool,
+    testingMode bool,
+    fleetInitTimeout time.Duration,
+    upgradeDetailsFromMarker *details.Details,
+    modifiers ...component.PlatformModifier,
+) error {
+    logLvl := logger.DefaultLogLevel
+    if cfg.Settings.LoggingConfig != nil {
+        logLvl = cfg.Settings.LoggingConfig.Level
+    }
+
     // try early to check if running as root
     isRoot, err := utils.HasRoot()
     if err != nil {
-        return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
+        return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
     }
 
     l.Infow("Elastic Agent started",
@@ -213,7 +268,7 @@
 
     cfg, err = tryDelayEnroll(ctx, l, cfg, override)
     if err != nil {
-        return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
+        return errors.New(err, "failed to perform delayed enrollment")
    }
 
     // agent ID needs to stay empty in bootstrap mode
@@ -225,31 +280,31 @@
     // that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
     err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
     if err != nil {
-        return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
+        return fmt.Errorf("failed to read/write secrets: %w", err)
     }
 
     // Migrate .yml files if the corresponding .enc does not exist
 
     // the encrypted config does not exist but the unencrypted file does
     err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
     if err != nil {
-        return logReturn(l, errors.New(err, "error migrating fleet config"))
+        return errors.New(err, "error migrating fleet config")
     }
 
     // the encrypted state does not exist but the unencrypted file does
     err = migration.MigrateToEncryptedConfig(ctx, l,
         paths.AgentStateStoreYmlFile(),
         paths.AgentStateStoreFile())
     if err != nil {
-        return logReturn(l, errors.New(err, "error migrating agent state"))
+        return errors.New(err, "error migrating agent state")
     }
 
     agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
     if err != nil {
-        return logReturn(l, errors.New(err,
+        return errors.New(err,
             "could not load agent info",
             errors.TypeFilesystem,
-            errors.M(errors.MetaKeyPath, paths.AgentConfigFile())))
+            errors.M(errors.MetaKeyPath, paths.AgentConfigFile()))
     }
 
     // Ensure that the log level now matches what is configured in the agentInfo.
@@ -275,14 +330,14 @@
 
     execPath, err := reexecPath()
     if err != nil {
-        return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
+        return fmt.Errorf("failed to get reexec path: %w", err)
     }
     rexLogger := l.Named("reexec")
     rex := reexec.NewManager(rexLogger, execPath)
 
     tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
     if err != nil {
-        return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
+        return fmt.Errorf("could not initiate APM tracer: %w", err)
     }
     if tracer != nil {
         l.Info("APM instrumentation enabled")
@@ -298,12 +353,12 @@
     coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
         fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
     if err != nil {
-        return logReturn(l, err)
+        return err
     }
 
     monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
     if err != nil {
-        return logReturn(l, err)
+        return err
     }
     coord.RegisterMonitoringServer(monitoringServer)
     defer func() {
@@ -327,7 +382,7 @@
 
     // start the control listener
     if err := control.Start(); err != nil {
-        return logReturn(l, err)
+        return err
     }
     defer control.Stop()
 
@@ -410,7 +465,7 @@ LOOP:
     if isRex {
         rex.ShutdownComplete()
     }
-    return logReturn(l, err)
+    return err
 }
 
 func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {
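One detail worth noting about the joined errors returned from the new critical path: errors.Join keeps each wrapped error reachable via errors.Is, which matters for the !errors.Is(err, context.Canceled) checks in logReturn and newRunCommandWithArgs, since a joined error that includes context.Canceled will be treated as a cancellation by those checks. A short standard-library-only sketch, independent of the agent code:

package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	startupErr := errors.New("failed to handle upgrade")

	// errors.Join wraps both errors; errors.Is can still match each of them.
	joined := errors.Join(startupErr, context.Canceled)

	fmt.Println(errors.Is(joined, context.Canceled)) // true
	fmt.Println(errors.Is(joined, startupErr))       // true
}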
