Skip to content

Commit 3802622

Browse files
blakerousemergify[bot]
authored andcommitted
Improve logging to catch early errors on startup (#10158)
* Update run command to have a critical section to ensure errors are always logged. * Add changelog. * Cleanup from code review. (cherry picked from commit 18beeba) # Conflicts: # internal/pkg/agent/cmd/run.go
1 parent 9c8ca42 commit 3802622

File tree

2 files changed

+134
-35
lines changed

2 files changed

+134
-35
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Improve logging to catch early errors on startup
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: elastic-agent
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/10158
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
issue: https://github.com/elastic/elastic-agent/issues/9099

internal/pkg/agent/cmd/run.go

Lines changed: 102 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package cmd
66

77
import (
88
"context"
9+
goerrors "errors"
910
"fmt"
1011
"net/url"
1112
"os"
@@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
8788
testingMode, _ := cmd.Flags().GetBool("testing-mode")
8889
if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
8990
fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
90-
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
9191
return err
9292
}
9393
return nil
@@ -140,51 +140,85 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
140140
defer cancel()
141141
go service.ProcessWindowsControlEvents(stopBeat)
142142

143-
upgradeDetailsFromMarker, err := handleUpgrade()
144-
if err != nil {
145-
return fmt.Errorf("error checking for and handling upgrade: %w", err)
146-
}
147-
148-
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
149-
if err := locker.TryLock(); err != nil {
150-
return err
151-
}
152-
defer func() {
153-
_ = locker.Unlock()
154-
}()
155-
156-
return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
143+
return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
157144
}
158145

159146
func logReturn(l *logger.Logger, err error) error {
160147
if err != nil && !errors.Is(err, context.Canceled) {
161148
l.Errorf("%s", err)
149+
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
162150
}
163151
return err
164152
}
165153

166-
func runElasticAgent(
154+
// runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
155+
// errors to ensure that any issues are captured in the logs.
156+
func runElasticAgentCritical(
167157
ctx context.Context,
168158
cancel context.CancelFunc,
169159
override application.CfgOverrider,
170160
stop chan bool,
171161
testingMode bool,
172162
fleetInitTimeout time.Duration,
173-
upgradeDetailsFromMarker *details.Details,
174163
modifiers ...component.PlatformModifier,
175164
) error {
176-
cfg, err := loadConfig(ctx, override)
165+
<<<<<<< HEAD
166+
=======
167+
var errs []error
168+
169+
// early handleUpgrade, but don't error yet
170+
upgradeDetailsFromMarker, err := handleUpgrade()
177171
if err != nil {
178-
return err
172+
errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
179173
}
180174

181-
logLvl := logger.DefaultLogLevel
182-
if cfg.Settings.LoggingConfig != nil {
183-
logLvl = cfg.Settings.LoggingConfig.Level
175+
// single run, but don't error yet
176+
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
177+
lockErr := locker.TryLock()
178+
if lockErr != nil {
179+
errs = append(errs, fmt.Errorf("failed to get app lock: %w", err))
184180
}
181+
defer func() {
182+
_ = locker.Unlock()
183+
}()
184+
185+
// try restore (if app locker didn't fail), but don't error yet
186+
if lockErr == nil {
187+
err = coordinator.RestoreConfig()
188+
if err != nil {
189+
errs = append(errs, fmt.Errorf("failed to restore configuration: %w", err))
190+
}
191+
}
192+
193+
// try load config, but don't error yet
194+
>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158))
195+
cfg, err := loadConfig(ctx, override)
196+
if err != nil {
197+
// failed to load configuration, just load the default to create the logger
198+
errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
199+
cfg = configuration.DefaultConfiguration()
200+
}
201+
185202
baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
186203
if err != nil {
187-
return err
204+
errs = append(errs, fmt.Errorf("failed to create logger: %w", err))
205+
206+
// failed to create the baseLogger, this comes from the configuration being possibly invalid
207+
// switch to a default config and try again
208+
cfg = configuration.DefaultConfiguration()
209+
baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
210+
if err != nil {
211+
errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))
212+
213+
// this really should not happen, but this whole critical function is very defensive
214+
baseLogger, err = logger.New("", true)
215+
if err != nil {
216+
errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))
217+
218+
// again? no way, but you never know
219+
baseLogger = logger.NewWithoutConfig("")
220+
}
221+
}
188222
}
189223

190224
// Make sure to flush any buffered logs before we're done.
@@ -194,10 +228,39 @@ func runElasticAgent(
194228
"source": agentName,
195229
})
196230

231+
// at this point the logger is working, so any errors that we hit can now be logged and returned
232+
if len(errs) > 0 {
233+
return logReturn(l, goerrors.Join(errs...))
234+
}
235+
236+
// actually run the agent now
237+
err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
238+
return logReturn(l, err)
239+
}
240+
241+
// runElasticAgent runs the actual Elastic Agent.
242+
func runElasticAgent(
243+
ctx context.Context,
244+
cancel context.CancelFunc,
245+
baseLogger *logger.Logger,
246+
l *logger.Logger,
247+
cfg *configuration.Configuration,
248+
override application.CfgOverrider,
249+
stop chan bool,
250+
testingMode bool,
251+
fleetInitTimeout time.Duration,
252+
upgradeDetailsFromMarker *details.Details,
253+
modifiers ...component.PlatformModifier,
254+
) error {
255+
logLvl := logger.DefaultLogLevel
256+
if cfg.Settings.LoggingConfig != nil {
257+
logLvl = cfg.Settings.LoggingConfig.Level
258+
}
259+
197260
// try early to check if running as root
198261
isRoot, err := utils.HasRoot()
199262
if err != nil {
200-
return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
263+
return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
201264
}
202265

203266
l.Infow("Elastic Agent started",
@@ -207,7 +270,7 @@ func runElasticAgent(
207270

208271
cfg, err = tryDelayEnroll(ctx, l, cfg, override)
209272
if err != nil {
210-
return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
273+
return errors.New(err, "failed to perform delayed enrollment")
211274
}
212275
pathConfigFile := paths.AgentConfigFile()
213276

@@ -223,31 +286,35 @@ func runElasticAgent(
223286
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
224287
err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
225288
if err != nil {
226-
return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
289+
return fmt.Errorf("failed to read/write secrets: %w", err)
227290
}
228291

229292
// Migrate .yml files if the corresponding .enc does not exist
230293

231294
// the encrypted config does not exist but the unencrypted file does
232295
err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
233296
if err != nil {
234-
return logReturn(l, errors.New(err, "error migrating fleet config"))
297+
return errors.New(err, "error migrating fleet config")
235298
}
236299

237300
// the encrypted state does not exist but the unencrypted file does
238301
err = migration.MigrateToEncryptedConfig(ctx, l,
239302
paths.AgentStateStoreYmlFile(),
240303
paths.AgentStateStoreFile())
241304
if err != nil {
242-
return logReturn(l, errors.New(err, "error migrating agent state"))
305+
return errors.New(err, "error migrating agent state")
243306
}
244307

245308
agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
246309
if err != nil {
247-
return logReturn(l, errors.New(err,
310+
return errors.New(err,
248311
"could not load agent info",
249312
errors.TypeFilesystem,
313+
<<<<<<< HEAD
250314
errors.M(errors.MetaKeyPath, pathConfigFile)))
315+
=======
316+
errors.M(errors.MetaKeyPath, paths.AgentConfigFile()))
317+
>>>>>>> 18beeba11 (Improve logging to catch early errors on startup (#10158))
251318
}
252319

253320
// Ensure that the log level now matches what is configured in the agentInfo.
@@ -273,14 +340,14 @@ func runElasticAgent(
273340

274341
execPath, err := reexecPath()
275342
if err != nil {
276-
return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
343+
return fmt.Errorf("failed to get reexec path: %w", err)
277344
}
278345
rexLogger := l.Named("reexec")
279346
rex := reexec.NewManager(rexLogger, execPath)
280347

281348
tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
282349
if err != nil {
283-
return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
350+
return fmt.Errorf("could not initiate APM tracer: %w", err)
284351
}
285352
if tracer != nil {
286353
l.Info("APM instrumentation enabled")
@@ -296,12 +363,12 @@ func runElasticAgent(
296363
coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
297364
fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
298365
if err != nil {
299-
return logReturn(l, err)
366+
return err
300367
}
301368

302369
monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
303370
if err != nil {
304-
return logReturn(l, err)
371+
return err
305372
}
306373
coord.RegisterMonitoringServer(monitoringServer)
307374
defer func() {
@@ -325,7 +392,7 @@ func runElasticAgent(
325392

326393
// start the control listener
327394
if err := control.Start(); err != nil {
328-
return logReturn(l, err)
395+
return err
329396
}
330397
defer control.Stop()
331398

@@ -408,7 +475,7 @@ LOOP:
408475
if isRex {
409476
rex.ShutdownComplete()
410477
}
411-
return logReturn(l, err)
478+
return err
412479
}
413480

414481
func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {

0 commit comments

Comments
 (0)