@@ -6,6 +6,7 @@ package cmd
66
77import (
88 "context"
9+ goerrors "errors"
910 "fmt"
1011 "net/url"
1112 "os"
@@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
8788 testingMode , _ := cmd .Flags ().GetBool ("testing-mode" )
8889 if err := run (nil , testingMode , fleetInitTimeout ); err != nil && ! errors .Is (err , context .Canceled ) {
8990 fmt .Fprintf (streams .Err , "Error: %v\n %s\n " , err , troubleshootMessage ())
90- logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
9191 return err
9292 }
9393 return nil
@@ -140,51 +140,74 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
140140 defer cancel ()
141141 go service .ProcessWindowsControlEvents (stopBeat )
142142
143- upgradeDetailsFromMarker , err := handleUpgrade ()
144- if err != nil {
145- return fmt .Errorf ("error checking for and handling upgrade: %w" , err )
146- }
147-
148- locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
149- if err := locker .TryLock (); err != nil {
150- return err
151- }
152- defer func () {
153- _ = locker .Unlock ()
154- }()
155-
156- return runElasticAgent (ctx , cancel , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
143+ return runElasticAgentCritical (ctx , cancel , override , stop , testingMode , fleetInitTimeout , modifiers ... )
157144}
158145
159146func logReturn (l * logger.Logger , err error ) error {
160147 if err != nil && ! errors .Is (err , context .Canceled ) {
161148 l .Errorf ("%s" , err )
149+ logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
162150 }
163151 return err
164152}
165153
166- func runElasticAgent (
154+ // runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
155+ // errors to ensure that any issues are captured in the logs.
156+ func runElasticAgentCritical (
167157 ctx context.Context ,
168158 cancel context.CancelFunc ,
169159 override application.CfgOverrider ,
170160 stop chan bool ,
171161 testingMode bool ,
172162 fleetInitTimeout time.Duration ,
173- upgradeDetailsFromMarker * details.Details ,
174163 modifiers ... component.PlatformModifier ,
175164) error {
176- cfg , err := loadConfig (ctx , override )
165+ var errs []error
166+
167+ // early handleUpgrade, but don't error yet
168+ upgradeDetailsFromMarker , err := handleUpgrade ()
177169 if err != nil {
178- return err
170+ errs = append ( errs , fmt . Errorf ( "failed to handle upgrade: %w" , err ))
179171 }
180172
181- logLvl := logger .DefaultLogLevel
182- if cfg .Settings .LoggingConfig != nil {
183- logLvl = cfg .Settings .LoggingConfig .Level
173+ // single run, but don't error yet
174+ locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
175+ lockErr := locker .TryLock ()
176+ if lockErr != nil {
177+ errs = append (errs , fmt .Errorf ("failed to get app lock: %w" , err ))
178+ }
179+ defer func () {
180+ _ = locker .Unlock ()
181+ }()
182+
183+ // try load config, but don't error yet
184+ cfg , err := loadConfig (ctx , override )
185+ if err != nil {
186+ // failed to load configuration, just load the default to create the logger
187+ errs = append (errs , fmt .Errorf ("failed to load configuration: %w" , err ))
188+ cfg = configuration .DefaultConfiguration ()
184189 }
190+
185191 baseLogger , err := logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
186192 if err != nil {
187- return err
193+ errs = append (errs , fmt .Errorf ("failed to create logger: %w" , err ))
194+
195+ // failed to create the baseLogger, this comes from the configuration being possibly invalid
196+ // switch to a default config and try again
197+ cfg = configuration .DefaultConfiguration ()
198+ baseLogger , err = logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
199+ if err != nil {
200+ errs = append (errs , fmt .Errorf ("failed to create logger with default configuration: %w" , err ))
201+
202+ // this really should not happen, but this whole critical function is very defensive
203+ baseLogger , err = logger .New ("" , true )
204+ if err != nil {
205+ errs = append (errs , fmt .Errorf ("failed to create logger with no configuration: %w" , err ))
206+
207+ // again? no way, but you never know
208+ baseLogger = logger .NewWithoutConfig ("" )
209+ }
210+ }
188211 }
189212
190213 // Make sure to flush any buffered logs before we're done.
@@ -194,10 +217,39 @@ func runElasticAgent(
194217 "source" : agentName ,
195218 })
196219
220+ // at this point the logger is working, so any errors that we hit can now be logged and returned
221+ if len (errs ) > 0 {
222+ return logReturn (l , goerrors .Join (errs ... ))
223+ }
224+
225+ // actually run the agent now
226+ err = runElasticAgent (ctx , cancel , baseLogger , l , cfg , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
227+ return logReturn (l , err )
228+ }
229+
230+ // runElasticAgent runs the actual Elastic Agent.
231+ func runElasticAgent (
232+ ctx context.Context ,
233+ cancel context.CancelFunc ,
234+ baseLogger * logger.Logger ,
235+ l * logger.Logger ,
236+ cfg * configuration.Configuration ,
237+ override application.CfgOverrider ,
238+ stop chan bool ,
239+ testingMode bool ,
240+ fleetInitTimeout time.Duration ,
241+ upgradeDetailsFromMarker * details.Details ,
242+ modifiers ... component.PlatformModifier ,
243+ ) error {
244+ logLvl := logger .DefaultLogLevel
245+ if cfg .Settings .LoggingConfig != nil {
246+ logLvl = cfg .Settings .LoggingConfig .Level
247+ }
248+
197249 // try early to check if running as root
198250 isRoot , err := utils .HasRoot ()
199251 if err != nil {
200- return logReturn ( l , fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err ) )
252+ return fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err )
201253 }
202254
203255 l .Infow ("Elastic Agent started" ,
@@ -207,7 +259,7 @@ func runElasticAgent(
207259
208260 cfg , err = tryDelayEnroll (ctx , l , cfg , override )
209261 if err != nil {
210- return logReturn ( l , errors .New (err , "failed to perform delayed enrollment" ) )
262+ return errors .New (err , "failed to perform delayed enrollment" )
211263 }
212264 pathConfigFile := paths .AgentConfigFile ()
213265
@@ -223,31 +275,31 @@ func runElasticAgent(
223275 // that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
224276 err = secret .CreateAgentSecret (ctx , vault .WithUnprivileged (! isRoot ))
225277 if err != nil {
226- return logReturn ( l , fmt .Errorf ("failed to read/write secrets: %w" , err ) )
278+ return fmt .Errorf ("failed to read/write secrets: %w" , err )
227279 }
228280
229281 // Migrate .yml files if the corresponding .enc does not exist
230282
231283 // the encrypted config does not exist but the unencrypted file does
232284 err = migration .MigrateToEncryptedConfig (ctx , l , paths .AgentConfigYmlFile (), paths .AgentConfigFile ())
233285 if err != nil {
234- return logReturn ( l , errors .New (err , "error migrating fleet config" ) )
286+ return errors .New (err , "error migrating fleet config" )
235287 }
236288
237289 // the encrypted state does not exist but the unencrypted file does
238290 err = migration .MigrateToEncryptedConfig (ctx , l ,
239291 paths .AgentStateStoreYmlFile (),
240292 paths .AgentStateStoreFile ())
241293 if err != nil {
242- return logReturn ( l , errors .New (err , "error migrating agent state" ) )
294+ return errors .New (err , "error migrating agent state" )
243295 }
244296
245297 agentInfo , err := info .NewAgentInfoWithLog (ctx , defaultLogLevel (cfg , logLvl .String ()), createAgentID )
246298 if err != nil {
247- return logReturn ( l , errors .New (err ,
299+ return errors .New (err ,
248300 "could not load agent info" ,
249301 errors .TypeFilesystem ,
250- errors .M (errors .MetaKeyPath , pathConfigFile )))
302+ errors .M (errors .MetaKeyPath , pathConfigFile ))
251303 }
252304
253305 // Ensure that the log level now matches what is configured in the agentInfo.
@@ -273,14 +325,14 @@ func runElasticAgent(
273325
274326 execPath , err := reexecPath ()
275327 if err != nil {
276- return logReturn ( l , fmt .Errorf ("failed to get reexec path: %w" , err ) )
328+ return fmt .Errorf ("failed to get reexec path: %w" , err )
277329 }
278330 rexLogger := l .Named ("reexec" )
279331 rex := reexec .NewManager (rexLogger , execPath )
280332
281333 tracer , err := initTracer (agentName , release .Version (), cfg .Settings .MonitoringConfig )
282334 if err != nil {
283- return logReturn ( l , fmt .Errorf ("could not initiate APM tracer: %w" , err ) )
335+ return fmt .Errorf ("could not initiate APM tracer: %w" , err )
284336 }
285337 if tracer != nil {
286338 l .Info ("APM instrumentation enabled" )
@@ -296,12 +348,12 @@ func runElasticAgent(
296348 coord , configMgr , _ , err := application .New (ctx , l , baseLogger , logLvl , agentInfo , rex , tracer , testingMode ,
297349 fleetInitTimeout , isBootstrap , override , upgradeDetailsFromMarker , modifiers ... )
298350 if err != nil {
299- return logReturn ( l , err )
351+ return err
300352 }
301353
302354 monitoringServer , err := setupMetrics (l , cfg .Settings .DownloadConfig .OS (), cfg .Settings .MonitoringConfig , tracer , coord )
303355 if err != nil {
304- return logReturn ( l , err )
356+ return err
305357 }
306358 coord .RegisterMonitoringServer (monitoringServer )
307359 defer func () {
@@ -325,7 +377,7 @@ func runElasticAgent(
325377
326378 // start the control listener
327379 if err := control .Start (); err != nil {
328- return logReturn ( l , err )
380+ return err
329381 }
330382 defer control .Stop ()
331383
@@ -408,7 +460,7 @@ LOOP:
408460 if isRex {
409461 rex .ShutdownComplete ()
410462 }
411- return logReturn ( l , err )
463+ return err
412464}
413465
414466func loadConfig (ctx context.Context , override application.CfgOverrider ) (* configuration.Configuration , error ) {
0 commit comments