@@ -6,6 +6,7 @@ package cmd
6
6
7
7
import (
8
8
"context"
9
+ goerrors "errors"
9
10
"fmt"
10
11
"net/url"
11
12
"os"
@@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
87
88
testingMode , _ := cmd .Flags ().GetBool ("testing-mode" )
88
89
if err := run (nil , testingMode , fleetInitTimeout ); err != nil && ! errors .Is (err , context .Canceled ) {
89
90
fmt .Fprintf (streams .Err , "Error: %v\n %s\n " , err , troubleshootMessage ())
90
- logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
91
91
return err
92
92
}
93
93
return nil
@@ -140,51 +140,85 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
140
140
defer cancel ()
141
141
go service .ProcessWindowsControlEvents (stopBeat )
142
142
143
- upgradeDetailsFromMarker , err := handleUpgrade ()
144
- if err != nil {
145
- return fmt .Errorf ("error checking for and handling upgrade: %w" , err )
146
- }
147
-
148
- locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
149
- if err := locker .TryLock (); err != nil {
150
- return err
151
- }
152
- defer func () {
153
- _ = locker .Unlock ()
154
- }()
155
-
156
- return runElasticAgent (ctx , cancel , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
143
+ return runElasticAgentCritical (ctx , cancel , override , stop , testingMode , fleetInitTimeout , modifiers ... )
157
144
}
158
145
159
146
// logReturn reports err and hands it back unchanged so it can be used directly
// in a return statement.
//
// Nothing is reported when err is nil or when it matches context.Canceled
// (as decided by errors.Is). Otherwise the error is written to l at error
// level and also passed to logExternal, prefixed with the binary name.
func logReturn(l *logger.Logger, err error) error {
	// Nothing to report: either no failure or a cancellation.
	if err == nil || errors.Is(err, context.Canceled) {
		return err
	}

	l.Errorf("%s", err)
	logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
	return err
}
165
153
166
- func runElasticAgent (
154
+ // runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
155
+ // errors to ensure that any issues are captured in the logs.
156
+ func runElasticAgentCritical (
167
157
ctx context.Context ,
168
158
cancel context.CancelFunc ,
169
159
override application.CfgOverrider ,
170
160
stop chan bool ,
171
161
testingMode bool ,
172
162
fleetInitTimeout time.Duration ,
173
- upgradeDetailsFromMarker * details.Details ,
174
163
modifiers ... component.PlatformModifier ,
175
164
) error {
176
- cfg , err := loadConfig (ctx , override )
165
+ << << << < HEAD
166
+ == == == =
167
+ var errs []error
168
+
169
+ // early handleUpgrade, but don't error yet
170
+ upgradeDetailsFromMarker , err := handleUpgrade ()
177
171
if err != nil {
178
- return err
172
+ errs = append ( errs , fmt . Errorf ( "failed to handle upgrade: %w" , err ))
179
173
}
180
174
181
- logLvl := logger .DefaultLogLevel
182
- if cfg .Settings .LoggingConfig != nil {
183
- logLvl = cfg .Settings .LoggingConfig .Level
175
+ // single run, but don't error yet
176
+ locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
177
+ lockErr := locker .TryLock ()
178
+ if lockErr != nil {
179
+ errs = append (errs , fmt .Errorf ("failed to get app lock: %w" , err ))
184
180
}
181
+ defer func () {
182
+ _ = locker .Unlock ()
183
+ }()
184
+
185
+ // try restore (if app locker didn't fail), but don't error yet
186
+ if lockErr == nil {
187
+ err = coordinator .RestoreConfig ()
188
+ if err != nil {
189
+ errs = append (errs , fmt .Errorf ("failed to restore configuration: %w" , err ))
190
+ }
191
+ }
192
+
193
+ // try load config, but don't error yet
194
+ >> >> >> > 18 beeba11 (Improve logging to catch early errors on startup (#10158 ))
195
+ cfg , err := loadConfig (ctx , override )
196
+ if err != nil {
197
+ // failed to load configuration, just load the default to create the logger
198
+ errs = append (errs , fmt .Errorf ("failed to load configuration: %w" , err ))
199
+ cfg = configuration .DefaultConfiguration ()
200
+ }
201
+
185
202
baseLogger , err := logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
186
203
if err != nil {
187
- return err
204
+ errs = append (errs , fmt .Errorf ("failed to create logger: %w" , err ))
205
+
206
+ // failed to create the baseLogger, this comes from the configuration being possibly invalid
207
+ // switch to a default config and try again
208
+ cfg = configuration .DefaultConfiguration ()
209
+ baseLogger , err = logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
210
+ if err != nil {
211
+ errs = append (errs , fmt .Errorf ("failed to create logger with default configuration: %w" , err ))
212
+
213
+ // this really should not happen, but this whole critical function is very defensive
214
+ baseLogger , err = logger .New ("" , true )
215
+ if err != nil {
216
+ errs = append (errs , fmt .Errorf ("failed to create logger with no configuration: %w" , err ))
217
+
218
+ // again? no way, but you never know
219
+ baseLogger = logger .NewWithoutConfig ("" )
220
+ }
221
+ }
188
222
}
189
223
190
224
// Make sure to flush any buffered logs before we're done.
@@ -194,10 +228,39 @@ func runElasticAgent(
194
228
"source" : agentName ,
195
229
})
196
230
231
+ // at this point the logger is working, so any errors that we hit can now be logged and returned
232
+ if len (errs ) > 0 {
233
+ return logReturn (l , goerrors .Join (errs ... ))
234
+ }
235
+
236
+ // actually run the agent now
237
+ err = runElasticAgent (ctx , cancel , baseLogger , l , cfg , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
238
+ return logReturn (l , err )
239
+ }
240
+
241
+ // runElasticAgent runs the actual Elastic Agent.
242
+ func runElasticAgent (
243
+ ctx context.Context ,
244
+ cancel context.CancelFunc ,
245
+ baseLogger * logger.Logger ,
246
+ l * logger.Logger ,
247
+ cfg * configuration.Configuration ,
248
+ override application.CfgOverrider ,
249
+ stop chan bool ,
250
+ testingMode bool ,
251
+ fleetInitTimeout time.Duration ,
252
+ upgradeDetailsFromMarker * details.Details ,
253
+ modifiers ... component.PlatformModifier ,
254
+ ) error {
255
+ logLvl := logger .DefaultLogLevel
256
+ if cfg .Settings .LoggingConfig != nil {
257
+ logLvl = cfg .Settings .LoggingConfig .Level
258
+ }
259
+
197
260
// try early to check if running as root
198
261
isRoot , err := utils .HasRoot ()
199
262
if err != nil {
200
- return logReturn ( l , fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err ) )
263
+ return fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err )
201
264
}
202
265
203
266
l .Infow ("Elastic Agent started" ,
@@ -207,7 +270,7 @@ func runElasticAgent(
207
270
208
271
cfg , err = tryDelayEnroll (ctx , l , cfg , override )
209
272
if err != nil {
210
- return logReturn ( l , errors .New (err , "failed to perform delayed enrollment" ) )
273
+ return errors .New (err , "failed to perform delayed enrollment" )
211
274
}
212
275
pathConfigFile := paths .AgentConfigFile ()
213
276
@@ -223,31 +286,35 @@ func runElasticAgent(
223
286
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
224
287
err = secret .CreateAgentSecret (ctx , vault .WithUnprivileged (! isRoot ))
225
288
if err != nil {
226
- return logReturn ( l , fmt .Errorf ("failed to read/write secrets: %w" , err ) )
289
+ return fmt .Errorf ("failed to read/write secrets: %w" , err )
227
290
}
228
291
229
292
// Migrate .yml files if the corresponding .enc does not exist
230
293
231
294
// the encrypted config does not exist but the unencrypted file does
232
295
err = migration .MigrateToEncryptedConfig (ctx , l , paths .AgentConfigYmlFile (), paths .AgentConfigFile ())
233
296
if err != nil {
234
- return logReturn ( l , errors .New (err , "error migrating fleet config" ) )
297
+ return errors .New (err , "error migrating fleet config" )
235
298
}
236
299
237
300
// the encrypted state does not exist but the unencrypted file does
238
301
err = migration .MigrateToEncryptedConfig (ctx , l ,
239
302
paths .AgentStateStoreYmlFile (),
240
303
paths .AgentStateStoreFile ())
241
304
if err != nil {
242
- return logReturn ( l , errors .New (err , "error migrating agent state" ) )
305
+ return errors .New (err , "error migrating agent state" )
243
306
}
244
307
245
308
agentInfo , err := info .NewAgentInfoWithLog (ctx , defaultLogLevel (cfg , logLvl .String ()), createAgentID )
246
309
if err != nil {
247
- return logReturn ( l , errors .New (err ,
310
+ return errors .New (err ,
248
311
"could not load agent info" ,
249
312
errors .TypeFilesystem ,
250
314
errors .M (errors .MetaKeyPath , pathConfigFile ))
251
318
}
252
319
253
320
// Ensure that the log level now matches what is configured in the agentInfo.
@@ -273,14 +340,14 @@ func runElasticAgent(
273
340
274
341
execPath , err := reexecPath ()
275
342
if err != nil {
276
- return logReturn ( l , fmt .Errorf ("failed to get reexec path: %w" , err ) )
343
+ return fmt .Errorf ("failed to get reexec path: %w" , err )
277
344
}
278
345
rexLogger := l .Named ("reexec" )
279
346
rex := reexec .NewManager (rexLogger , execPath )
280
347
281
348
tracer , err := initTracer (agentName , release .Version (), cfg .Settings .MonitoringConfig )
282
349
if err != nil {
283
- return logReturn ( l , fmt .Errorf ("could not initiate APM tracer: %w" , err ) )
350
+ return fmt .Errorf ("could not initiate APM tracer: %w" , err )
284
351
}
285
352
if tracer != nil {
286
353
l .Info ("APM instrumentation enabled" )
@@ -296,12 +363,12 @@ func runElasticAgent(
296
363
coord , configMgr , _ , err := application .New (ctx , l , baseLogger , logLvl , agentInfo , rex , tracer , testingMode ,
297
364
fleetInitTimeout , isBootstrap , override , upgradeDetailsFromMarker , modifiers ... )
298
365
if err != nil {
299
- return logReturn ( l , err )
366
+ return err
300
367
}
301
368
302
369
monitoringServer , err := setupMetrics (l , cfg .Settings .DownloadConfig .OS (), cfg .Settings .MonitoringConfig , tracer , coord )
303
370
if err != nil {
304
- return logReturn ( l , err )
371
+ return err
305
372
}
306
373
coord .RegisterMonitoringServer (monitoringServer )
307
374
defer func () {
@@ -325,7 +392,7 @@ func runElasticAgent(
325
392
326
393
// start the control listener
327
394
if err := control .Start (); err != nil {
328
- return logReturn ( l , err )
395
+ return err
329
396
}
330
397
defer control .Stop ()
331
398
@@ -408,7 +475,7 @@ LOOP:
408
475
if isRex {
409
476
rex .ShutdownComplete ()
410
477
}
411
- return logReturn ( l , err )
478
+ return err
412
479
}
413
480
414
481
func loadConfig (ctx context.Context , override application.CfgOverrider ) (* configuration.Configuration , error ) {
0 commit comments