@@ -6,6 +6,7 @@ package cmd
6
6
7
7
import (
8
8
"context"
9
+ goerrors "errors"
9
10
"fmt"
10
11
"net/url"
11
12
"os"
@@ -88,7 +89,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
88
89
testingMode , _ := cmd .Flags ().GetBool ("testing-mode" )
89
90
if err := run (nil , testingMode , fleetInitTimeout ); err != nil && ! errors .Is (err , context .Canceled ) {
90
91
fmt .Fprintf (streams .Err , "Error: %v\n %s\n " , err , troubleshootMessage ())
91
- logExternal (fmt .Sprintf ("%s run failed: %s" , paths .BinaryName , err ))
92
92
return err
93
93
}
94
94
return nil
@@ -141,56 +141,82 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
141
141
defer cancel ()
142
142
go service .ProcessWindowsControlEvents (stopBeat )
143
143
144
- upgradeDetailsFromMarker , err := handleUpgrade ()
145
- if err != nil {
146
- return fmt .Errorf ("error checking for and handling upgrade: %w" , err )
147
- }
148
-
149
- locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
150
- if err := locker .TryLock (); err != nil {
151
- return err
152
- }
153
- defer func () {
154
- _ = locker .Unlock ()
155
- }()
156
-
157
- return runElasticAgent (ctx , cancel , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
144
+ return runElasticAgentCritical (ctx , cancel , override , stop , testingMode , fleetInitTimeout , modifiers ... )
158
145
}
159
146
160
147
// logReturn reports err and hands it back unchanged so it can be used directly
// in a return statement. A nil error or a context cancellation is passed
// through silently; any other error is written to l and also forwarded to
// logExternal, prefixed with the binary name.
func logReturn(l *logger.Logger, err error) error {
	// Nothing to report for success or a plain cancellation.
	if err == nil || errors.Is(err, context.Canceled) {
		return err
	}

	l.Errorf("%s", err)
	logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
	return err
}
166
154
167
- func runElasticAgent (
155
+ // runElasticAgentCritical provides a critical path to running runElasticAgent, it exhausts all efforts to log any
156
+ // errors to ensure that any issues are captured in the logs.
157
+ func runElasticAgentCritical (
168
158
ctx context.Context ,
169
159
cancel context.CancelFunc ,
170
160
override application.CfgOverrider ,
171
161
stop chan bool ,
172
162
testingMode bool ,
173
163
fleetInitTimeout time.Duration ,
174
- upgradeDetailsFromMarker * details.Details ,
175
164
modifiers ... component.PlatformModifier ,
176
165
) error {
177
- err := coordinator .RestoreConfig ()
166
+ var errs []error
167
+
168
+ // early handleUpgrade, but don't error yet
169
+ upgradeDetailsFromMarker , err := handleUpgrade ()
178
170
if err != nil {
179
- return err
171
+ errs = append ( errs , fmt . Errorf ( "failed to handle upgrade: %w" , err ))
180
172
}
181
173
174
+ // single run, but don't error yet
175
+ locker := filelock .NewAppLocker (paths .Data (), paths .AgentLockFileName )
176
+ lockErr := locker .TryLock ()
177
+ if lockErr != nil {
178
+ errs = append (errs , fmt .Errorf ("failed to get app lock: %w" , err ))
179
+ }
180
+ defer func () {
181
+ _ = locker .Unlock ()
182
+ }()
183
+
184
+ // try restore (if app locker didn't fail), but don't error yet
185
+ if lockErr == nil {
186
+ err = coordinator .RestoreConfig ()
187
+ if err != nil {
188
+ errs = append (errs , fmt .Errorf ("failed to restore configuration: %w" , err ))
189
+ }
190
+ }
191
+
192
+ // try load config, but don't error yet
182
193
cfg , err := loadConfig (ctx , override )
183
194
if err != nil {
184
- return err
195
+ // failed to load configuration, just load the default to create the logger
196
+ errs = append (errs , fmt .Errorf ("failed to load configuration: %w" , err ))
197
+ cfg = configuration .DefaultConfiguration ()
185
198
}
186
199
187
- logLvl := logger .DefaultLogLevel
188
- if cfg .Settings .LoggingConfig != nil {
189
- logLvl = cfg .Settings .LoggingConfig .Level
190
- }
191
200
baseLogger , err := logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
192
201
if err != nil {
193
- return err
202
+ errs = append (errs , fmt .Errorf ("failed to create logger: %w" , err ))
203
+
204
+ // failed to create the baseLogger, this comes from the configuration being possibly invalid
205
+ // switch to a default config and try again
206
+ cfg = configuration .DefaultConfiguration ()
207
+ baseLogger , err = logger .NewFromConfig ("" , cfg .Settings .LoggingConfig , cfg .Settings .EventLoggingConfig , true )
208
+ if err != nil {
209
+ errs = append (errs , fmt .Errorf ("failed to create logger with default configuration: %w" , err ))
210
+
211
+ // this really should not happen, but this whole critical function is very defensive
212
+ baseLogger , err = logger .New ("" , true )
213
+ if err != nil {
214
+ errs = append (errs , fmt .Errorf ("failed to create logger with no configuration: %w" , err ))
215
+
216
+ // again? no way, but you never know
217
+ baseLogger = logger .NewWithoutConfig ("" )
218
+ }
219
+ }
194
220
}
195
221
196
222
// Make sure to flush any buffered logs before we're done.
@@ -200,10 +226,39 @@ func runElasticAgent(
200
226
"source" : agentName ,
201
227
})
202
228
229
+ // at this point the logger is working, so any errors that we hit can now be logged and returned
230
+ if len (errs ) > 0 {
231
+ return logReturn (l , goerrors .Join (errs ... ))
232
+ }
233
+
234
+ // actually run the agent now
235
+ err = runElasticAgent (ctx , cancel , baseLogger , l , cfg , override , stop , testingMode , fleetInitTimeout , upgradeDetailsFromMarker , modifiers ... )
236
+ return logReturn (l , err )
237
+ }
238
+
239
+ // runElasticAgent runs the actual Elastic Agent.
240
+ func runElasticAgent (
241
+ ctx context.Context ,
242
+ cancel context.CancelFunc ,
243
+ baseLogger * logger.Logger ,
244
+ l * logger.Logger ,
245
+ cfg * configuration.Configuration ,
246
+ override application.CfgOverrider ,
247
+ stop chan bool ,
248
+ testingMode bool ,
249
+ fleetInitTimeout time.Duration ,
250
+ upgradeDetailsFromMarker * details.Details ,
251
+ modifiers ... component.PlatformModifier ,
252
+ ) error {
253
+ logLvl := logger .DefaultLogLevel
254
+ if cfg .Settings .LoggingConfig != nil {
255
+ logLvl = cfg .Settings .LoggingConfig .Level
256
+ }
257
+
203
258
// try early to check if running as root
204
259
isRoot , err := utils .HasRoot ()
205
260
if err != nil {
206
- return logReturn ( l , fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err ) )
261
+ return fmt .Errorf ("failed to check for root/Administrator privileges: %w" , err )
207
262
}
208
263
209
264
l .Infow ("Elastic Agent started" ,
@@ -213,7 +268,7 @@ func runElasticAgent(
213
268
214
269
cfg , err = tryDelayEnroll (ctx , l , cfg , override )
215
270
if err != nil {
216
- return logReturn ( l , errors .New (err , "failed to perform delayed enrollment" ) )
271
+ return errors .New (err , "failed to perform delayed enrollment" )
217
272
}
218
273
219
274
// agent ID needs to stay empty in bootstrap mode
@@ -225,31 +280,31 @@ func runElasticAgent(
225
280
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
226
281
err = secret .CreateAgentSecret (ctx , vault .WithUnprivileged (! isRoot ))
227
282
if err != nil {
228
- return logReturn ( l , fmt .Errorf ("failed to read/write secrets: %w" , err ) )
283
+ return fmt .Errorf ("failed to read/write secrets: %w" , err )
229
284
}
230
285
231
286
// Migrate .yml files if the corresponding .enc does not exist
232
287
233
288
// the encrypted config does not exist but the unencrypted file does
234
289
err = migration .MigrateToEncryptedConfig (ctx , l , paths .AgentConfigYmlFile (), paths .AgentConfigFile ())
235
290
if err != nil {
236
- return logReturn ( l , errors .New (err , "error migrating fleet config" ) )
291
+ return errors .New (err , "error migrating fleet config" )
237
292
}
238
293
239
294
// the encrypted state does not exist but the unencrypted file does
240
295
err = migration .MigrateToEncryptedConfig (ctx , l ,
241
296
paths .AgentStateStoreYmlFile (),
242
297
paths .AgentStateStoreFile ())
243
298
if err != nil {
244
- return logReturn ( l , errors .New (err , "error migrating agent state" ) )
299
+ return errors .New (err , "error migrating agent state" )
245
300
}
246
301
247
302
agentInfo , err := info .NewAgentInfoWithLog (ctx , defaultLogLevel (cfg , logLvl .String ()), createAgentID )
248
303
if err != nil {
249
- return logReturn ( l , errors .New (err ,
304
+ return errors .New (err ,
250
305
"could not load agent info" ,
251
306
errors .TypeFilesystem ,
252
- errors .M (errors .MetaKeyPath , paths .AgentConfigFile ())))
307
+ errors .M (errors .MetaKeyPath , paths .AgentConfigFile ()))
253
308
}
254
309
255
310
// Ensure that the log level now matches what is configured in the agentInfo.
@@ -275,14 +330,14 @@ func runElasticAgent(
275
330
276
331
execPath , err := reexecPath ()
277
332
if err != nil {
278
- return logReturn ( l , fmt .Errorf ("failed to get reexec path: %w" , err ) )
333
+ return fmt .Errorf ("failed to get reexec path: %w" , err )
279
334
}
280
335
rexLogger := l .Named ("reexec" )
281
336
rex := reexec .NewManager (rexLogger , execPath )
282
337
283
338
tracer , err := initTracer (agentName , release .Version (), cfg .Settings .MonitoringConfig )
284
339
if err != nil {
285
- return logReturn ( l , fmt .Errorf ("could not initiate APM tracer: %w" , err ) )
340
+ return fmt .Errorf ("could not initiate APM tracer: %w" , err )
286
341
}
287
342
if tracer != nil {
288
343
l .Info ("APM instrumentation enabled" )
@@ -298,12 +353,12 @@ func runElasticAgent(
298
353
coord , configMgr , _ , err := application .New (ctx , l , baseLogger , logLvl , agentInfo , rex , tracer , testingMode ,
299
354
fleetInitTimeout , isBootstrap , override , upgradeDetailsFromMarker , modifiers ... )
300
355
if err != nil {
301
- return logReturn ( l , err )
356
+ return err
302
357
}
303
358
304
359
monitoringServer , err := setupMetrics (l , cfg .Settings .DownloadConfig .OS (), cfg .Settings .MonitoringConfig , tracer , coord )
305
360
if err != nil {
306
- return logReturn ( l , err )
361
+ return err
307
362
}
308
363
coord .RegisterMonitoringServer (monitoringServer )
309
364
defer func () {
@@ -327,7 +382,7 @@ func runElasticAgent(
327
382
328
383
// start the control listener
329
384
if err := control .Start (); err != nil {
330
- return logReturn ( l , err )
385
+ return err
331
386
}
332
387
defer control .Stop ()
333
388
@@ -410,7 +465,7 @@ LOOP:
410
465
if isRex {
411
466
rex .ShutdownComplete ()
412
467
}
413
- return logReturn ( l , err )
468
+ return err
414
469
}
415
470
416
471
func loadConfig (ctx context.Context , override application.CfgOverrider ) (* configuration.Configuration , error ) {
0 commit comments