Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; an 80ish-character description of the change.
summary: Improve logging to catch early errors on startup

# Long description; in case the summary is not enough to describe the change,
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/10158

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/9099
124 changes: 88 additions & 36 deletions internal/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package cmd

import (
"context"
goerrors "errors"
"fmt"
"net/url"
"os"
Expand Down Expand Up @@ -87,7 +88,6 @@ func newRunCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
testingMode, _ := cmd.Flags().GetBool("testing-mode")
if err := run(nil, testingMode, fleetInitTimeout); err != nil && !errors.Is(err, context.Canceled) {
fmt.Fprintf(streams.Err, "Error: %v\n%s\n", err, troubleshootMessage())
logExternal(fmt.Sprintf("%s run failed: %s", paths.BinaryName, err))
return err
}
return nil
Expand Down Expand Up @@ -140,51 +140,74 @@ func run(override application.CfgOverrider, testingMode bool, fleetInitTimeout t
defer cancel()
go service.ProcessWindowsControlEvents(stopBeat)

upgradeDetailsFromMarker, err := handleUpgrade()
if err != nil {
return fmt.Errorf("error checking for and handling upgrade: %w", err)
}

locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
if err := locker.TryLock(); err != nil {
return err
}
defer func() {
_ = locker.Unlock()
}()

return runElasticAgent(ctx, cancel, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return runElasticAgentCritical(ctx, cancel, override, stop, testingMode, fleetInitTimeout, modifiers...)
}

// logReturn reports err through both the agent logger and logExternal, then
// hands the same err back to the caller unchanged. Nothing is reported when
// err is nil or matches context.Canceled.
func logReturn(l *logger.Logger, err error) error {
	if err == nil || errors.Is(err, context.Canceled) {
		// nothing worth reporting; pass the value straight through
		return err
	}

	l.Errorf("%s", err)

	externalMsg := fmt.Sprintf("%s run failed: %s", paths.BinaryName, err)
	logExternal(externalMsg)

	return err
}

func runElasticAgent(
// runElasticAgentCritical provides a critical path to running runElasticAgent; it exhausts all efforts to log any
// errors so that any issues are captured in the logs.
func runElasticAgentCritical(
ctx context.Context,
cancel context.CancelFunc,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
cfg, err := loadConfig(ctx, override)
var errs []error

// early handleUpgrade, but don't error yet
upgradeDetailsFromMarker, err := handleUpgrade()
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to handle upgrade: %w", err))
}

logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
// single run, but don't error yet; a lock failure is only recorded in errs here
locker := filelock.NewAppLocker(paths.Data(), paths.AgentLockFileName)
lockErr := locker.TryLock()
if lockErr != nil {
	// wrap lockErr, not err: err still holds the handleUpgrade result and may be nil,
	// which would drop the real cause of the lock failure from the message
	errs = append(errs, fmt.Errorf("failed to get app lock: %w", lockErr))
}
defer func() {
	// best-effort release; the Unlock error is intentionally ignored
	_ = locker.Unlock()
}()

// try load config, but don't error yet
cfg, err := loadConfig(ctx, override)
if err != nil {
// failed to load configuration, just load the default to create the logger
errs = append(errs, fmt.Errorf("failed to load configuration: %w", err))
cfg = configuration.DefaultConfiguration()
}

baseLogger, err := logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
return err
errs = append(errs, fmt.Errorf("failed to create logger: %w", err))

// failed to create the baseLogger, this comes from the configuration being possibly invalid
// switch to a default config and try again
cfg = configuration.DefaultConfiguration()
baseLogger, err = logger.NewFromConfig("", cfg.Settings.LoggingConfig, cfg.Settings.EventLoggingConfig, true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with default configuration: %w", err))

// this really should not happen, but this whole critical function is very defensive
baseLogger, err = logger.New("", true)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create logger with no configuration: %w", err))

// again? no way, but you never know
baseLogger = logger.NewWithoutConfig("")
}
}
}

// Make sure to flush any buffered logs before we're done.
Expand All @@ -194,10 +217,39 @@ func runElasticAgent(
"source": agentName,
})

// at this point the logger is working, so any errors that we hit can now be logged and returned
if len(errs) > 0 {
return logReturn(l, goerrors.Join(errs...))
}

// actually run the agent now
err = runElasticAgent(ctx, cancel, baseLogger, l, cfg, override, stop, testingMode, fleetInitTimeout, upgradeDetailsFromMarker, modifiers...)
return logReturn(l, err)
}

// runElasticAgent runs the actual Elastic Agent.
func runElasticAgent(
ctx context.Context,
cancel context.CancelFunc,
baseLogger *logger.Logger,
l *logger.Logger,
cfg *configuration.Configuration,
override application.CfgOverrider,
stop chan bool,
testingMode bool,
fleetInitTimeout time.Duration,
upgradeDetailsFromMarker *details.Details,
modifiers ...component.PlatformModifier,
) error {
logLvl := logger.DefaultLogLevel
if cfg.Settings.LoggingConfig != nil {
logLvl = cfg.Settings.LoggingConfig.Level
}

// try early to check if running as root
isRoot, err := utils.HasRoot()
if err != nil {
return logReturn(l, fmt.Errorf("failed to check for root/Administrator privileges: %w", err))
return fmt.Errorf("failed to check for root/Administrator privileges: %w", err)
}

l.Infow("Elastic Agent started",
Expand All @@ -207,7 +259,7 @@ func runElasticAgent(

cfg, err = tryDelayEnroll(ctx, l, cfg, override)
if err != nil {
return logReturn(l, errors.New(err, "failed to perform delayed enrollment"))
return errors.New(err, "failed to perform delayed enrollment")
}
pathConfigFile := paths.AgentConfigFile()

Expand All @@ -223,31 +275,31 @@ func runElasticAgent(
// that writes the agentID into fleet.enc (encrypted fleet.yml) before even loading the configuration.
err = secret.CreateAgentSecret(ctx, vault.WithUnprivileged(!isRoot))
if err != nil {
return logReturn(l, fmt.Errorf("failed to read/write secrets: %w", err))
return fmt.Errorf("failed to read/write secrets: %w", err)
}

// Migrate .yml files if the corresponding .enc does not exist

// the encrypted config does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l, paths.AgentConfigYmlFile(), paths.AgentConfigFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating fleet config"))
return errors.New(err, "error migrating fleet config")
}

// the encrypted state does not exist but the unencrypted file does
err = migration.MigrateToEncryptedConfig(ctx, l,
paths.AgentStateStoreYmlFile(),
paths.AgentStateStoreFile())
if err != nil {
return logReturn(l, errors.New(err, "error migrating agent state"))
return errors.New(err, "error migrating agent state")
}

agentInfo, err := info.NewAgentInfoWithLog(ctx, defaultLogLevel(cfg, logLvl.String()), createAgentID)
if err != nil {
return logReturn(l, errors.New(err,
return errors.New(err,
"could not load agent info",
errors.TypeFilesystem,
errors.M(errors.MetaKeyPath, pathConfigFile)))
errors.M(errors.MetaKeyPath, pathConfigFile))
}

// Ensure that the log level now matches what is configured in the agentInfo.
Expand All @@ -273,14 +325,14 @@ func runElasticAgent(

execPath, err := reexecPath()
if err != nil {
return logReturn(l, fmt.Errorf("failed to get reexec path: %w", err))
return fmt.Errorf("failed to get reexec path: %w", err)
}
rexLogger := l.Named("reexec")
rex := reexec.NewManager(rexLogger, execPath)

tracer, err := initTracer(agentName, release.Version(), cfg.Settings.MonitoringConfig)
if err != nil {
return logReturn(l, fmt.Errorf("could not initiate APM tracer: %w", err))
return fmt.Errorf("could not initiate APM tracer: %w", err)
}
if tracer != nil {
l.Info("APM instrumentation enabled")
Expand All @@ -296,12 +348,12 @@ func runElasticAgent(
coord, configMgr, _, err := application.New(ctx, l, baseLogger, logLvl, agentInfo, rex, tracer, testingMode,
fleetInitTimeout, isBootstrap, override, upgradeDetailsFromMarker, modifiers...)
if err != nil {
return logReturn(l, err)
return err
}

monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
if err != nil {
return logReturn(l, err)
return err
}
coord.RegisterMonitoringServer(monitoringServer)
defer func() {
Expand All @@ -325,7 +377,7 @@ func runElasticAgent(

// start the control listener
if err := control.Start(); err != nil {
return logReturn(l, err)
return err
}
defer control.Stop()

Expand Down Expand Up @@ -408,7 +460,7 @@ LOOP:
if isRex {
rex.ShutdownComplete()
}
return logReturn(l, err)
return err
}

func loadConfig(ctx context.Context, override application.CfgOverrider) (*configuration.Configuration, error) {
Expand Down