diff --git a/api/v1beta1/hetznerbaremetalmachine_types.go b/api/v1beta1/hetznerbaremetalmachine_types.go index 8151745e7..a671c9806 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types.go +++ b/api/v1beta1/hetznerbaremetalmachine_types.go @@ -187,6 +187,11 @@ type Image struct { // URL defines the remote URL for downloading a tar, tar.gz, tar.bz, tar.bz2, tar.xz, tgz, tbz, txz image. URL string `json:"url,omitempty"` + // UseCustomImageURLCommand makes the controller use the command provided by `--baremetal-image-url-command` instead of installimage. + // Docs: https://syself.com/docs/caph/developers/image-url-command + // +optional + UseCustomImageURLCommand bool `json:"useCustomImageURLCommand"` + // Name defines the archive name after download. This has to be a valid name for Installimage. Name string `json:"name,omitempty"` @@ -197,6 +202,9 @@ type Image struct { // GetDetails returns the path of the image and whether the image has to be downloaded. func (image Image) GetDetails() (imagePath string, needsDownload bool, errorMessage string) { // If image is set, then the URL is also set and we have to download a remote file + if image.UseCustomImageURLCommand { + return "", false, "internal error: image.UseCustomImageURLCommand is active. Method GetDetails() should be used for the traditional way (without image-url-command)." + } switch { case image.Name != "" && image.URL != "": suffix, err := GetImageSuffix(image.URL) diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml index ebf753262..e347061c3 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalhosts.yaml @@ -396,6 +396,11 @@ spec: a tar, tar.gz, tar.bz, tar.bz2, tar.xz, tgz, tbz, txz image. 
type: string + useCustomImageURLCommand: + description: |- + UseCustomImageURLCommand makes the controller use the command provided by `--baremetal-image-url-command` instead of installimage. + Docs: https://syself.com/docs/caph/developers/image-url-command + type: boolean type: object logicalVolumeDefinitions: description: LVMDefinitions defines the logical volume definitions diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml index 80ed05595..9d7cfe261 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachines.yaml @@ -155,6 +155,11 @@ spec: description: URL defines the remote URL for downloading a tar, tar.gz, tar.bz, tar.bz2, tar.xz, tgz, tbz, txz image. type: string + useCustomImageURLCommand: + description: |- + UseCustomImageURLCommand makes the controller use the command provided by `--baremetal-image-url-command` instead of installimage. + Docs: https://syself.com/docs/caph/developers/image-url-command + type: boolean type: object logicalVolumeDefinitions: description: LVMDefinitions defines the logical volume definitions diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachinetemplates.yaml index f4458d0f7..c00a8329f 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hetznerbaremetalmachinetemplates.yaml @@ -142,6 +142,11 @@ spec: a tar, tar.gz, tar.bz, tar.bz2, tar.xz, tgz, tbz, txz image. type: string + useCustomImageURLCommand: + description: |- + UseCustomImageURLCommand makes the controller use the command provided by `--baremetal-image-url-command` instead of installimage. 
+ Docs: https://syself.com/docs/caph/developers/image-url-command + type: boolean type: object logicalVolumeDefinitions: description: LVMDefinitions defines the logical volume diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index df2a2b3d6..74d2ca0d3 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -23,9 +23,11 @@ import ( "reflect" "time" + "github.com/google/go-cmp/cmp" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" @@ -47,6 +49,7 @@ import ( robotclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/robot" sshclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/ssh" "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/host" + "github.com/syself/cluster-api-provider-hetzner/pkg/utils" ) // HetznerBareMetalHostReconciler reconciles a HetznerBareMetalHost object. @@ -59,6 +62,7 @@ type HetznerBareMetalHostReconciler struct { WatchFilterValue string PreProvisionCommand string SSHAfterInstallImage bool + ImageURLCommand string } //+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=hetznerbaremetalhosts,verbs=get;list;watch;create;update;patch;delete @@ -88,6 +92,58 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl return reconcile.Result{}, err } + // ---------------------------------------------------------------- + // Start: avoid conflict errors. 
Wait until local cache is up-to-date + // Won't be needed once this was implemented: + // https://github.com/kubernetes-sigs/controller-runtime/issues/3320 + initialHost := bmHost.DeepCopy() + defer func() { + // We can potentially optimize this further by ensuring that the cache is up to date only in + // the cases where an outdated cache would lead to problems. Currently, we ensure that the + // cache is up to date in all cases, i.e. for all possible changes to the + // HetznerBareMetalHost object. + if cmp.Equal(initialHost, bmHost) { + // Nothing has changed. No need to wait. + return + } + startReadOwnWrite := time.Now() + + // The object changed. Wait until the new version is in the local cache + + // Get the latest version from the apiserver. + apiserverHost := &infrav1.HetznerBareMetalHost{} + + // Use uncached APIReader + err := r.APIReader.Get(ctx, client.ObjectKeyFromObject(bmHost), apiserverHost) + if err != nil { + reterr = errors.Join(reterr, + fmt.Errorf("failed get HetznerBareMetalHost via uncached APIReader: %w", err)) + return + } + + apiserverRV := apiserverHost.ResourceVersion + + err = wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, 3*time.Second, true, func(ctx context.Context) (done bool, err error) { + // new resource, read from local cache + latestFromLocalCache := &infrav1.HetznerBareMetalHost{} + getErr := r.Get(ctx, client.ObjectKeyFromObject(apiserverHost), latestFromLocalCache) + if apierrors.IsNotFound(getErr) { + // the object was deleted. All is fine. + return true, nil + } + if getErr != nil { + return false, getErr + } + return utils.IsLocalCacheUpToDate(latestFromLocalCache.ResourceVersion, apiserverRV), nil + }) + if err != nil { + log.Error(err, "cache sync failed after BootState change") + } + log.Info("Wait for update being in local cache", "durationWaitForLocalCacheSync", time.Since(startReadOwnWrite).Round(time.Millisecond)) + }() + // End: avoid conflict errors. 
Wait until local cache is up-to-date + ---------------------------------------------------------------- + initialProvisioningState := bmHost.Spec.Status.ProvisioningState defer func() { if initialProvisioningState != bmHost.Spec.Status.ProvisioningState { @@ -203,6 +259,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl RescueSSHSecret: rescueSSHSecret, SecretManager: secretManager, PreProvisionCommand: r.PreProvisionCommand, + ImageURLCommand: r.ImageURLCommand, SSHAfterInstallImage: r.SSHAfterInstallImage, }) if err != nil { diff --git a/docs/caph/04-developers/06-image-url-command.md b/docs/caph/04-developers/06-image-url-command.md index d7d2476af..6c3728b40 100644 --- a/docs/caph/04-developers/06-image-url-command.md +++ b/docs/caph/04-developers/06-image-url-command.md @@ -5,21 +5,29 @@ sidebar: image-url-command description: Documentation on the CAPH image-url-command --- -The `--hcloud-image-url-command` for the caph controller can be used to execute a custom command to -install the node image. +The `--hcloud-image-url-command` and `--baremetal-image-url-command` for the caph controller can be +used to execute a custom command to install the node image. This provides you a flexible way to create nodes. -The script/binary will be copied into the Hetzner Rescue System and executed. +The script/binary will be copied into the rescue system and executed. You need to enable two things: * The caph binary must get argument. Example: - `--hcloud-image-url-command=/shared/image-url-command.sh` -* The hcloudmachine resource must have spec.imageURL set (usually via a hcloudmachinetemplate) + `--[hcloud|baremetal]-image-url-command=/shared/image-url-command.sh` +* for hcloud: The hcloudmachine resource must have spec.imageURL set (usually via a + hcloudmachinetemplate) +* for baremetal: The hetznerbaremetal resource must use `useCustomImageURLCommand: true`. 
-The command will get the imageURL, bootstrap-data and machine-name of the corresponding -hcloudmachine as argument. +The command will get the imageURL, bootstrap-data, machine-name of the corresponding +machine and the root devices (separated by spaces) as argument. + +Example: + +```bash +/root/image-url-command oci://example.com/yourimage:v1 /root/bootstrap.data my-md-bm-kh57r-5z2v8-zdfc9 'sda sdb' +``` It is up to the command to download from that URL and provision the disk accordingly. This command must be accessible by the controller pod. You can use an initContainer to copy the command to a @@ -36,7 +44,7 @@ A Kubernetes event will be created in both (success, failure) cases containing t and stderr) of the script. If the script takes longer than 7 minutes, the controller cancels the provisioning. -We measured these durations: +We measured these durations for hcloud: | oldState | newState | avg(s) | min(s) | max(s) | |----------|----------|-------:|-------:|-------:| diff --git a/main.go b/main.go index 2ea93f041..f70ec14e9 100644 --- a/main.go +++ b/main.go @@ -85,7 +85,8 @@ var ( syncPeriod time.Duration rateLimitWaitTime time.Duration preProvisionCommand string - imageURLCommand string + hcloudImageURLCommand string + baremetalImageURLCommand string skipWebhooks bool sshAfterInstallImage bool ) @@ -108,7 +109,8 @@ func main() { fs.DurationVar(&rateLimitWaitTime, "rate-limit", 5*time.Minute, "The rate limiting for HCloud controller (e.g. 5m)") fs.BoolVar(&hcloudclient.DebugAPICalls, "debug-hcloud-api-calls", false, "Debug all calls to the hcloud API.") fs.StringVar(&preProvisionCommand, "pre-provision-command", "", "Command to run (in rescue-system) before installing the image on bare metal servers. You can use that to check if the machine is healthy before installing the image. If the exit value is non-zero, the machine is considered unhealthy. This command must be accessible by the controller pod. 
You can use an initContainer to copy the command to a shared emptyDir.") - fs.StringVar(&imageURLCommand, "hcloud-image-url-command", "", "Command to run (in rescue-system) to provision an hcloud machine. The command will get the imageURL, bootstrap-data and machine-name of the corresponding hcloudmachine as argument. It is up to the command to download from that URL and provision the disk accordingly. This command must be accessible by the controller pod. You can use an initContainer to copy the command to a shared emptyDir. The env var OCI_REGISTRY_AUTH_TOKEN from the caph process will be set for the command, too. The command must end with the last line containing IMAGE_URL_DONE. Otherwise the execution is considered to have failed. Docs: https://syself.com/docs/caph/developers/image-url-command") + fs.StringVar(&hcloudImageURLCommand, "hcloud-image-url-command", "", "Command to run (in rescue-system) to provision an hcloud machine. Docs: https://syself.com/docs/caph/developers/image-url-command") + fs.StringVar(&baremetalImageURLCommand, "baremetal-image-url-command", "", "Command to run (in rescue-system) to provision a baremetal machine. Docs: https://syself.com/docs/caph/developers/image-url-command") fs.BoolVar(&skipWebhooks, "skip-webhooks", false, "Skip setting up of webhooks. Together with --leader-elect=false, you can use `go run main.go` to run CAPH in a cluster connected via KUBECONFIG. You should scale down the caph deployment to 0 before doing that. This is only for testing!") fs.BoolVar(&sshAfterInstallImage, "baremetal-ssh-after-install-image", true, "Connect to the baremetal machine after install-image and ensure it is provisioned. Current default is true, but we might change that to false. Background: Users might not want the controller to be able to ssh onto the servers") @@ -133,22 +135,38 @@ func main() { } } - // If ImageURLCommand is set, check if the file exists and validate the basename. 
- if imageURLCommand != "" { - baseName := filepath.Base(imageURLCommand) + // If hcloudImageURLCommand is set, check if the file exists and validate the basename. + if hcloudImageURLCommand != "" { + baseName := filepath.Base(hcloudImageURLCommand) if !commandRegex.MatchString(baseName) { msg := fmt.Sprintf("basename (%s) must match the regex %s", baseName, commandRegex.String()) setupLog.Error(errors.New(msg), "") os.Exit(1) } - _, err := os.Stat(imageURLCommand) + _, err := os.Stat(hcloudImageURLCommand) if err != nil { setupLog.Error(err, "hcloud-image-url-command not found") os.Exit(1) } } + // If baremetalImageURLCommand is set, check if the file exists and validate the basename. + if baremetalImageURLCommand != "" { + baseName := filepath.Base(baremetalImageURLCommand) + if !commandRegex.MatchString(baseName) { + msg := fmt.Sprintf("basename (%s) must match the regex %s", baseName, commandRegex.String()) + setupLog.Error(errors.New(msg), "") + os.Exit(1) + } + + _, err := os.Stat(baremetalImageURLCommand) + if err != nil { + setupLog.Error(err, "baremetal-image-url-command not found") + os.Exit(1) + } + } + var watchNamespaces map[string]cache.Config if watchNamespace != "" { watchNamespaces = map[string]cache.Config{ @@ -215,7 +233,7 @@ func main() { HCloudClientFactory: hcloudClientFactory, SSHClientFactory: sshclient.NewFactory(), WatchFilterValue: watchFilterValue, - ImageURLCommand: imageURLCommand, + ImageURLCommand: hcloudImageURLCommand, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: hcloudMachineConcurrency}); err != nil { setupLog.Error(err, "unable to create controller", "controller", "HCloudMachine") os.Exit(1) @@ -240,6 +258,7 @@ func main() { RateLimitWaitTime: rateLimitWaitTime, WatchFilterValue: watchFilterValue, PreProvisionCommand: preProvisionCommand, + ImageURLCommand: baremetalImageURLCommand, SSHAfterInstallImage: sshAfterInstallImage, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 
hetznerBareMetalHostConcurrency}); err != nil { setupLog.Error(err, "unable to create controller", "controller", "HetznerBareMetalHost") diff --git a/pkg/scope/baremetalhost.go b/pkg/scope/baremetalhost.go index 4e56361e7..e8c7df003 100644 --- a/pkg/scope/baremetalhost.go +++ b/pkg/scope/baremetalhost.go @@ -48,6 +48,7 @@ type BareMetalHostScopeParams struct { RescueSSHSecret *corev1.Secret SecretManager *secretutil.SecretManager PreProvisionCommand string + ImageURLCommand string SSHAfterInstallImage bool } @@ -101,6 +102,7 @@ func NewBareMetalHostScope(params BareMetalHostScopeParams) (*BareMetalHostScope cluster: params.Cluster, hetznerCluster: params.HetznerCluster, }, + ImageURLCommand: params.ImageURLCommand, }, nil } @@ -120,6 +122,7 @@ type BareMetalHostScope struct { PreProvisionCommand string SSHAfterInstallImage bool WorkloadClusterClientFactory WorkloadClusterClientFactory + ImageURLCommand string } // Name returns the HetznerCluster name. diff --git a/pkg/scope/cluster.go b/pkg/scope/cluster.go index 8b285151f..9c73dd40d 100644 --- a/pkg/scope/cluster.go +++ b/pkg/scope/cluster.go @@ -37,13 +37,14 @@ import ( // ClusterScopeParams defines the input parameters used to create a new scope. type ClusterScopeParams struct { - Client client.Client - APIReader client.Reader - Logger logr.Logger - HetznerSecret *corev1.Secret - HCloudClient hcloudclient.Client - Cluster *clusterv1.Cluster - HetznerCluster *infrav1.HetznerCluster + Client client.Client + APIReader client.Reader + Logger logr.Logger + HetznerSecret *corev1.Secret + HCloudClient hcloudclient.Client + Cluster *clusterv1.Cluster + HetznerCluster *infrav1.HetznerCluster + ImageURLCommand string } // NewClusterScope creates a new Scope from the supplied parameters. 
diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 925ea2c72..52c8be2ab 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -635,7 +635,7 @@ func (s *Service) actionRegistering(ctx context.Context) actionResult { timeSinceReboot := "unknown" if s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated != nil { - timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).String() + timeSinceReboot = time.Since(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated.Time).Round(time.Second).String() } s.scope.Logger.Info("Could not reach rescue system. Will retry some seconds later.", "out", out.String(), "hostName", hostName, @@ -1195,6 +1195,9 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { return actionStop{} } + if s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.UseCustomImageURLCommand { + return s.actionImageInstallingCustomImageURLCommand(ctx, sshClient) + } state, err := sshClient.GetInstallImageState() if err != nil { return actionError{err: fmt.Errorf("failed to get state of installimage processes: %w", err)} @@ -1215,6 +1218,133 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult { } } +func (s *Service) actionImageInstallingCustomImageURLCommand(ctx context.Context, sshClient sshclient.Client) actionResult { + host := s.scope.HetznerBareMetalHost + + state, logFile, err := sshClient.StateOfImageURLCommand() + if err != nil { + return actionError{err: fmt.Errorf("StateOfImageURLCommand failed: %w", err)} + } + + duration := time.Since(host.Spec.Status.LastUpdated.Time) + // Please keep the number (7) in sync with the docstring of ImageURL. + if duration > 7*time.Minute { + // timeout. Something has failed. + msg := fmt.Sprintf("ImageURLCommand timed out after %s. 
Deleting machine", + duration.Round(time.Second).String()) + s.scope.Logger.Error(nil, msg, "logFile", logFile) + conditions.MarkFalse(host, infrav1.ProvisionSucceededCondition, + "ImageURLCommandTimedOut", clusterv1.ConditionSeverityWarning, + "%s", msg) + return s.recordActionFailure(infrav1.FatalError, msg) + } + + switch state { + case sshclient.ImageURLCommandStateRunning: + return actionContinue{delay: 10 * time.Second} + + case sshclient.ImageURLCommandStateFinishedSuccessfully: + record.Event(s.scope.HetznerBareMetalHost, "ImageURLCommandOutput", logFile) + s.scope.Logger.Info("ImageURLCommandOutput", "logFile", logFile) + + // Update name in robot API + if _, err := s.scope.RobotClient.SetBMServerName(s.scope.HetznerBareMetalHost.Spec.ServerID, s.scope.Hostname()); err != nil { + record.Warn(s.scope.HetznerBareMetalHost, "SetBMServerNameFailed", err.Error()) + s.handleRobotRateLimitExceeded(err, "SetBMServerName") + return actionError{err: fmt.Errorf("failed to update name of host in robot API: %w", err)} + } + + // Reboot via SSH + if err := sshClient.Reboot().Err; err != nil { + err = fmt.Errorf("failed to reboot server (after install-image): %w", err) + record.Warn(s.scope.HetznerBareMetalHost, "RebootFailed", err.Error()) + return actionError{err: err} + } + + msg := "machine image and cloud-init data got installed (via image-url-command)" + createSSHRebootEvent(ctx, s.scope.HetznerBareMetalHost, msg) + + // clear potential errors - all done + s.scope.HetznerBareMetalHost.ClearError() + return actionComplete{} + + case sshclient.ImageURLCommandStateFailed: + record.Warn(s.scope.HetznerBareMetalHost, "InstallImageNotSuccessful", logFile) + msg := "image-url-command failed" + s.scope.Logger.Error(nil, msg, "logFile", logFile) + conditions.MarkFalse(host, infrav1.ProvisionSucceededCondition, + "ImageURLCommandFailed", clusterv1.ConditionSeverityWarning, + "%s", msg) + return s.recordActionFailure(infrav1.FatalError, msg) + + case 
sshclient.ImageURLCommandStateNotStarted: + data, err := s.scope.GetRawBootstrapData(ctx) + if err != nil { + return actionError{err: fmt.Errorf("baremetal GetRawBootstrapData failed: %w", err)} + } + + if s.scope.ImageURLCommand == "" { + err = errors.New("internal error: --baremetal-image-url-command is not set?") + s.scope.Logger.Error(err, "") + conditions.MarkFalse(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition, + "ImageURLCommandMissing", + clusterv1.ConditionSeverityError, + "%s", err.Error()) + // this can only be changed by updating the controller. This will make the + // controller reconcile all resources. + return actionContinue{delay: time.Hour} + } + + // get the information about storage devices again to have the latest names. + // Device names can change during restart. + storage, err := obtainHardwareDetailsStorage(sshClient) + if err != nil { + return actionError{err: fmt.Errorf("failed to obtain hardware details storage: %w", err)} + } + + // get device names from storage device + deviceNames := getDeviceNames(s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN(), storage) + + exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, s.scope.ImageURLCommand, s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.URL, data, s.scope.Hostname(), deviceNames) + if err != nil { + err := fmt.Errorf("StartImageURLCommand failed (retrying): %w", err) + // This could be a temporary network error. Retry. + s.scope.Logger.Error(err, "", + "ImageURLCommand", s.scope.ImageURLCommand, + "exitStatus", exitStatus, + "stdoutStderr", stdoutStderr) + conditions.MarkFalse(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition, + "ImageURLCommandFailedToStart", + clusterv1.ConditionSeverityWarning, + "%s", err.Error()) + return actionError{err: err} + } + + if exitStatus != 0 { + msg := "StartImageURLCommand failed with non-zero exit status. 
Deleting machine" + s.scope.Logger.Error(nil, msg, + "ImageURLCommand", s.scope.ImageURLCommand, + "exitStatus", exitStatus, + "stdoutStderr", stdoutStderr) + conditions.MarkFalse(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition, + "StartImageURLCommandFailed", + clusterv1.ConditionSeverityWarning, + "%s", msg) + return s.recordActionFailure(infrav1.ProvisioningError, msg) + } + + conditions.MarkFalse(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition, + "ImageURLCommandStarted", + clusterv1.ConditionSeverityInfo, + "baremetal-image-url-command started") + + return actionContinue{delay: 55 * time.Second} + + default: + return actionError{err: fmt.Errorf("unknown ImageURLCommandState: %q", state)} + } +} + func (s *Service) actionImageInstallingStartBackgroundProcess(ctx context.Context, sshClient sshclient.Client) actionResult { // CheckDisk before accessing the disk info, err := sshClient.CheckDisk(ctx, s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN()) @@ -1485,6 +1615,7 @@ func (s *Service) createAutoSetupInput(sshClient sshclient.Client) (autoSetupInp } // get the information about storage devices again to have the latest names which are then taken for installimage + // Device names can change during restart. 
storage, err := obtainHardwareDetailsStorage(sshClient) if err != nil { return autoSetupInput{}, actionError{err: fmt.Errorf("failed to obtain hardware details storage: %w", err)} diff --git a/pkg/services/baremetal/host/host_suite_test.go b/pkg/services/baremetal/host/host_suite_test.go index 52ded0140..1addc170e 100644 --- a/pkg/services/baremetal/host/host_suite_test.go +++ b/pkg/services/baremetal/host/host_suite_test.go @@ -34,6 +34,7 @@ import ( infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" + secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" robotclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/robot" sshclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/ssh" "github.com/syself/cluster-api-provider-hetzner/test/helpers" @@ -148,6 +149,7 @@ func newTestService( &scope.BareMetalHostScope{ Logger: log, Client: c, + SecretManager: secretutil.NewSecretManager(log, c, c), SSHClientFactory: sshClientFactory, RobotClient: robotClient, HetznerBareMetalHost: host, @@ -163,6 +165,7 @@ func newTestService( WorkloadClusterClientFactory: &fakeWorkloadClusterClientFactory{ client: c, }, + ImageURLCommand: "image-url-command", }, } } diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 54e5026fa..98e5ada94 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -100,6 +100,152 @@ var _ = Describe("SetErrorMessage", func() { ) }) +var _ = Describe("actionImageInstalling (image-url-command)", func() { + ctx := context.Background() + + newBaseHost := func() *infrav1.HetznerBareMetalHost { + host := helpers.BareMetalHost( + "test-host", + "default", + helpers.WithIPv4(), + helpers.WithConsumerRef(), + helpers.WithSSHStatus(), + ) + // Set install image with custom image-url-command mode + 
host.Spec.Status.InstallImage = &infrav1.InstallImage{ + Image: infrav1.Image{ + URL: "https://example.com/foo/image", + UseCustomImageURLCommand: true, + }, + } + // Ensure LastUpdated is now by default + t := metav1.Now() + host.Spec.Status.LastUpdated = &t + return host + } + + It("returns continue when command is running", func() { + host := newBaseHost() + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateRunning, "", nil) + + svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionContinue{})) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(Equal(`host (test-host) is still provisioning - state "image-installing"`)) + }) + + It("reboots and completes when command finished successfully", func() { + host := newBaseHost() + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateFinishedSuccessfully, "LOGFILE-CONTENT", nil) + sshMock.On("Reboot").Return(sshclient.Output{}) + + robot := robotmock.Client{} + robot.On("SetBMServerName", mock.Anything, infrav1.BareMetalHostNamePrefix+host.Spec.ConsumerRef.Name).Return(nil, nil) + + svc := newTestService(host, &robot, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionComplete{})) + Expect(sshMock.AssertCalled(GinkgoT(), "Reboot")).To(BeTrue()) + Expect(robot.AssertCalled(GinkgoT(), "SetBMServerName", mock.Anything, infrav1.BareMetalHostNamePrefix+host.Spec.ConsumerRef.Name)).To(BeTrue()) + // error should be cleared + 
Expect(host.Spec.Status.ErrorMessage).To(Equal("")) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(Equal(`host (test-host) is still provisioning - state "image-installing"`)) + }) + + It("returns error when command failed", func() { + host := newBaseHost() + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateFailed, "some logs", nil) + + svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionFailed{})) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(ContainSubstring("image-url-command failed")) + }) + + It("starts the command on NotStarted and continues", func() { + host := newBaseHost() + // set UserData secret ref and create the secret the scope's SecretManager will fetch + host.Spec.Status.UserData = &corev1.SecretReference{ // bootstrap secret ref + Name: "bootstrap-secret", + Namespace: host.Namespace, + } + + // Build service with fake client containing the bootstrap secret + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateNotStarted, "", nil) + + svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + sshMock.On("StartImageURLCommand", mock.Anything, "image-url-command", host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(0, "", nil) + // Create bootstrap secret in fake client with key 'value' + secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: 
host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} + Expect(svc.scope.Client.Create(ctx, secret)).To(Succeed()) + + sshMock.On("GetHardwareDetailsStorage").Return(sshclient.Output{StdOut: `NAME="nvme1n1" TYPE="disk" HCTL="" MODEL="SAMSUNG MZVLB512HAJQ-00000" VENDOR="" SERIAL="S3W8NX0N811178" SIZE="512110190592" WWN="eui.0025388801b4dff2" ROTA="0"`}) + svc.scope.HetznerBareMetalHost.Spec.RootDeviceHints = &infrav1.RootDeviceHints{ + WWN: "eui.0025388801b4dff2", + } + + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionContinue{})) + Expect(sshMock.AssertCalled(GinkgoT(), "StartImageURLCommand", mock.Anything, "image-url-command", host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"})).To(BeTrue()) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(ContainSubstring(`baremetal-image-url-command started`)) + }) + + It("records failure when StartImageURLCommand returns non-zero exit", func() { + host := newBaseHost() + host.Spec.Status.UserData = &corev1.SecretReference{Name: "bootstrap-secret", Namespace: host.Namespace} + + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateNotStarted, "", nil) + + svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + sshMock.On("StartImageURLCommand", mock.Anything, "image-url-command", host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(7, "boom", nil) + + secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} + Expect(svc.scope.Client.Create(ctx, 
secret)).To(Succeed()) + + sshMock.On("GetHardwareDetailsStorage").Return(sshclient.Output{StdOut: `NAME="nvme1n1" TYPE="disk" HCTL="" MODEL="SAMSUNG MZVLB512HAJQ-00000" VENDOR="" SERIAL="S3W8NX0N811178" SIZE="512110190592" WWN="eui.0025388801b4dff2" ROTA="0"`}) + svc.scope.HetznerBareMetalHost.Spec.RootDeviceHints = &infrav1.RootDeviceHints{ + WWN: "eui.0025388801b4dff2", + } + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionFailed{})) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(ContainSubstring("StartImageURLCommand failed with non-zero exit status. Deleting machine")) + }) + + It("times out after 7 minutes", func() { + host := newBaseHost() + sevenPlus := metav1.NewTime(time.Now().Add(-8 * time.Minute)) + host.Spec.Status.LastUpdated = &sevenPlus + + sshMock := &sshmock.Client{} + sshMock.On("GetHostName").Return(sshclient.Output{StdOut: "rescue"}) + sshMock.On("StateOfImageURLCommand").Return(sshclient.ImageURLCommandStateRunning, "", nil) + + svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) + + res := svc.actionImageInstalling(ctx) + Expect(res).To(BeAssignableToTypeOf(actionFailed{})) + c := conditions.Get(host, infrav1.ProvisionSucceededCondition) + Expect(c.Message).To(ContainSubstring("ImageURLCommand timed out")) + }) +}) + var _ = Describe("test validateRootDeviceWwnsAreSubsetOfExistingWwns", func() { It("should return error when storageDevices is empty", func() { rootDeviceHints := &infrav1.RootDeviceHints{WWN: "wwn1"} diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 20541a024..ec51cd39f 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -221,3 +221,21 @@ func GetDefaultLogger(logLevel string) logr.Logger { return zapr.NewLogger(zapLog) } + +// IsLocalCacheUpToDate compares two ResourceVersions, and return true if the local cache is +// up-to-date or ahead. 
Related: https://github.com/kubernetes-sigs/controller-runtime/issues/3320 +func IsLocalCacheUpToDate(rvLocalCache, rvAPIServer string) bool { + if len(rvLocalCache) < len(rvAPIServer) { + // RV of cache is behind. + return false + } + if len(rvLocalCache) > len(rvAPIServer) { + // RV of cache has changed like from "999" to "1000" + return true + } + if rvLocalCache >= rvAPIServer { + // RV of cache is equal, or ahead. + return true + } + return false +} diff --git a/pkg/utils/utils_test.go b/pkg/utils/utils_test.go index 742debe77..d496edcc1 100644 --- a/pkg/utils/utils_test.go +++ b/pkg/utils/utils_test.go @@ -17,8 +17,11 @@ limitations under the License. package utils_test import ( + "testing" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -321,3 +324,20 @@ var _ = Describe("Test FindOwnerRefFromList", func() { }), ) }) + +func Test_IsLocalCacheUpToDate(t *testing.T) { + // true: localCache is up-to-date + require.True(t, utils.IsLocalCacheUpToDate("123", "123")) + + // true: localCache is ahead apiserver + require.True(t, utils.IsLocalCacheUpToDate("124", "123")) + + // true: localCache is ahead apiserver + require.True(t, utils.IsLocalCacheUpToDate("1000", "999")) + + // false: localCache is behind apiserver. + require.False(t, utils.IsLocalCacheUpToDate("123", "124")) + + // false: localCache is behind apiserver. 
+ require.False(t, utils.IsLocalCacheUpToDate("999", "1000")) +} diff --git a/templates/cluster-templates/bases/hetznerbaremetal-mt-control-plane-ubuntu.yaml b/templates/cluster-templates/bases/hetznerbaremetal-mt-control-plane-ubuntu.yaml index b75416ee6..4650a3781 100644 --- a/templates/cluster-templates/bases/hetznerbaremetal-mt-control-plane-ubuntu.yaml +++ b/templates/cluster-templates/bases/hetznerbaremetal-mt-control-plane-ubuntu.yaml @@ -34,7 +34,6 @@ spec: cloud-init clean --logs sshSpec: portAfterInstallImage: 22 - portAfterCloudInit: 22 secretRef: name: robot-ssh key: diff --git a/templates/cluster-templates/bases/hetznerbaremetal-mt-md-1-ubuntu.yaml b/templates/cluster-templates/bases/hetznerbaremetal-mt-md-1-ubuntu.yaml index e01e6890b..624e2e5ba 100644 --- a/templates/cluster-templates/bases/hetznerbaremetal-mt-md-1-ubuntu.yaml +++ b/templates/cluster-templates/bases/hetznerbaremetal-mt-md-1-ubuntu.yaml @@ -34,7 +34,6 @@ spec: cloud-init clean --logs sshSpec: portAfterInstallImage: 22 - portAfterCloudInit: 22 secretRef: name: robot-ssh key: diff --git a/templates/cluster-templates/cluster-class.yaml b/templates/cluster-templates/cluster-class.yaml index ffa3e7af6..3373ae246 100644 --- a/templates/cluster-templates/cluster-class.yaml +++ b/templates/cluster-templates/cluster-class.yaml @@ -698,7 +698,6 @@ spec: apt-get install -y cloud-init apparmor apparmor-utils cloud-init clean --logs sshSpec: - portAfterCloudInit: 22 portAfterInstallImage: 22 secretRef: key: