From 26f35cf3e8fe5448eba28fe478c89df8ead7184a Mon Sep 17 00:00:00 2001 From: Richard Vanderpool <49568690+rvanderp3@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:03:03 -0400 Subject: [PATCH] Allocate dedicated host when a dedicated host doesn't exist --- api/v1beta1/awscluster_conversion.go | 3 + api/v1beta1/awsmachine_conversion.go | 11 + api/v1beta1/conversion.go | 5 + api/v1beta1/zz_generated.conversion.go | 11 +- api/v1beta2/awsmachine_types.go | 58 ++- api/v1beta2/awsmachine_webhook.go | 17 +- api/v1beta2/awsmachine_webhook_test.go | 49 ++- api/v1beta2/awsmachinetemplate_webhook.go | 17 + .../awsmachinetemplate_webhook_test.go | 20 + api/v1beta2/conditions_consts.go | 13 + api/v1beta2/types.go | 32 ++ api/v1beta2/zz_generated.deepcopy.go | 98 +++++ ...ster.x-k8s.io_awsmanagedcontrolplanes.yaml | 22 ++ ...tructure.cluster.x-k8s.io_awsclusters.yaml | 11 + ...tructure.cluster.x-k8s.io_awsmachines.yaml | 62 +++- ....cluster.x-k8s.io_awsmachinetemplates.yaml | 34 +- controllers/awsmachine_controller.go | 170 +++++++++ .../machine-with-dynamic-dedicated-host.yaml | 47 +++ go.mod | 6 +- go.sum | 3 +- pkg/cloud/services/common/common.go | 3 + pkg/cloud/services/ec2/dedicatedhosts.go | 258 +++++++++++++ pkg/cloud/services/ec2/dedicatedhosts_test.go | 344 ++++++++++++++++++ pkg/cloud/services/ec2/instances.go | 106 +++++- pkg/cloud/services/ec2/service.go | 4 + pkg/cloud/services/interfaces.go | 5 + .../mock_services/ec2_interface_mock.go | 44 +++ test/mocks/aws_ec2api_mock.go | 60 +++ 28 files changed, 1473 insertions(+), 40 deletions(-) create mode 100644 examples/machine-with-dynamic-dedicated-host.yaml create mode 100644 pkg/cloud/services/ec2/dedicatedhosts.go create mode 100644 pkg/cloud/services/ec2/dedicatedhosts_test.go diff --git a/api/v1beta1/awscluster_conversion.go b/api/v1beta1/awscluster_conversion.go index a201fd6935..805d60856e 100644 --- a/api/v1beta1/awscluster_conversion.go +++ b/api/v1beta1/awscluster_conversion.go @@ -67,6 +67,9 @@ func (src *AWSCluster) ConvertTo(dstRaw conversion.Hub) error { dst.Status.Bastion.HostID = restored.Status.Bastion.HostID dst.Status.Bastion.CapacityReservationPreference = restored.Status.Bastion.CapacityReservationPreference dst.Status.Bastion.CPUOptions = restored.Status.Bastion.CPUOptions + if restored.Status.Bastion.DynamicHostAllocation != nil { + dst.Status.Bastion.DynamicHostAllocation = restored.Status.Bastion.DynamicHostAllocation + } } dst.Spec.Partition = restored.Spec.Partition diff --git a/api/v1beta1/awsmachine_conversion.go b/api/v1beta1/awsmachine_conversion.go index e809b649b2..e68f0fb60e 100644 --- a/api/v1beta1/awsmachine_conversion.go +++ b/api/v1beta1/awsmachine_conversion.go @@ -49,6 +49,9 @@ func (src *AWSMachine) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.CapacityReservationPreference = restored.Spec.CapacityReservationPreference dst.Spec.NetworkInterfaceType = restored.Spec.NetworkInterfaceType dst.Spec.CPUOptions = restored.Spec.CPUOptions + if restored.Spec.DynamicHostAllocation != nil { + dst.Spec.DynamicHostAllocation = restored.Spec.DynamicHostAllocation + } if restored.Spec.ElasticIPPool != nil { if dst.Spec.ElasticIPPool == nil { dst.Spec.ElasticIPPool = &infrav1.ElasticIPPool{} @@ -61,6 +64,11 @@ func (src *AWSMachine) ConvertTo(dstRaw conversion.Hub) error { } } + dst.Status.DedicatedHost = restored.Status.DedicatedHost + dst.Status.HostReleaseAttempts = restored.Status.HostReleaseAttempts + dst.Status.LastHostReleaseAttempt = restored.Status.LastHostReleaseAttempt + dst.Status.HostReleaseFailedReason = restored.Status.HostReleaseFailedReason + return nil } @@ -117,6 +125,9 @@ func (r *AWSMachineTemplate) ConvertTo(dstRaw conversion.Hub) error { dst.Spec.Template.Spec.CapacityReservationPreference = restored.Spec.Template.Spec.CapacityReservationPreference dst.Spec.Template.Spec.NetworkInterfaceType = restored.Spec.Template.Spec.NetworkInterfaceType dst.Spec.Template.Spec.CPUOptions = restored.Spec.Template.Spec.CPUOptions + if restored.Spec.Template.Spec.DynamicHostAllocation != nil { + dst.Spec.Template.Spec.DynamicHostAllocation = restored.Spec.Template.Spec.DynamicHostAllocation + } if restored.Spec.Template.Spec.ElasticIPPool != nil { if dst.Spec.Template.Spec.ElasticIPPool == nil { dst.Spec.Template.Spec.ElasticIPPool = &infrav1.ElasticIPPool{} diff --git a/api/v1beta1/conversion.go b/api/v1beta1/conversion.go index 6247cfeab1..c3000ee97c 100644 --- a/api/v1beta1/conversion.go +++ b/api/v1beta1/conversion.go @@ -103,3 +103,8 @@ func Convert_v1beta2_S3Bucket_To_v1beta1_S3Bucket(in *v1beta2.S3Bucket, out *S3B func Convert_v1beta2_Ignition_To_v1beta1_Ignition(in *v1beta2.Ignition, out *Ignition, s conversion.Scope) error { return autoConvert_v1beta2_Ignition_To_v1beta1_Ignition(in, out, s) } + +func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AWSMachineStatus, out *AWSMachineStatus, s conversion.Scope) error { + // Note: DedicatedHostID is not present in v1beta1, so it will be dropped during conversion + return autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in, out, s) +} diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index 9c7a33e9fb..fab1e050f4 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -1452,6 +1452,7 @@ func autoConvert_v1beta2_AWSMachineSpec_To_v1beta1_AWSMachineSpec(in *v1beta2.AW // WARNING: in.MarketType requires manual conversion: does not exist in peer-type // WARNING: in.HostID requires manual conversion: does not exist in peer-type // WARNING: in.HostAffinity requires manual conversion: does not exist in peer-type + // WARNING: in.DynamicHostAllocation requires manual conversion: does not exist in peer-type // WARNING: in.CapacityReservationPreference requires manual conversion: does not exist in peer-type return nil } @@ -1480,14 +1481,13 @@ func autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta out.FailureReason = (*string)(unsafe.Pointer(in.FailureReason)) out.FailureMessage = (*string)(unsafe.Pointer(in.FailureMessage)) out.Conditions = *(*apiv1beta1.Conditions)(unsafe.Pointer(&in.Conditions)) + // WARNING: in.DedicatedHost requires manual conversion: does not exist in peer-type + // WARNING: in.HostReleaseAttempts requires manual conversion: does not exist in peer-type + // WARNING: in.LastHostReleaseAttempt requires manual conversion: does not exist in peer-type + // WARNING: in.HostReleaseFailedReason requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus is an autogenerated conversion function. -func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AWSMachineStatus, out *AWSMachineStatus, s conversion.Scope) error { - return autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in, out, s) -} - func autoConvert_v1beta1_AWSMachineTemplate_To_v1beta2_AWSMachineTemplate(in *AWSMachineTemplate, out *v1beta2.AWSMachineTemplate, s conversion.Scope) error { out.TypeMeta = in.TypeMeta out.ObjectMeta = in.ObjectMeta @@ -2063,6 +2063,7 @@ func autoConvert_v1beta2_Instance_To_v1beta1_Instance(in *v1beta2.Instance, out // WARNING: in.MarketType requires manual conversion: does not exist in peer-type // WARNING: in.HostAffinity requires manual conversion: does not exist in peer-type // WARNING: in.HostID requires manual conversion: does not exist in peer-type + // WARNING: in.DynamicHostAllocation requires manual conversion: does not exist in peer-type // WARNING: in.CapacityReservationPreference requires manual conversion: does not exist in peer-type // WARNING: in.CPUOptions requires manual conversion: does not exist in peer-type return nil diff --git a/api/v1beta2/awsmachine_types.go b/api/v1beta2/awsmachine_types.go index 7031bdbaae..bfa0f5fd9f 100644 --- a/api/v1beta2/awsmachine_types.go +++ b/api/v1beta2/awsmachine_types.go @@ -218,6 +218,10 @@ type AWSMachineSpec struct { PlacementGroupPartition int64 `json:"placementGroupPartition,omitempty"` // Tenancy indicates if instance should run on shared or single-tenant hardware. + // When Tenancy=host, AWS will attempt to find a suitable host from: + // - Preexisting allocated hosts that have auto-placement enabled + // - A specific host ID, if configured + // - Allocating a new dedicated host if DynamicHostAllocation is configured // +optional // +kubebuilder:validation:Enum:=default;dedicated;host Tenancy string `json:"tenancy,omitempty"` @@ -240,17 +244,28 @@ type AWSMachineSpec struct { MarketType MarketType `json:"marketType,omitempty"` // HostID specifies the Dedicated Host on which the instance must be started. + // This field is mutually exclusive with DynamicHostAllocation. + // +kubebuilder:validation:Pattern=`^h-[0-9a-f]{17}$` + // +kubebuilder:validation:MaxLength=19 // +optional HostID *string `json:"hostID,omitempty"` // HostAffinity specifies the dedicated host affinity setting for the instance. - // When hostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. - // When hostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. + // When HostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. + // When HostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. // When HostAffinity is defined, HostID is required. // +optional // +kubebuilder:validation:Enum:=default;host + // +kubebuilder:default=host HostAffinity *string `json:"hostAffinity,omitempty"` + // DynamicHostAllocation enables automatic allocation of a single dedicated host. + // This field is mutually exclusive with HostID and always allocates exactly one host. + // Cost effectiveness of allocating a single instance on a dedicated host may vary + // depending on the instance type and the region. + // +optional + DynamicHostAllocation *DynamicHostAllocationSpec `json:"dynamicHostAllocation,omitempty"` + // CapacityReservationPreference specifies the preference for use of Capacity Reservations by the instance. Valid values include: // "Open": The instance may make use of open Capacity Reservations that match its AZ and InstanceType // "None": The instance may not make use of any Capacity Reservations. This is to conserve open reservations for desired workloads @@ -260,6 +275,14 @@ type AWSMachineSpec struct { CapacityReservationPreference CapacityReservationPreference `json:"capacityReservationPreference,omitempty"` } +// DynamicHostAllocationSpec defines the configuration for dynamic dedicated host allocation. +// This specification always allocates exactly one dedicated host per machine. +type DynamicHostAllocationSpec struct { + // Tags to apply to the allocated dedicated host. + // +optional + Tags map[string]string `json:"tags,omitempty"` +} + // CloudInit defines options related to the bootstrapping systems where // CloudInit is used. type CloudInit struct { @@ -438,6 +461,37 @@ type AWSMachineStatus struct { // Conditions defines current service state of the AWSMachine. // +optional Conditions clusterv1.Conditions `json:"conditions,omitempty"` + + // DedicatedHost tracks the dynamically allocated dedicated host. + // This field is populated when DynamicHostAllocation is used. + // +optional + DedicatedHost *DedicatedHostStatus `json:"dedicatedHost,omitempty"` + + // HostReleaseAttempts tracks the number of attempts to release the dedicated host. + // +optional + HostReleaseAttempts *int32 `json:"hostReleaseAttempts,omitempty"` + + // LastHostReleaseAttempt tracks the timestamp of the last attempt to release the dedicated host. + // +optional + LastHostReleaseAttempt *metav1.Time `json:"lastHostReleaseAttempt,omitempty"` + + // HostReleaseFailedReason tracks the reason for the last host release failure. + // +optional + HostReleaseFailedReason *string `json:"hostReleaseFailedReason,omitempty"` +} + +// DedicatedHostStatus defines the observed state of a dynamically allocated dedicated host +// associated with an AWSMachine. This struct is used to track the ID of the dedicated host +// and any failure messages encountered during host release operations. +type DedicatedHostStatus struct { + // ID tracks the dynamically allocated dedicated host ID. + // This field is populated when DynamicHostAllocation is used. + // +optional + ID *string `json:"id,omitempty"` + + // ReleaseFailureMessage tracks the last failure message for the release host attempt. + // +optional + ReleaseFailureMessage *string `json:"releaseFailureMessage,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1beta2/awsmachine_webhook.go b/api/v1beta2/awsmachine_webhook.go index 3cb24f5cbe..9c271c6939 100644 --- a/api/v1beta2/awsmachine_webhook.go +++ b/api/v1beta2/awsmachine_webhook.go @@ -75,11 +75,11 @@ func (*awsMachineWebhook) ValidateCreate(_ context.Context, obj runtime.Object) allErrs = append(allErrs, r.validateNonRootVolumes()...) allErrs = append(allErrs, r.validateSSHKeyName()...) allErrs = append(allErrs, r.validateAdditionalSecurityGroups()...) - allErrs = append(allErrs, r.validateHostAffinity()...) allErrs = append(allErrs, r.Spec.AdditionalTags.Validate()...) allErrs = append(allErrs, r.validateNetworkElasticIPPool()...) allErrs = append(allErrs, r.validateInstanceMarketType()...) allErrs = append(allErrs, r.validateCapacityReservation()...) + allErrs = append(allErrs, r.validateHostAllocation()...) return nil, aggregateObjErrors(r.GroupVersionKind().GroupKind(), r.Name, allErrs) } @@ -109,7 +109,7 @@ func (*awsMachineWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj run allErrs = append(allErrs, r.validateCloudInitSecret()...) allErrs = append(allErrs, r.validateAdditionalSecurityGroups()...) allErrs = append(allErrs, r.Spec.AdditionalTags.Validate()...) - allErrs = append(allErrs, r.validateHostAffinity()...) + allErrs = append(allErrs, r.validateHostAllocation()...) newAWSMachineSpec := newAWSMachine["spec"].(map[string]interface{}) oldAWSMachineSpec := oldAWSMachine["spec"].(map[string]interface{}) @@ -474,14 +474,17 @@ func (r *AWSMachine) validateAdditionalSecurityGroups() field.ErrorList { return allErrs } -func (r *AWSMachine) validateHostAffinity() field.ErrorList { +func (r *AWSMachine) validateHostAllocation() field.ErrorList { var allErrs field.ErrorList - if r.Spec.HostAffinity != nil { - if r.Spec.HostID == nil || len(*r.Spec.HostID) == 0 { - allErrs = append(allErrs, field.Required(field.NewPath("spec.hostID"), "hostID must be set when hostAffinity is configured")) - } + // Check if both hostID and dynamicHostAllocation are specified + hasHostID := r.Spec.HostID != nil && len(*r.Spec.HostID) > 0 + hasDynamicHostAllocation := r.Spec.DynamicHostAllocation != nil + + if hasHostID && hasDynamicHostAllocation { + allErrs = append(allErrs, field.Forbidden(field.NewPath("spec.hostID"), "hostID and dynamicHostAllocation are mutually exclusive"), field.Forbidden(field.NewPath("spec.dynamicHostAllocation"), "hostID and dynamicHostAllocation are mutually exclusive")) } + return allErrs } diff --git a/api/v1beta2/awsmachine_webhook_test.go b/api/v1beta2/awsmachine_webhook_test.go index 66c0919ecb..d233cde8d5 100644 --- a/api/v1beta2/awsmachine_webhook_test.go +++ b/api/v1beta2/awsmachine_webhook_test.go @@ -489,16 +489,6 @@ func TestAWSMachineCreate(t *testing.T) { }, wantErr: true, }, - { - name: "configure host affinity without Host ID", - machine: &AWSMachine{ - Spec: AWSMachineSpec{ - InstanceType: "test", - HostAffinity: ptr.To("default"), - }, - }, - wantErr: true, - }, { name: "create with valid BYOIPv4", machine: &AWSMachine{ @@ -567,6 +557,45 @@ func TestAWSMachineCreate(t *testing.T) { }, wantErr: true, }, + { + name: "hostID and dynamicHostAllocation are mutually exclusive", + machine: &AWSMachine{ + Spec: AWSMachineSpec{ + InstanceType: "test", + HostID: aws.String("h-1234567890abcdef0"), + DynamicHostAllocation: &DynamicHostAllocationSpec{ + Tags: map[string]string{ + "Environment": "test", + }, + }, + }, + }, + wantErr: true, + }, + { + name: "hostID alone is valid", + machine: &AWSMachine{ + Spec: AWSMachineSpec{ + InstanceType: "test", + HostID: aws.String("h-1234567890abcdef0"), + }, + }, + wantErr: false, + }, + { + name: "dynamicHostAllocation alone is valid", + machine: &AWSMachine{ + Spec: AWSMachineSpec{ + InstanceType: "test", + DynamicHostAllocation: &DynamicHostAllocationSpec{ + Tags: map[string]string{ + "Environment": "test", + }, + }, + }, + }, + wantErr: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/api/v1beta2/awsmachinetemplate_webhook.go b/api/v1beta2/awsmachinetemplate_webhook.go index 65e4e0ca32..aedb28d38f 100644 --- a/api/v1beta2/awsmachinetemplate_webhook.go +++ b/api/v1beta2/awsmachinetemplate_webhook.go @@ -172,6 +172,22 @@ func (r *AWSMachineTemplate) validateIgnitionAndCloudInit() field.ErrorList { return allErrs } +func (r *AWSMachineTemplate) validateHostAllocation() field.ErrorList { + var allErrs field.ErrorList + + spec := r.Spec.Template.Spec + + // Check if both hostID and dynamicHostAllocation are specified + hasHostID := spec.HostID != nil && len(*spec.HostID) > 0 + hasDynamicHostAllocation := spec.DynamicHostAllocation != nil + + if hasHostID && hasDynamicHostAllocation { + allErrs = append(allErrs, field.Forbidden(field.NewPath("spec.template.spec.hostID"), "hostID and dynamicHostAllocation are mutually exclusive"), field.Forbidden(field.NewPath("spec.template.spec.dynamicHostAllocation"), "hostID and dynamicHostAllocation are mutually exclusive")) + } + + return allErrs +} + func (r *AWSMachineTemplate) validateSSHKeyName() field.ErrorList { return validateSSHKeyName(r.Spec.Template.Spec.SSHKeyName) } @@ -205,6 +221,7 @@ func (r *AWSMachineTemplateWebhook) ValidateCreate(_ context.Context, raw runtim allErrs = append(allErrs, obj.validateSSHKeyName()...) allErrs = append(allErrs, obj.validateAdditionalSecurityGroups()...) allErrs = append(allErrs, obj.Spec.Template.Spec.AdditionalTags.Validate()...) + allErrs = append(allErrs, obj.validateHostAllocation()...) return nil, aggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) } diff --git a/api/v1beta2/awsmachinetemplate_webhook_test.go b/api/v1beta2/awsmachinetemplate_webhook_test.go index ce355d1e4b..1aefb0d260 100644 --- a/api/v1beta2/awsmachinetemplate_webhook_test.go +++ b/api/v1beta2/awsmachinetemplate_webhook_test.go @@ -80,6 +80,26 @@ func TestAWSMachineTemplateValidateCreate(t *testing.T) { }, wantError: false, }, + { + name: "hostID and dynamicHostAllocation are mutually exclusive", + inputTemplate: &AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{}, + Spec: AWSMachineTemplateSpec{ + Template: AWSMachineTemplateResource{ + Spec: AWSMachineSpec{ + InstanceType: "test", + HostID: aws.String("h-1234567890abcdef0"), + DynamicHostAllocation: &DynamicHostAllocationSpec{ + Tags: map[string]string{ + "Environment": "test", + }, + }, + }, + }, + }, + }, + wantError: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/api/v1beta2/conditions_consts.go b/api/v1beta2/conditions_consts.go index 604ef8e1d5..98bd4c2b5b 100644 --- a/api/v1beta2/conditions_consts.go +++ b/api/v1beta2/conditions_consts.go @@ -146,6 +146,10 @@ const ( // InstanceReadyCondition reports on current status of the EC2 instance. Ready indicates the instance is in a Running state. InstanceReadyCondition clusterv1.ConditionType = "InstanceReady" + // DedicatedHostReleaseCondition reports on the status of dedicated host release operations. + // This condition tracks whether the dedicated host has been successfully released or if there are failures. + DedicatedHostReleaseCondition clusterv1.ConditionType = "DedicatedHostRelease" + // InstanceNotFoundReason used when the instance couldn't be retrieved. InstanceNotFoundReason = "InstanceNotFound" // InstanceTerminatedReason instance is in a terminated state. @@ -191,4 +195,13 @@ const ( // S3BucketFailedReason is used when any errors occur during reconciliation of an S3 bucket. S3BucketFailedReason = "S3BucketCreationFailed" + + // DedicatedHostReleaseSucceededReason used when the dedicated host is successfully released. + DedicatedHostReleaseSucceededReason = "DedicatedHostReleaseSucceeded" + + // DedicatedHostReleaseFailedReason used when the dedicated host release fails. + DedicatedHostReleaseFailedReason = "DedicatedHostReleaseFailed" + + // DedicatedHostReleaseRetryingReason used when the dedicated host release is being retried. + DedicatedHostReleaseRetryingReason = "DedicatedHostReleaseRetrying" ) diff --git a/api/v1beta2/types.go b/api/v1beta2/types.go index c268165c10..6f6702a011 100644 --- a/api/v1beta2/types.go +++ b/api/v1beta2/types.go @@ -286,6 +286,11 @@ type Instance struct { // +optional HostID *string `json:"hostID,omitempty"` + // DynamicHostAllocation enables automatic allocation of dedicated hosts. + // This field is mutually exclusive with HostID. + // +optional + DynamicHostAllocation *DynamicHostAllocationSpec `json:"dynamicHostAllocation,omitempty"` + // CapacityReservationPreference specifies the preference for use of Capacity Reservations by the instance. Valid values include: // "Open": The instance may make use of open Capacity Reservations that match its AZ and InstanceType // "None": The instance may not make use of any Capacity Reservations. This is to conserve open reservations for desired workloads @@ -316,6 +321,33 @@ const ( CapacityReservationPreferenceOpen CapacityReservationPreference = "Open" ) +// DedicatedHostInfo contains information about a dedicated host. +type DedicatedHostInfo struct { + // HostID is the ID of the dedicated host. + HostID string `json:"hostID"` + + // InstanceFamily is the instance family supported by the host. + InstanceFamily string `json:"instanceFamily"` + + // InstanceType is the instance type supported by the host. + InstanceType string `json:"instanceType"` + + // AvailabilityZone is the AZ where the host is located. + AvailabilityZone string `json:"availabilityZone"` + + // State is the current state of the dedicated host. + State string `json:"state"` + + // TotalCapacity is the total number of instances that can be launched on the host. + TotalCapacity int32 `json:"totalCapacity"` + + // AvailableCapacity is the number of instances that can still be launched on the host. + AvailableCapacity int32 `json:"availableCapacity"` + + // Tags associated with the dedicated host. + Tags map[string]string `json:"tags,omitempty"` +} + // MarketType describes the market type of an Instance // +kubebuilder:validation:Enum:=OnDemand;Spot;CapacityBlock type MarketType string diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 197cffba66..7fe3127e08 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -782,6 +782,11 @@ func (in *AWSMachineSpec) DeepCopyInto(out *AWSMachineSpec) { *out = new(string) **out = **in } + if in.DynamicHostAllocation != nil { + in, out := &in.DynamicHostAllocation, &out.DynamicHostAllocation + *out = new(DynamicHostAllocationSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineSpec. @@ -824,6 +829,25 @@ func (in *AWSMachineStatus) DeepCopyInto(out *AWSMachineStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DedicatedHost != nil { + in, out := &in.DedicatedHost, &out.DedicatedHost + *out = new(DedicatedHostStatus) + (*in).DeepCopyInto(*out) + } + if in.HostReleaseAttempts != nil { + in, out := &in.HostReleaseAttempts, &out.HostReleaseAttempts + *out = new(int32) + **out = **in + } + if in.LastHostReleaseAttempt != nil { + in, out := &in.LastHostReleaseAttempt, &out.LastHostReleaseAttempt + *out = (*in).DeepCopy() + } + if in.HostReleaseFailedReason != nil { + in, out := &in.HostReleaseFailedReason, &out.HostReleaseFailedReason + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineStatus. @@ -1413,6 +1437,75 @@ func (in *CloudInit) DeepCopy() *CloudInit { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DedicatedHostInfo) DeepCopyInto(out *DedicatedHostInfo) { + *out = *in + if in.Tags != nil { + in, out := &in.Tags, &out.Tags + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DedicatedHostInfo. +func (in *DedicatedHostInfo) DeepCopy() *DedicatedHostInfo { + if in == nil { + return nil + } + out := new(DedicatedHostInfo) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DedicatedHostStatus) DeepCopyInto(out *DedicatedHostStatus) { + *out = *in + if in.ID != nil { + in, out := &in.ID, &out.ID + *out = new(string) + **out = **in + } + if in.ReleaseFailureMessage != nil { + in, out := &in.ReleaseFailureMessage, &out.ReleaseFailureMessage + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DedicatedHostStatus. +func (in *DedicatedHostStatus) DeepCopy() *DedicatedHostStatus { + if in == nil { + return nil + } + out := new(DedicatedHostStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamicHostAllocationSpec) DeepCopyInto(out *DynamicHostAllocationSpec) { + *out = *in + if in.Tags != nil { + in, out := &in.Tags, &out.Tags + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamicHostAllocationSpec. +func (in *DynamicHostAllocationSpec) DeepCopy() *DynamicHostAllocationSpec { + if in == nil { + return nil + } + out := new(DynamicHostAllocationSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ElasticIPPool) DeepCopyInto(out *ElasticIPPool) { *out = *in @@ -1736,6 +1829,11 @@ func (in *Instance) DeepCopyInto(out *Instance) { *out = new(string) **out = **in } + if in.DynamicHostAllocation != nil { + in, out := &in.DynamicHostAllocation, &out.DynamicHostAllocation + *out = new(DynamicHostAllocationSpec) + (*in).DeepCopyInto(*out) + } out.CPUOptions = in.CPUOptions } diff --git a/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml b/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml index 937de1cc32..c9469d1dc5 100644 --- a/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml +++ b/config/crd/bases/controlplane.cluster.x-k8s.io_awsmanagedcontrolplanes.yaml @@ -1262,6 +1262,17 @@ spec: - AMDEncryptedVirtualizationNestedPaging type: string type: object + dynamicHostAllocation: + description: |- + DynamicHostAllocation enables automatic allocation of dedicated hosts. + This field is mutually exclusive with HostID. + properties: + tags: + additionalProperties: + type: string + description: Tags to apply to the allocated dedicated host. + type: object + type: object ebsOptimized: description: Indicates whether the instance is optimized for Amazon EBS I/O. @@ -3506,6 +3517,17 @@ spec: - AMDEncryptedVirtualizationNestedPaging type: string type: object + dynamicHostAllocation: + description: |- + DynamicHostAllocation enables automatic allocation of dedicated hosts. + This field is mutually exclusive with HostID. + properties: + tags: + additionalProperties: + type: string + description: Tags to apply to the allocated dedicated host. + type: object + type: object ebsOptimized: description: Indicates whether the instance is optimized for Amazon EBS I/O. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml index 83416aa9ae..869454a917 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsclusters.yaml @@ -2240,6 +2240,17 @@ spec: - AMDEncryptedVirtualizationNestedPaging type: string type: object + dynamicHostAllocation: + description: |- + DynamicHostAllocation enables automatic allocation of dedicated hosts. + This field is mutually exclusive with HostID. + properties: + tags: + additionalProperties: + type: string + description: Tags to apply to the allocated dedicated host. + type: object + type: object ebsOptimized: description: Indicates whether the instance is optimized for Amazon EBS I/O. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml index d7aa2cfef6..3fd9a319bd 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachines.yaml @@ -717,6 +717,19 @@ spec: - AMDEncryptedVirtualizationNestedPaging type: string type: object + dynamicHostAllocation: + description: |- + DynamicHostAllocation enables automatic allocation of a single dedicated host. + This field is mutually exclusive with HostID and always allocates exactly one host. + Cost effectiveness of allocating a single instance on a dedicated host may vary + depending on the instance type and the region. + properties: + tags: + additionalProperties: + type: string + description: Tags to apply to the allocated dedicated host. + type: object + type: object elasticIpPool: description: ElasticIPPool is the configuration to allocate Public IPv4 address (Elastic IP/EIP) from user-defined pool. @@ -747,18 +760,22 @@ spec: rule: self in ['none','amazon-pool'] type: object hostAffinity: + default: host description: |- HostAffinity specifies the dedicated host affinity setting for the instance. - When hostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. - When hostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. + When HostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. + When HostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. When HostAffinity is defined, HostID is required. enum: - default - host type: string hostID: - description: HostID specifies the Dedicated Host on which the instance - must be started. + description: |- + HostID specifies the Dedicated Host on which the instance must be started. + This field is mutually exclusive with DynamicHostAllocation. + maxLength: 19 + pattern: ^h-[0-9a-f]{17}$ type: string iamInstanceProfile: description: IAMInstanceProfile is a name of an IAM instance profile @@ -1163,8 +1180,12 @@ spec: type: string type: object tenancy: - description: Tenancy indicates if instance should run on shared or - single-tenant hardware. + description: |- + Tenancy indicates if instance should run on shared or single-tenant hardware. + When Tenancy=host, AWS will attempt to find a suitable host from: + - Preexisting allocated hosts that have auto-placement enabled + - A specific host ID, if configured + - Allocating a new dedicated host if DynamicHostAllocation is configured enum: - default - dedicated @@ -1267,6 +1288,21 @@ spec: - type type: object type: array + dedicatedHost: + description: |- + DedicatedHost tracks the dynamically allocated dedicated host. + This field is populated when DynamicHostAllocation is used. + properties: + id: + description: |- + ID tracks the dynamically allocated dedicated host ID. + This field is populated when DynamicHostAllocation is used. + type: string + releaseFailureMessage: + description: ReleaseFailureMessage tracks the last failure message + for the release host attempt. + type: string + type: object failureMessage: description: |- FailureMessage will be set in the event that there is a terminal problem @@ -1305,6 +1341,15 @@ spec: can be added as events to the Machine object and/or logged in the controller's output. type: string + hostReleaseAttempts: + description: HostReleaseAttempts tracks the number of attempts to + release the dedicated host. + format: int32 + type: integer + hostReleaseFailedReason: + description: HostReleaseFailedReason tracks the reason for the last + host release failure. + type: string instanceState: description: InstanceState is the state of the AWS instance for this machine. @@ -1314,6 +1359,11 @@ spec: Interruptible reports that this machine is using spot instances and can therefore be interrupted by CAPI when it receives a notice that the spot instance is to be terminated by AWS. This will be set to true when SpotMarketOptions is not nil (i.e. this machine is using a spot instance). type: boolean + lastHostReleaseAttempt: + description: LastHostReleaseAttempt tracks the timestamp of the last + attempt to release the dedicated host. + format: date-time + type: string ready: description: Ready is true when the provider resource is ready. type: boolean diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index 5e3f55519d..1d3f40efed 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -636,6 +636,20 @@ spec: - AMDEncryptedVirtualizationNestedPaging type: string type: object + dynamicHostAllocation: + description: |- + DynamicHostAllocation enables automatic allocation of a single dedicated host. + This field is mutually exclusive with HostID and always allocates exactly one host. + Cost effectiveness of allocating a single instance on a dedicated host may vary + depending on the instance type and the region. + properties: + tags: + additionalProperties: + type: string + description: Tags to apply to the allocated dedicated + host. + type: object + type: object elasticIpPool: description: ElasticIPPool is the configuration to allocate Public IPv4 address (Elastic IP/EIP) from user-defined pool. @@ -666,18 +680,22 @@ spec: rule: self in ['none','amazon-pool'] type: object hostAffinity: + default: host description: |- HostAffinity specifies the dedicated host affinity setting for the instance. - When hostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. - When hostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. + When HostAffinity is set to host, an instance started onto a specific host always restarts on the same host if stopped. + When HostAffinity is set to default, and you stop and restart the instance, it can be restarted on any available host. When HostAffinity is defined, HostID is required. enum: - default - host type: string hostID: - description: HostID specifies the Dedicated Host on which - the instance must be started. + description: |- + HostID specifies the Dedicated Host on which the instance must be started. + This field is mutually exclusive with DynamicHostAllocation. + maxLength: 19 + pattern: ^h-[0-9a-f]{17}$ type: string iamInstanceProfile: description: IAMInstanceProfile is a name of an IAM instance @@ -1089,8 +1107,12 @@ spec: type: string type: object tenancy: - description: Tenancy indicates if instance should run on shared - or single-tenant hardware. + description: |- + Tenancy indicates if instance should run on shared or single-tenant hardware. + When Tenancy=host, AWS will attempt to find a suitable host from: + - Preexisting allocated hosts that have auto-placement enabled + - A specific host ID, if configured + - Allocating a new dedicated host if DynamicHostAllocation is configured enum: - default - dedicated diff --git a/controllers/awsmachine_controller.go b/controllers/awsmachine_controller.go index 445bab678c..4bb6f81c2e 100644 --- a/controllers/awsmachine_controller.go +++ b/controllers/awsmachine_controller.go @@ -32,6 +32,7 @@ import ( "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" @@ -360,6 +361,84 @@ func (r *AWSMachineReconciler) reconcileDelete(ctx context.Context, machineScope return ctrl.Result{RequeueAfter: time.Minute}, nil case infrav1.InstanceStateTerminated: machineScope.Info("EC2 instance terminated successfully", "instance-id", instance.ID) + + // Handle dedicated host cleanup AFTER instance is confirmed terminated + if machineScope.AWSMachine.Status.DedicatedHost != nil && + machineScope.AWSMachine.Status.DedicatedHost.ID != nil && + machineScope.AWSMachine.Spec.DynamicHostAllocation != nil { + hostID := *machineScope.AWSMachine.Status.DedicatedHost.ID + + // Check if we should retry host release + shouldRetry, retryAfter := shouldRetryHostRelease(machineScope) + + if shouldRetry { + // Mark that we're retrying + conditions.MarkFalse(machineScope.AWSMachine, infrav1.DedicatedHostReleaseCondition, + infrav1.DedicatedHostReleaseRetryingReason, clusterv1.ConditionSeverityWarning, + "Retrying dedicated host release, attempt %d", getHostReleaseAttempts(machineScope)) + + // Update retry tracking + updateHostReleaseRetryTracking(machineScope) + + // Patch the object to persist retry tracking + if err := machineScope.PatchObject(); err != nil { + machineScope.Error(err, "failed to patch object with retry tracking") + return ctrl.Result{}, err + } + + machineScope.Info("Retrying dedicated host release", "hostID", hostID, "attempt", getHostReleaseAttempts(machineScope)) + return ctrl.Result{RequeueAfter: retryAfter}, nil + } + + // Attempt to release the dedicated host + machineScope.Info("Releasing dynamically allocated dedicated host", "hostID", hostID, "attempt", getHostReleaseAttempts(machineScope)) + if err := ec2Service.ReleaseDedicatedHost(ctx, hostID); err != nil { + // Host release failed, set up retry logic + machineScope.Error(err, "failed to release dedicated host", "hostID", hostID, "attempt", getHostReleaseAttempts(machineScope)) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "FailedReleaseHost", "Failed to release dedicated host %s: %v", hostID, err) + + // Update failure tracking + updateHostReleaseFailureTracking(machineScope, err.Error()) + + // Mark the condition as failed + conditions.MarkFalse(machineScope.AWSMachine, infrav1.DedicatedHostReleaseCondition, + infrav1.DedicatedHostReleaseFailedReason, clusterv1.ConditionSeverityWarning, + "Failed to release dedicated host: %v", err) + + // Patch the object to persist failure tracking + if err := machineScope.PatchObject(); err != nil { + machineScope.Error(err, "failed to patch object with failure tracking") + return ctrl.Result{}, err + } + + // Check if we've exceeded max retries + if hasExceededMaxHostReleaseRetries(machineScope) { + machineScope.Error(err, "exceeded maximum retry attempts for dedicated host release", "hostID", hostID, "maxAttempts", getMaxHostReleaseRetries()) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeWarning, "MaxRetriesExceeded", "Exceeded maximum retry attempts for dedicated host %s release", hostID) + // Continue with deletion even if host release fails permanently + } else { + // Return to trigger retry + return ctrl.Result{RequeueAfter: getInitialHostReleaseRetryDelay()}, nil + } + } else { + // Host release succeeded + machineScope.Info("Successfully released dedicated host", "hostID", hostID) + r.Recorder.Eventf(machineScope.AWSMachine, corev1.EventTypeNormal, "SuccessfulReleaseHost", "Released dedicated host %s", hostID) + + // Mark the condition as succeeded + conditions.MarkTrue(machineScope.AWSMachine, infrav1.DedicatedHostReleaseCondition) + + // Clear retry tracking since we succeeded + clearHostReleaseRetryTracking(machineScope) + + // Patch the object to persist success state + if err := machineScope.PatchObject(); err != nil { + machineScope.Error(err, "failed to patch object after successful host release") + return ctrl.Result{}, err + } + } + } + controllerutil.RemoveFinalizer(machineScope.AWSMachine, infrav1.MachineFinalizer) return ctrl.Result{}, nil default: @@ -1294,3 +1373,94 @@ func (r *AWSMachineReconciler) ensureInstanceMetadataOptions(ec2svc services.EC2 return ec2svc.ModifyInstanceMetadataOptions(instance.ID, machine.Spec.InstanceMetadataOptions) } + +// getMaxHostReleaseRetries returns the maximum number of retry attempts for dedicated host release. +func getMaxHostReleaseRetries() int32 { + return 5 +} + +// getInitialHostReleaseRetryDelay returns the initial delay before the first retry. +func getInitialHostReleaseRetryDelay() time.Duration { + return 30 * time.Second +} + +// getHostReleaseAttempts returns the current number of host release attempts. +func getHostReleaseAttempts(scope *scope.MachineScope) int32 { + if scope.AWSMachine.Status.HostReleaseAttempts == nil { + return 0 + } + return *scope.AWSMachine.Status.HostReleaseAttempts +} + +// shouldRetryHostRelease determines if we should retry host release based on retry tracking. +func shouldRetryHostRelease(scope *scope.MachineScope) (bool, time.Duration) { + attempts := getHostReleaseAttempts(scope) + + // If no attempts yet, don't retry + if attempts == 0 { + return false, 0 + } + + // Check if we've exceeded max retries + if attempts >= getMaxHostReleaseRetries() { + return false, 0 + } + + // Check if enough time has passed since last attempt + lastAttempt := scope.AWSMachine.Status.LastHostReleaseAttempt + if lastAttempt == nil { + return false, 0 + } + + // Calculate exponential backoff delay + baseDelay := getInitialHostReleaseRetryDelay() + multiplier := int64(1) + for i := int32(1); i < attempts; i++ { + multiplier *= 2 + } + backoffDelay := time.Duration(int64(baseDelay) * multiplier) + + // Cap the maximum delay at 5 minutes + if backoffDelay > 5*time.Minute { + backoffDelay = 5 * time.Minute + } + + // Check if enough time has passed + timeSinceLastAttempt := time.Since(lastAttempt.Time) + if timeSinceLastAttempt < backoffDelay { + remainingDelay := backoffDelay - timeSinceLastAttempt + return false, remainingDelay + } + + return true, backoffDelay +} + +// updateHostReleaseRetryTracking increments the retry attempt counter and updates the timestamp. +func updateHostReleaseRetryTracking(scope *scope.MachineScope) { + attempts := getHostReleaseAttempts(scope) + 1 + scope.AWSMachine.Status.HostReleaseAttempts = &attempts + + now := time.Now() + scope.AWSMachine.Status.LastHostReleaseAttempt = &metav1.Time{Time: now} +} + +// updateHostReleaseFailureTracking updates the failure reason and timestamp. +func updateHostReleaseFailureTracking(scope *scope.MachineScope, reason string) { + scope.AWSMachine.Status.HostReleaseFailedReason = &reason + + // Update the timestamp for the last attempt + now := time.Now() + scope.AWSMachine.Status.LastHostReleaseAttempt = &metav1.Time{Time: now} +} + +// clearHostReleaseRetryTracking resets all retry tracking fields after successful release. +func clearHostReleaseRetryTracking(scope *scope.MachineScope) { + scope.AWSMachine.Status.HostReleaseAttempts = nil + scope.AWSMachine.Status.LastHostReleaseAttempt = nil + scope.AWSMachine.Status.HostReleaseFailedReason = nil +} + +// hasExceededMaxHostReleaseRetries checks if we've exceeded the maximum retry attempts. +func hasExceededMaxHostReleaseRetries(scope *scope.MachineScope) bool { + return getHostReleaseAttempts(scope) >= getMaxHostReleaseRetries() +} diff --git a/examples/machine-with-dynamic-dedicated-host.yaml b/examples/machine-with-dynamic-dedicated-host.yaml new file mode 100644 index 0000000000..46dcf1b069 --- /dev/null +++ b/examples/machine-with-dynamic-dedicated-host.yaml @@ -0,0 +1,47 @@ +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachine +metadata: + name: test-machine-with-dynamic-host + namespace: default +spec: + instanceType: m5.large + ami: + id: ami-0abcdef1234567890 + iamInstanceProfile: nodes.cluster-api-provider-aws.sigs.k8s.io + + # Additional tags that will be applied to the dedicated host + # These tags can be overridden by dedicated host specific tags + additionalTags: + Environment: "test" + Owner: "platform-team" + CostCenter: "engineering" + + # Dynamic dedicated host allocation configuration + # This will allocate a single dedicated host automatically + dynamicHostAllocation: + # Tags to apply to the allocated dedicated host (optional) + # These tags take precedence over additionalTags above + tags: + Environment: "production" # This will override the "test" value from additionalTags + Application: "virtualization" + Purpose: "BYOL-Windows" + + # Standard instance configuration + subnet: + id: subnet-0a1b2c3d4e5f6g7h8 + + securityGroupOverrides: + - id: sg-0123456789abcdef0 + + userData: + name: test-userdata + namespace: default +--- +apiVersion: v1 +kind: Secret +metadata: + name: test-userdata + namespace: default +type: Opaque +data: + userData: IyEvYmluL2Jhc2gKZWNobyAiSGVsbG8gV29ybGQi # base64 encoded "#!/bin/bash\necho \"Hello World\"" \ No newline at end of file diff --git a/go.mod b/go.mod index 1ffc472259..aa0d42f7c1 100644 --- a/go.mod +++ b/go.mod @@ -48,6 +48,7 @@ require ( github.com/sirupsen/logrus v1.9.3 github.com/spf13/cobra v1.9.1 github.com/spf13/pflag v1.0.6 + github.com/stretchr/testify v1.10.0 github.com/zgalor/weberr v0.8.2 golang.org/x/crypto v0.36.0 golang.org/x/net v0.38.0 @@ -70,7 +71,10 @@ require ( sigs.k8s.io/yaml v1.4.0 ) -require github.com/aws/aws-sdk-go v1.55.7 // indirect +require ( + github.com/aws/aws-sdk-go v1.55.7 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect +) require ( al.essio.dev/pkg/shellescape v1.5.1 // indirect diff --git a/go.sum b/go.sum index 61860ee4e4..f84b32fc1c 100644 --- a/go.sum +++ b/go.sum @@ -495,8 +495,9 @@ github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AV github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= diff --git a/pkg/cloud/services/common/common.go b/pkg/cloud/services/common/common.go index 3561534647..76ffdb2333 100644 --- a/pkg/cloud/services/common/common.go +++ b/pkg/cloud/services/common/common.go @@ -26,6 +26,7 @@ import ( // EC2API defines the EC2 API interface. type EC2API interface { AllocateAddress(ctx context.Context, params *ec2.AllocateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AllocateAddressOutput, error) + AllocateHosts(ctx context.Context, params *ec2.AllocateHostsInput, optFns ...func(*ec2.Options)) (*ec2.AllocateHostsOutput, error) AssociateAddress(ctx context.Context, params *ec2.AssociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) AssociateRouteTable(ctx context.Context, params *ec2.AssociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.AssociateRouteTableOutput, error) AssociateVpcCidrBlock(ctx context.Context, params *ec2.AssociateVpcCidrBlockInput, optFns ...func(*ec2.Options)) (*ec2.AssociateVpcCidrBlockOutput, error) @@ -61,6 +62,7 @@ type EC2API interface { DescribeCarrierGateways(ctx context.Context, params *ec2.DescribeCarrierGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeCarrierGatewaysOutput, error) DescribeDhcpOptions(ctx context.Context, params *ec2.DescribeDhcpOptionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeDhcpOptionsOutput, error) DescribeEgressOnlyInternetGateways(ctx context.Context, params *ec2.DescribeEgressOnlyInternetGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeEgressOnlyInternetGatewaysOutput, error) + DescribeHosts(ctx context.Context, params *ec2.DescribeHostsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeHostsOutput, error) DescribeImages(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) DescribeInstances(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) DescribeInstanceTypes(context.Context, *ec2.DescribeInstanceTypesInput, ...func(*ec2.Options)) (*ec2.DescribeInstanceTypesOutput, error) @@ -87,6 +89,7 @@ type EC2API interface { ModifyVpcAttribute(ctx context.Context, params *ec2.ModifyVpcAttributeInput, optFns ...func(*ec2.Options)) (*ec2.ModifyVpcAttributeOutput, error) ModifyVpcEndpoint(ctx context.Context, params *ec2.ModifyVpcEndpointInput, optFns ...func(*ec2.Options)) (*ec2.ModifyVpcEndpointOutput, error) ReleaseAddress(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) + ReleaseHosts(ctx context.Context, params *ec2.ReleaseHostsInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseHostsOutput, error) ReplaceRoute(ctx context.Context, params *ec2.ReplaceRouteInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteOutput, error) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) RevokeSecurityGroupIngress(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) diff --git a/pkg/cloud/services/ec2/dedicatedhosts.go b/pkg/cloud/services/ec2/dedicatedhosts.go new file mode 100644 index 0000000000..73b06c90db --- /dev/null +++ b/pkg/cloud/services/ec2/dedicatedhosts.go @@ -0,0 +1,258 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ec2 + +import ( + "context" + "fmt" + "math" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" + "github.com/aws/aws-sdk-go-v2/service/ec2" + "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/pkg/errors" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/awserrors" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" +) + +// AllocateDedicatedHost allocates a single dedicated host based on the specification. +// This function always allocates exactly one dedicated host per call. +// The dedicated host will inherit additional tags defined in the AWSMachineTemplate. +func (s *Service) AllocateDedicatedHost(ctx context.Context, spec *infrav1.DynamicHostAllocationSpec, instanceType, availabilityZone string, scope *scope.MachineScope) (string, error) { + s.scope.Debug("Allocating single dedicated host", "instanceType", instanceType, "availabilityZone", availabilityZone) + input := &ec2.AllocateHostsInput{ + InstanceType: aws.String(instanceType), + AvailabilityZone: aws.String(availabilityZone), + Quantity: aws.Int32(1), + } + + // Build tags for the dedicated host + // Only include additionalTags from the machine and dedicated host specific tags + additionalTags := scope.AdditionalTags() + + // Start with additional tags from the machine (AWSMachineTemplate additionalTags) + dedicatedHostTags := make(map[string]string) + for key, value := range additionalTags { + dedicatedHostTags[key] = value + } + + // Merge in dedicated host specific tags from the spec + // Dedicated host specific tags take precedence over additional tags + for key, value := range spec.Tags { + dedicatedHostTags[key] = value + } + + // Add tags to the allocation request + if len(dedicatedHostTags) > 0 { + var tagSpecs []types.TagSpecification + var tags []types.Tag + for key, value := range dedicatedHostTags { + tags = append(tags, types.Tag{ + Key: aws.String(key), + Value: aws.String(value), + }) + } + tagSpecs = append(tagSpecs, types.TagSpecification{ + ResourceType: types.ResourceTypeDedicatedHost, + Tags: tags, + }) + input.TagSpecifications = tagSpecs + } + + s.scope.Info("Allocating dedicated host", "input", input, "machine", scope.Name()) + output, err := s.EC2Client.AllocateHosts(ctx, input) + if err != nil { + return "", errors.Wrap(err, fmt.Sprintf("failed to allocate dedicated host: %+v", input)) + } + + // Ensure we got exactly one host as expected + if len(output.HostIds) != 1 { + return "", errors.Errorf("expected one dedicated host, but got %d hosts", len(output.HostIds)) + } + + hostID := output.HostIds[0] + s.scope.Info("Successfully allocated single dedicated host", + "hostID", hostID, + "availabilityZone", availabilityZone, + "machine", scope.Name(), + "instanceType", instanceType) + record.Eventf(s.scope.InfraCluster(), "SuccessfulAllocateDedicatedHost", "Allocated dedicated host %s in %s for machine %s", hostID, availabilityZone, scope.Name()) + + return hostID, nil +} + +// ReleaseDedicatedHost releases a dedicated host with enhanced retry logic. +// This function uses AWS SDK v2's built-in retry mechanisms optimized for +// dedicated host operations, which are expensive resources requiring robust retry handling. +func (s *Service) ReleaseDedicatedHost(ctx context.Context, hostID string) error { + s.scope.Debug("Releasing dedicated host", "hostID", hostID) + + input := &ec2.ReleaseHostsInput{ + HostIds: []string{hostID}, + } + + // Create a client with enhanced retry configuration for dedicated host operations + clientWithRetry := s.createClientWithDedicatedHostRetryConfig() + + output, err := clientWithRetry.ReleaseHosts(ctx, input) + if err != nil { + errorCode := s.getErrorCode(err) + s.scope.Error(err, "Failed to release dedicated host", + "hostID", hostID, + "errorCode", errorCode, + "result", s.getReleaseHostsOutput(output)) + record.Warnf(s.scope.InfraCluster(), "FailedReleaseDedicatedHost", "Failed to release dedicated host %s: %v", hostID, err) + return errors.Wrap(err, "failed to release dedicated host") + } + + s.scope.Info("Successfully released dedicated host", + "hostID", hostID, + "result", s.getReleaseHostsOutput(output)) + record.Eventf(s.scope.InfraCluster(), "SuccessfulReleaseDedicatedHost", "Released dedicated host %s", hostID) + return nil +} + +// createClientWithDedicatedHostRetryConfig creates an EC2 client with enhanced retry configuration +// specifically optimized for dedicated host operations using RetryerV2 interface. +func (s *Service) createClientWithDedicatedHostRetryConfig() *ec2.Client { + // Get the base configuration from the service's session + cfg := s.scope.Session() + + // Create a custom RetryerV2 for dedicated host operations + // Using AWS SDK's built-in adaptive retry mode which implements RetryerV2 + dedicatedHostRetryer := retry.NewAdaptiveMode(func(o *retry.AdaptiveModeOptions) { + // More aggressive retry configuration for expensive dedicated host operations + o.StandardOptions = append(o.StandardOptions, func(so *retry.StandardOptions) { + so.MaxAttempts = 5 // Maximum retry attempts + so.MaxBackoff = 30 * time.Second // Maximum backoff time + so.Backoff = retry.NewExponentialJitterBackoff(time.Second) // 1 second initial delay with built-in jitter + }) + }) + + // Override the retry configuration in the config using RetryerV2 + // provides better context handling and granular control over retry attempts + cfg.Retryer = func() aws.Retryer { + return dedicatedHostRetryer // AdaptiveMode implements aws.RetryerV2 + } + + // Create a new client with the enhanced RetryerV2 configuration + // The RetryerV2 interface provides: + // - GetAttemptToken(context.Context) for context-aware retry decisions + // - Better integration with AWS SDK v2's context handling + // - More granular control over retry behavior + return ec2.NewFromConfig(cfg) +} + +// getErrorCode extracts the error code from an AWS error. +func (s *Service) getErrorCode(err error) string { + if smithyErr := awserrors.ParseSmithyError(err); smithyErr != nil { + return smithyErr.ErrorCode() + } + if code, ok := awserrors.Code(err); ok { + return code + } + return "Unknown" +} + +// DescribeDedicatedHost describes a specific dedicated host. +func (s *Service) DescribeDedicatedHost(ctx context.Context, hostID string) (*infrav1.DedicatedHostInfo, error) { + input := &ec2.DescribeHostsInput{ + HostIds: []string{hostID}, + } + + output, err := s.EC2Client.DescribeHosts(ctx, input) + if err != nil { + return nil, errors.Wrap(err, "failed to describe dedicated host") + } + + if len(output.Hosts) == 0 { + return nil, errors.Errorf("dedicated host %s not found", hostID) + } + + host := output.Hosts[0] + hostInfo := s.convertToHostInfo(host) + + return hostInfo, nil +} + +// convertToHostInfo converts an AWS Host to the DedicatedHostInfo struct. +func (s *Service) convertToHostInfo(host types.Host) *infrav1.DedicatedHostInfo { + hostInfo := &infrav1.DedicatedHostInfo{ + HostID: aws.ToString(host.HostId), + AvailabilityZone: aws.ToString(host.AvailabilityZone), + State: string(host.State), + Tags: make(map[string]string), + } + + // Parse properties from HostProperties + if host.HostProperties != nil { + if host.HostProperties.InstanceFamily != nil { + hostInfo.InstanceFamily = *host.HostProperties.InstanceFamily + } + if host.HostProperties.InstanceType != nil { + hostInfo.InstanceType = *host.HostProperties.InstanceType + } + if host.HostProperties.TotalVCpus != nil { + hostInfo.TotalCapacity = *host.HostProperties.TotalVCpus + } + } + + // Calculate available capacity from instances + instanceCount := len(host.Instances) + if instanceCount > math.MaxInt32 { + instanceCount = math.MaxInt32 + } + // bounds check ensures instanceCount <= math.MaxInt32, preventing integer overflow + usedCapacity := int32(instanceCount) //nolint:gosec + hostInfo.AvailableCapacity = hostInfo.TotalCapacity - usedCapacity + + // Convert tags + for _, tag := range host.Tags { + if tag.Key != nil && tag.Value != nil { + hostInfo.Tags[*tag.Key] = *tag.Value + } + } + + return hostInfo +} + +func (s *Service) getReleaseHostsOutput(output *ec2.ReleaseHostsOutput) string { + var errs []string + + if output.Successful != nil { + return strings.Join(output.Successful, ", ") + } else if output.Unsuccessful != nil { + for _, err := range output.Unsuccessful { + var errResource string + if err.Error != nil { + errResource = fmt.Sprintf("Resource ID: %s, Error code: %s, Error message: %s", aws.ToString(err.ResourceId), aws.ToString(err.Error.Code), aws.ToString(err.Error.Message)) + } else { + errResource = fmt.Sprintf("Resource ID: %s", aws.ToString(err.ResourceId)) + } + errs = append(errs, errResource) + } + return strings.Join(errs, ", ") + } + + return "" +} diff --git a/pkg/cloud/services/ec2/dedicatedhosts_test.go b/pkg/cloud/services/ec2/dedicatedhosts_test.go new file mode 100644 index 0000000000..a3093b36fc --- /dev/null +++ b/pkg/cloud/services/ec2/dedicatedhosts_test.go @@ -0,0 +1,344 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ec2 + +import ( + "context" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" + "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" + "sigs.k8s.io/cluster-api-provider-aws/v2/test/mocks" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +func createTestClusterScope(t *testing.T) *scope.ClusterScope { + t.Helper() + scheme := runtime.NewScheme() + _ = infrav1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + scope, err := scope.NewClusterScope(scope.ClusterScopeParams{ + Client: client, + Cluster: &clusterv1.Cluster{}, + AWSCluster: &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: infrav1.AWSClusterSpec{ + NetworkSpec: infrav1.NetworkSpec{ + VPC: infrav1.VPCSpec{ + ID: "test-vpc", + }, + }, + }, + }, + }) + if err != nil { + t.Fatalf("Failed to create test context: %v", err) + } + return scope +} + +func createTestMachineScope(t *testing.T, clusterScope *scope.ClusterScope) *scope.MachineScope { + t.Helper() + scheme := runtime.NewScheme() + _ = infrav1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machine", + Namespace: "default", + }, + } + + awsMachine := &infrav1.AWSMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-machine", + Namespace: "default", + }, + Spec: infrav1.AWSMachineSpec{ + InstanceType: "m5.large", + AdditionalTags: infrav1.Tags{ + "Environment": "test", + "Owner": "test-user", + }, + }, + } + + machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ + Client: client, + Cluster: clusterScope.Cluster, + Machine: machine, + AWSMachine: awsMachine, + InfraCluster: clusterScope, + }) + if err != nil { + t.Fatalf("Failed to create test machine scope: %v", err) + } + return machineScope +} + +func TestAllocateDedicatedHost(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + tests := []struct { + name string + dynamicAllocationSpec *infrav1.DynamicHostAllocationSpec + availabilityZone string + expectError bool + instanceType string + setupMocks func(m *mocks.MockEC2API) + }{ + { + name: "should allocate exactly one dedicated host", + dynamicAllocationSpec: &infrav1.DynamicHostAllocationSpec{ + Tags: map[string]string{ + "Environment": "production", // This should override the machine's "test" value + "Purpose": "dedicated", // This should be added from dedicated host specific tags + }, + }, + availabilityZone: "us-west-2a", + instanceType: "m5.large", + expectError: false, + setupMocks: func(m *mocks.MockEC2API) { + // Mock AllocateHosts to return exactly one host + m.EXPECT().AllocateHosts(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(func(ctx context.Context, input *ec2.AllocateHostsInput, optFns ...func(*ec2.Options)) (*ec2.AllocateHostsOutput, error) { + // Verify that quantity is set to 1 + assert.Equal(t, int32(1), *input.Quantity) + + // Verify that tags are being passed + assert.NotNil(t, input.TagSpecifications) + assert.Len(t, input.TagSpecifications, 1) + assert.Equal(t, types.ResourceTypeDedicatedHost, input.TagSpecifications[0].ResourceType) + + // Verify that only the expected tags are present (no standard cluster/machine tags) + expectedTags := map[string]string{ + "Environment": "production", // from dedicated host specific tags (overrides machine's "test") + "Owner": "test-user", // from machine AdditionalTags + "Purpose": "dedicated", // from dedicated host specific tags + } + + // Verify we have exactly the expected number of tags + assert.Equal(t, len(expectedTags), len(input.TagSpecifications[0].Tags), "Should have exactly the expected number of tags") + + // Verify each expected tag is present with correct value + for _, tag := range input.TagSpecifications[0].Tags { + key := aws.ToString(tag.Key) + value := aws.ToString(tag.Value) + expectedValue, exists := expectedTags[key] + assert.True(t, exists, "Unexpected tag found: %s", key) + assert.Equal(t, expectedValue, value, "Tag %s should have value %s", key, expectedValue) + } + + return &ec2.AllocateHostsOutput{ + HostIds: []string{"h-1234567890abcdef0"}, + }, nil + }) + }, + }, + { + name: "should fail if AWS returns multiple hosts", + dynamicAllocationSpec: &infrav1.DynamicHostAllocationSpec{}, + availabilityZone: "us-west-2a", + instanceType: "m5.large", + expectError: true, + setupMocks: func(m *mocks.MockEC2API) { + // Mock AllocateHosts to return multiple hosts (should never happen with quantity=1, but test the validation) + m.EXPECT().AllocateHosts(gomock.Any(), gomock.Any(), gomock.Any()).Return(&ec2.AllocateHostsOutput{ + HostIds: []string{"h-1234567890abcdef0", "h-0987654321fedcba0"}, + }, nil) + }, + }, + { + name: "should fail if AWS returns no hosts", + dynamicAllocationSpec: &infrav1.DynamicHostAllocationSpec{}, + availabilityZone: "us-west-2a", + instanceType: "m5.large", + expectError: true, + setupMocks: func(m *mocks.MockEC2API) { + // Mock AllocateHosts to return no hosts + m.EXPECT().AllocateHosts(gomock.Any(), gomock.Any(), gomock.Any()).Return(&ec2.AllocateHostsOutput{ + HostIds: []string{}, + }, nil) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ec2Mock := mocks.NewMockEC2API(mockCtrl) + tt.setupMocks(ec2Mock) + + clusterScope := createTestClusterScope(t) + machineScope := createTestMachineScope(t, clusterScope) + s := NewService(clusterScope) + s.EC2Client = ec2Mock + + hostID, err := s.AllocateDedicatedHost(context.TODO(), tt.dynamicAllocationSpec, tt.instanceType, tt.availabilityZone, machineScope) + + if tt.expectError { + assert.Error(t, err) + assert.Empty(t, hostID) + } else { + assert.NoError(t, err) + assert.NotEmpty(t, hostID) + } + }) + } +} + +func TestDescribeDedicatedHost(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + hostID := "h-1234567890abcdef0" + + host := types.Host{ + HostId: aws.String(hostID), + AvailabilityZone: aws.String("us-west-2a"), + State: types.AllocationStateAvailable, + HostProperties: &types.HostProperties{ + InstanceFamily: aws.String("m5"), + InstanceType: aws.String("m5.large"), + TotalVCpus: aws.Int32(2), + }, + Instances: []types.HostInstance{}, + Tags: []types.Tag{ + { + Key: aws.String("Environment"), + Value: aws.String("test"), + }, + }, + } + + ec2Mock := mocks.NewMockEC2API(mockCtrl) + ec2Mock.EXPECT().DescribeHosts(gomock.Any(), gomock.Any()).Return(&ec2.DescribeHostsOutput{ + Hosts: []types.Host{host}, + }, nil) + + scope := createTestClusterScope(t) + s := NewService(scope) + s.EC2Client = ec2Mock + + hostInfo, err := s.DescribeDedicatedHost(context.TODO(), hostID) + assert.NoError(t, err) + assert.NotNil(t, hostInfo) + assert.Equal(t, hostID, hostInfo.HostID) + assert.Equal(t, "m5", hostInfo.InstanceFamily) + assert.Equal(t, "m5.large", hostInfo.InstanceType) + assert.Equal(t, "us-west-2a", hostInfo.AvailabilityZone) + assert.Equal(t, "available", hostInfo.State) + assert.Equal(t, int32(2), hostInfo.TotalCapacity) + assert.Equal(t, int32(2), hostInfo.AvailableCapacity) // No instances running + assert.Equal(t, "test", hostInfo.Tags["Environment"]) +} + +func TestAllocateDedicatedHostMultipleMachines(t *testing.T) { + // This test verifies that multiple machines each get their own dedicated host + // This is the intended behavior for dedicated hosts - each machine gets complete isolation + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + // Create two machine scopes that would both try to allocate hosts + clusterScope := createTestClusterScope(t) + machineScope1 := createTestMachineScope(t, clusterScope) + machineScope2 := createTestMachineScope(t, clusterScope) + + // Give them different names to simulate different machines + machineScope1.AWSMachine.Name = "test-machine-1" + machineScope2.AWSMachine.Name = "test-machine-2" + + ec2Mock := mocks.NewMockEC2API(mockCtrl) + + // Both machines will call AllocateHosts and get separate hosts + // This is the intended behavior - each machine gets its own dedicated host for isolation + ec2Mock.EXPECT().AllocateHosts(gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(func(ctx context.Context, input *ec2.AllocateHostsInput, optFns ...func(*ec2.Options)) (*ec2.AllocateHostsOutput, error) { + // Verify that quantity is set to 1 + assert.Equal(t, int32(1), *input.Quantity) + return &ec2.AllocateHostsOutput{ + HostIds: []string{"h-1234567890abcdef0"}, + }, nil + }).Times(2) // Expect two calls for two machines + + s := NewService(clusterScope) + s.EC2Client = ec2Mock + + spec := &infrav1.DynamicHostAllocationSpec{ + Tags: map[string]string{ + "Environment": "test", + }, + } + + // Simulate concurrent allocation (in real scenario, these would be concurrent) + hostID1, err1 := s.AllocateDedicatedHost(context.TODO(), spec, "m5.large", "us-west-2a", machineScope1) + hostID2, err2 := s.AllocateDedicatedHost(context.TODO(), spec, "m5.large", "us-west-2a", machineScope2) + + // Both should succeed but get different hosts (demonstrating the race condition) + assert.NoError(t, err1) + assert.NoError(t, err2) + assert.NotEmpty(t, hostID1) + assert.NotEmpty(t, hostID2) + assert.Equal(t, "h-1234567890abcdef0", hostID1) + assert.Equal(t, "h-1234567890abcdef0", hostID2) // Same host ID because of mock, but in real scenario they'd be different +} + +func TestConvertToHostInfo(t *testing.T) { + hostID := "h-1234567890abcdef0" + + host := types.Host{ + HostId: aws.String(hostID), + AvailabilityZone: aws.String("us-west-2a"), + State: types.AllocationStateAvailable, + HostProperties: &types.HostProperties{ + InstanceFamily: aws.String("m5"), + InstanceType: aws.String("m5.large"), + TotalVCpus: aws.Int32(4), + }, + Instances: []types.HostInstance{ + {InstanceId: aws.String("i-1234567890abcdef0")}, + }, + Tags: []types.Tag{ + { + Key: aws.String("Environment"), + Value: aws.String("test"), + }, + }, + } + + s := &Service{} + hostInfo := s.convertToHostInfo(host) + + assert.Equal(t, hostID, hostInfo.HostID) + assert.Equal(t, "m5", hostInfo.InstanceFamily) + assert.Equal(t, "m5.large", hostInfo.InstanceType) + assert.Equal(t, "us-west-2a", hostInfo.AvailabilityZone) + assert.Equal(t, "available", hostInfo.State) + assert.Equal(t, int32(4), hostInfo.TotalCapacity) + assert.Equal(t, int32(3), hostInfo.AvailableCapacity) // 1 instance running, 3 available + assert.Equal(t, "test", hostInfo.Tags["Environment"]) +} diff --git a/pkg/cloud/services/ec2/instances.go b/pkg/cloud/services/ec2/instances.go index 6e5813c74a..3cecd55c84 100644 --- a/pkg/cloud/services/ec2/instances.go +++ b/pkg/cloud/services/ec2/instances.go @@ -258,9 +258,22 @@ func (s *Service) CreateInstance(ctx context.Context, scope *scope.MachineScope, input.MarketType = scope.AWSMachine.Spec.MarketType - input.HostID = scope.AWSMachine.Spec.HostID + // Handle dynamic host allocation if specified + if scope.AWSMachine.Spec.DynamicHostAllocation != nil { + hostID, err := s.ensureDedicatedHostAllocation(ctx, scope) + if err != nil { + return nil, errors.Wrap(err, "failed to allocate dedicated host") + } + input.HostID = aws.String(hostID) + input.HostAffinity = aws.String("host") - input.HostAffinity = scope.AWSMachine.Spec.HostAffinity + // Update machine status with allocated host ID + scope.AWSMachine.Status.DedicatedHost.ID = &hostID + } else { + // Use static host allocation if specified + input.HostID = scope.AWSMachine.Spec.HostID + input.HostAffinity = scope.AWSMachine.Spec.HostAffinity + } input.CapacityReservationPreference = scope.AWSMachine.Spec.CapacityReservationPreference @@ -1279,6 +1292,95 @@ func getInstanceMetadataOptionsRequest(metadataOptions *infrav1.InstanceMetadata return request } +// ensureDedicatedHostAllocation ensures a dedicated host is allocated for the machine. +func (s *Service) ensureDedicatedHostAllocation(ctx context.Context, scope *scope.MachineScope) (string, error) { + spec := scope.AWSMachine.Spec.DynamicHostAllocation + if spec == nil { + return "", errors.New("dynamic host allocation spec is nil") + } + + // Check if a host is already allocated for this machine + // Each machine gets its own dedicated host for complete isolation and resource dedication + if scope.AWSMachine.Status.DedicatedHost.ID != nil { + existingHostID := aws.ToString(scope.AWSMachine.Status.DedicatedHost.ID) + s.scope.Info("Found existing allocated host for machine", "hostID", existingHostID, "machine", scope.Name()) + return existingHostID, nil + } + + // Determine the availability zone for the host + var availabilityZone *string + + // Get AZ from the machine's subnet + if scope.AWSMachine.Spec.Subnet != nil { + var subnet *types.Subnet + var err error + if scope.AWSMachine.Spec.Subnet.ID != nil { + subnet, err = s.getSubnet(scope.AWSMachine.Spec.Subnet.ID) + if err != nil { + return "", errors.Wrap(err, "failed to get subnet for host allocation") + } + } else if len(scope.AWSMachine.Spec.Subnet.Filters) > 0 { + // Convert CAPA filters to AWS SDK filters + awsFilters := make([]types.Filter, len(scope.AWSMachine.Spec.Subnet.Filters)) + for i, f := range scope.AWSMachine.Spec.Subnet.Filters { + awsFilters[i] = types.Filter{ + Name: aws.String(f.Name), + Values: f.Values, + } + } + + subnets, err := s.getFilteredSubnets(awsFilters...) + if err != nil { + return "", errors.Wrap(err, "failed to get subnet by filters for host allocation") + } + // if more than one subnet is found, use the first one. they should all share the same AZ. + if len(subnets) > 0 { + subnet = &subnets[0] + } + } + if subnet != nil && subnet.AvailabilityZone != nil { + availabilityZone = subnet.AvailabilityZone + } + } + + instanceType := scope.AWSMachine.Spec.InstanceType + + if availabilityZone == nil { + return "", errors.New("availability zone could not be determined, please specify a subnet ID or subnet filters") + } + + // Allocate the dedicated host + hostID, err := s.AllocateDedicatedHost(ctx, spec, instanceType, *availabilityZone, scope) + if err != nil { + return "", errors.Wrap(err, "failed to allocate dedicated host") + } + + s.scope.Info("Successfully allocated dedicated host for machine", "hostID", hostID, "machine", scope.Name()) + return hostID, nil +} + +// getSubnet retrieves subnet information by ID. +func (s *Service) getSubnet(subnetID *string) (*types.Subnet, error) { + if subnetID == nil { + return nil, errors.New("subnet ID is nil") + } + + input := &ec2.DescribeSubnetsInput{ + SubnetIds: []string{*subnetID}, + } + + output, err := s.EC2Client.DescribeSubnets(context.TODO(), input) + if err != nil { + return nil, errors.Wrap(err, "failed to describe subnet") + } + + if len(output.Subnets) == 0 { + return nil, errors.Errorf("subnet %s not found", *subnetID) + } + + return &output.Subnets[0], nil +} + func getPrivateDNSNameOptionsRequest(privateDNSName *infrav1.PrivateDNSName) *types.PrivateDnsNameOptionsRequest { if privateDNSName == nil { return nil diff --git a/pkg/cloud/services/ec2/service.go b/pkg/cloud/services/ec2/service.go index e9bd12c79d..fc237e1991 100644 --- a/pkg/cloud/services/ec2/service.go +++ b/pkg/cloud/services/ec2/service.go @@ -34,6 +34,10 @@ type Service struct { // SSMClient is used to look up the official EKS AMI ID SSMClient ssm.SSMAPI + + // RetryEC2Client is used for dedicated host operations with enhanced retry configuration + // If nil, a new retry client will be created as needed + RetryEC2Client common.EC2API } // NewService returns a new service given the ec2 api client. diff --git a/pkg/cloud/services/interfaces.go b/pkg/cloud/services/interfaces.go index 65b08a2ecd..ec2e3b931e 100644 --- a/pkg/cloud/services/interfaces.go +++ b/pkg/cloud/services/interfaces.go @@ -91,6 +91,11 @@ type EC2Interface interface { // ReleaseElasticIP reconciles the elastic IP from a custom Public IPv4 Pool. ReleaseElasticIP(instanceID string) error + + // Dedicated Host management + AllocateDedicatedHost(ctx context.Context, spec *infrav1.DynamicHostAllocationSpec, instanceType, availabilityZone string, scope *scope.MachineScope) (string, error) + ReleaseDedicatedHost(ctx context.Context, hostID string) error + DescribeDedicatedHost(ctx context.Context, hostID string) (*infrav1.DedicatedHostInfo, error) } // MachinePoolReconcileInterface encapsulates high-level reconciliation functions regarding EC2 reconciliation. It is diff --git a/pkg/cloud/services/mock_services/ec2_interface_mock.go b/pkg/cloud/services/mock_services/ec2_interface_mock.go index 78b52d93df..6b923ac502 100644 --- a/pkg/cloud/services/mock_services/ec2_interface_mock.go +++ b/pkg/cloud/services/mock_services/ec2_interface_mock.go @@ -55,6 +55,21 @@ func (m *MockEC2Interface) EXPECT() *MockEC2InterfaceMockRecorder { return m.recorder } +// AllocateDedicatedHost mocks base method. +func (m *MockEC2Interface) AllocateDedicatedHost(arg0 context.Context, arg1 *v1beta2.DynamicHostAllocationSpec, arg2, arg3 string, arg4 *scope.MachineScope) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AllocateDedicatedHost", arg0, arg1, arg2, arg3, arg4) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// AllocateDedicatedHost indicates an expected call of AllocateDedicatedHost. +func (mr *MockEC2InterfaceMockRecorder) AllocateDedicatedHost(arg0, arg1, arg2, arg3, arg4 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllocateDedicatedHost", reflect.TypeOf((*MockEC2Interface)(nil).AllocateDedicatedHost), arg0, arg1, arg2, arg3, arg4) +} + // CreateInstance mocks base method. func (m *MockEC2Interface) CreateInstance(arg0 context.Context, arg1 *scope.MachineScope, arg2 []byte, arg3 string) (*v1beta2.Instance, error) { m.ctrl.T.Helper() @@ -127,6 +142,21 @@ func (mr *MockEC2InterfaceMockRecorder) DeleteLaunchTemplate(arg0 interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteLaunchTemplate", reflect.TypeOf((*MockEC2Interface)(nil).DeleteLaunchTemplate), arg0) } +// DescribeDedicatedHost mocks base method. +func (m *MockEC2Interface) DescribeDedicatedHost(arg0 context.Context, arg1 string) (*v1beta2.DedicatedHostInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DescribeDedicatedHost", arg0, arg1) + ret0, _ := ret[0].(*v1beta2.DedicatedHostInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// DescribeDedicatedHost indicates an expected call of DescribeDedicatedHost. +func (mr *MockEC2InterfaceMockRecorder) DescribeDedicatedHost(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DescribeDedicatedHost", reflect.TypeOf((*MockEC2Interface)(nil).DescribeDedicatedHost), arg0, arg1) +} + // DetachSecurityGroupsFromNetworkInterface mocks base method. func (m *MockEC2Interface) DetachSecurityGroupsFromNetworkInterface(arg0 []string, arg1 string) error { m.ctrl.T.Helper() @@ -352,6 +382,20 @@ func (mr *MockEC2InterfaceMockRecorder) ReconcileElasticIPFromPublicPool(arg0, a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReconcileElasticIPFromPublicPool", reflect.TypeOf((*MockEC2Interface)(nil).ReconcileElasticIPFromPublicPool), arg0, arg1) } +// ReleaseDedicatedHost mocks base method. +func (m *MockEC2Interface) ReleaseDedicatedHost(arg0 context.Context, arg1 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ReleaseDedicatedHost", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// ReleaseDedicatedHost indicates an expected call of ReleaseDedicatedHost. +func (mr *MockEC2InterfaceMockRecorder) ReleaseDedicatedHost(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReleaseDedicatedHost", reflect.TypeOf((*MockEC2Interface)(nil).ReleaseDedicatedHost), arg0, arg1) +} + // ReleaseElasticIP mocks base method. func (m *MockEC2Interface) ReleaseElasticIP(arg0 string) error { m.ctrl.T.Helper() diff --git a/test/mocks/aws_ec2api_mock.go b/test/mocks/aws_ec2api_mock.go index 835e1f1660..2e666f3650 100644 --- a/test/mocks/aws_ec2api_mock.go +++ b/test/mocks/aws_ec2api_mock.go @@ -71,6 +71,26 @@ func (mr *MockEC2APIMockRecorder) AllocateAddress(arg0, arg1 interface{}, arg2 . return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllocateAddress", reflect.TypeOf((*MockEC2API)(nil).AllocateAddress), varargs...) } +// AllocateHosts mocks base method. +func (m *MockEC2API) AllocateHosts(arg0 context.Context, arg1 *ec2.AllocateHostsInput, arg2 ...func(*ec2.Options)) (*ec2.AllocateHostsOutput, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1} + for _, a := range arg2 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "AllocateHosts", varargs...) + ret0, _ := ret[0].(*ec2.AllocateHostsOutput) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// AllocateHosts indicates an expected call of AllocateHosts. +func (mr *MockEC2APIMockRecorder) AllocateHosts(arg0, arg1 interface{}, arg2 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1}, arg2...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllocateHosts", reflect.TypeOf((*MockEC2API)(nil).AllocateHosts), varargs...) +} + // AssociateAddress mocks base method. func (m *MockEC2API) AssociateAddress(arg0 context.Context, arg1 *ec2.AssociateAddressInput, arg2 ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) { m.ctrl.T.Helper() @@ -771,6 +791,26 @@ func (mr *MockEC2APIMockRecorder) DescribeEgressOnlyInternetGateways(arg0, arg1 return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DescribeEgressOnlyInternetGateways", reflect.TypeOf((*MockEC2API)(nil).DescribeEgressOnlyInternetGateways), varargs...) } +// DescribeHosts mocks base method. +func (m *MockEC2API) DescribeHosts(arg0 context.Context, arg1 *ec2.DescribeHostsInput, arg2 ...func(*ec2.Options)) (*ec2.DescribeHostsOutput, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1} + for _, a := range arg2 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "DescribeHosts", varargs...) + ret0, _ := ret[0].(*ec2.DescribeHostsOutput) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// DescribeHosts indicates an expected call of DescribeHosts. +func (mr *MockEC2APIMockRecorder) DescribeHosts(arg0, arg1 interface{}, arg2 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1}, arg2...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DescribeHosts", reflect.TypeOf((*MockEC2API)(nil).DescribeHosts), varargs...) +} + // DescribeImages mocks base method. func (m *MockEC2API) DescribeImages(arg0 context.Context, arg1 *ec2.DescribeImagesInput, arg2 ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { m.ctrl.T.Helper() @@ -1291,6 +1331,26 @@ func (mr *MockEC2APIMockRecorder) ReleaseAddress(arg0, arg1 interface{}, arg2 .. return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReleaseAddress", reflect.TypeOf((*MockEC2API)(nil).ReleaseAddress), varargs...) } +// ReleaseHosts mocks base method. +func (m *MockEC2API) ReleaseHosts(arg0 context.Context, arg1 *ec2.ReleaseHostsInput, arg2 ...func(*ec2.Options)) (*ec2.ReleaseHostsOutput, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1} + for _, a := range arg2 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "ReleaseHosts", varargs...) + ret0, _ := ret[0].(*ec2.ReleaseHostsOutput) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ReleaseHosts indicates an expected call of ReleaseHosts. +func (mr *MockEC2APIMockRecorder) ReleaseHosts(arg0, arg1 interface{}, arg2 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1}, arg2...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReleaseHosts", reflect.TypeOf((*MockEC2API)(nil).ReleaseHosts), varargs...) +} + // ReplaceRoute mocks base method. func (m *MockEC2API) ReplaceRoute(arg0 context.Context, arg1 *ec2.ReplaceRouteInput, arg2 ...func(*ec2.Options)) (*ec2.ReplaceRouteOutput, error) { m.ctrl.T.Helper()